In [None]:
import warnings
warnings.filterwarnings("ignore")

# importing the required packages
import os
import gc
import random
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# change into the Drive folder that holds the input CSV files, then list it to confirm the files are visible
%cd /content/drive/MyDrive/DISC_Files/with_cell
!ls -v

/content/drive/MyDrive/DISC_Files/with_cell
111.csv  142.csv  223.csv  263.csv  333.csv  414.csv  438.csv  463.csv	529.csv  556.csv   3311.csv
112.csv  143.csv  224.csv  264.csv  334.csv  415.csv  439.csv  464.csv	531.csv  557.csv   4110.csv
113.csv  144.csv  225.csv  311.csv  335.csv  416.csv  441.csv  465.csv	532.csv  561.csv   4111.csv
114.csv  151.csv  226.csv  312.csv  336.csv  417.csv  442.csv  466.csv	533.csv  562.csv   4210.csv
115.csv  152.csv  227.csv  313.csv  337.csv  418.csv  443.csv  467.csv	534.csv  563.csv   4211.csv
116.csv  153.csv  231.csv  314.csv  338.csv  419.csv  444.csv  511.csv	535.csv  564.csv   4310.csv
117.csv  154.csv  232.csv  315.csv  339.csv  421.csv  445.csv  512.csv	536.csv  565.csv   4311.csv
121.csv  161.csv  233.csv  316.csv  341.csv  422.csv  446.csv  513.csv	537.csv  566.csv   4410.csv
122.csv  162.csv  234.csv  317.csv  342.csv  423.csv  447.csv  514.csv	538.csv  567.csv   4411.csv
123.csv  163.csv  235.csv  318.csv  343.csv  424.csv  448.csv  51

In [None]:
# create the label_modified column
def modify_data(df, min_run_length=18):
    """Add a ``label_modified`` column that keeps only long runs of ``label == 1``.

    A "run" is a maximal block of consecutive rows sharing the same ``label``
    value. Every row of a run gets ``label_modified = 1`` when the run's label
    is 1 and the run contains at least ``min_run_length`` rows; all other rows
    get 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column. The frame is modified in place
        (``label_modified`` is added or overwritten); no other column is touched.
    min_run_length : int, default 18
        Minimum number of consecutive ``label == 1`` rows for a run to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-site convenience.
    """
    labels = df['label']

    # run id: increases by one every time the label differs from the previous row
    run_id = (labels != labels.shift()).cumsum()

    # length of the run each row belongs to, broadcast back onto the rows
    run_length = labels.groupby(run_id).transform('size')

    # the label is constant inside a run, so "any row of the run is a 1 with a
    # running count >= K" is the same as "label is 1 and the run has >= K rows";
    # computing it from local Series avoids writing (and then dropping) scratch
    # 'group' / 'counter' columns on the caller's frame
    df['label_modified'] = ((labels == 1) & (run_length >= min_run_length)).astype(int)

    return df

In [None]:
# start from an empty table: one identifying column, the class label,
# and the eleven correlation features (feature_1 ... feature_11)
modeling_data = pd.DataFrame(
    columns=['Filename', 'label'] + [f'feature_{n}' for n in range(1, 12)]
)

In [None]:
def create_features(df):
    """Return 11 correlation features computed from the '405', '488' and '633' columns.

    Each feature is ``Series.corr`` (pandas' default method) between two
    series built from sums/differences of the three columns over the rows of
    ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '405', '488' and '633'. Other columns are ignored.

    Returns
    -------
    tuple
        ``(feature_1, ..., feature_11)`` in that order. An entry is NaN when
        pandas cannot compute the correlation for the given rows.
    """
    ch_405, ch_488, ch_633 = df['405'], df['488'], df['633']

    # differences between channels
    d_405_633 = ch_405 - ch_633
    d_405_488 = ch_405 - ch_488
    d_488_633 = ch_488 - ch_633
    d_all = ch_405 - ch_488 - ch_633

    # sums of channels
    s_405_488 = ch_405 + ch_488
    s_405_633 = ch_405 + ch_633
    s_488_633 = ch_488 + ch_633
    s_all = ch_405 + ch_488 + ch_633

    # (left, right) operands of each correlation, in feature order
    operand_pairs = (
        (d_405_633, ch_405),      # feature_1
        (d_405_488, ch_633),      # feature_2
        (d_405_488, s_405_488),   # feature_3
        (d_405_488, s_405_633),   # feature_4
        (d_405_488, s_488_633),   # feature_5
        (d_405_488, d_488_633),   # feature_6
        (d_405_633, s_488_633),   # feature_7
        (d_405_633, s_405_488),   # feature_8
        (d_405_633, s_405_633),   # feature_9
        (d_405_488, s_all),       # feature_10
        (s_488_633, d_all),       # feature_11
    )

    return tuple(left.corr(right) for left, right in operand_pairs)

In [None]:
# Build the modeling table: one positive row per peak region plus up to
# NEGATIVES_PER_PEAK negative rows sampled from the non-peak region that
# immediately precedes that peak.

# number of non-peak windows sampled for every peak region
NEGATIVES_PER_PEAK = 11

# the non-peak windows are drawn at random; seed the generator so that
# re-running this cell reproduces the same modeling data
random.seed(42)

# column names of the (header-less) input files
column_names = ['405', '488', '633', 'red', 'green', 'label']

# explicit dtypes so pandas does not have to infer them (slightly faster reads)
dtypes = {'405': 'float64', '488': 'float64', '633': 'float64', 'red': 'float64', 'green': 'float64', 'label': 'int64'}

# rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame one .loc row at a time re-allocates on every insert,
# and rebuilding from scratch also guarantees no stale rows survive a re-run
rows = []

# read files (adjust the range as required)
for i in range(111, 5312):

  filename = f"{i}.csv"
  if not os.path.exists(filename):
    continue

  print(f'Reading in {filename}')
  data = pd.read_csv(filename, header=None, names=column_names, dtype=dtypes)

  # shape of the data
  print(f'Shape of the data is: {data.shape}')

  # create the modified label column (modify_data's default threshold, K = 18)
  data = modify_data(data)

  # drop the columns that are not needed for the features
  data.drop(columns=['red', 'green', 'label'], inplace=True)

  # give every consecutive peak / non-peak region its own id; ids start at 1
  # and peak and non-peak regions alternate
  data['group'] = (data['label_modified'] != data['label_modified'].shift()).cumsum()
  df_0 = data[data['label_modified'] == 0]
  df_1 = data[data['label_modified'] == 1]
  df_0_groups = df_0.groupby('group')
  df_1_groups = df_1.groupby('group')

  for number, group_data_1 in df_1_groups:
    len_of_grp = len(group_data_1)
    print(len_of_grp)
    print('---')

    # positive example: features of the peak region itself
    rows.append([i, 1, *create_features(group_data_1)])

    # the non-peak region right before this peak; it does not exist when the
    # file starts with a peak region, in which case there is nothing to sample
    try:
      group_data_0 = df_0_groups.get_group(number - 1).reset_index(drop=True)
    except KeyError:
      continue

    # split the non-peak region into windows of exactly the peak's length;
    # only complete windows are eligible, so the trailing partial (or empty)
    # remainder is never sampled
    n_full_windows = len(group_data_0) // len_of_grp

    # when fewer than NEGATIVES_PER_PEAK complete windows exist, take them all
    sample_size = min(NEGATIVES_PER_PEAK, n_full_windows)

    for value in random.sample(range(n_full_windows), sample_size):
      window = group_data_0.iloc[value * len_of_grp:(value + 1) * len_of_grp]
      # negative example: features of one non-peak window
      rows.append([i, 0, *create_features(window)])

  # clear memory before loading the next file
  del data, df_0, df_1, df_0_groups, df_1_groups
  gc.collect()

# assemble the final table with the column layout declared for modeling_data
modeling_data = pd.DataFrame(rows, columns=modeling_data.columns)

# next free row position, kept for any later cell that still reads index_x
index_x = len(modeling_data)

Reading in 111.csv
Shape of the data is: (5400000, 6)
23
---
22
---
18
---
19
---
18
---
Reading in 112.csv
Shape of the data is: (5400000, 6)
26
---
19
---
20
---
29
---
19
---
19
---
18
---
18
---
20
---
21
---
26
---
Reading in 113.csv
Shape of the data is: (5400000, 6)
18
---
23
---
19
---
26
---
22
---
20
---
19
---
33
---
18
---
20
---
19
---
19
---
21
---
19
---
Reading in 114.csv
Shape of the data is: (5400000, 6)
23
---
27
---
26
---
18
---
25
---
29
---
18
---
19
---
19
---
18
---
23
---
19
---
Reading in 115.csv
Shape of the data is: (5400000, 6)
19
---
18
---
18
---
20
---
24
---
19
---
18
---
19
---
19
---
20
---
Reading in 116.csv
Shape of the data is: (5400000, 6)
18
---
19
---
22
---
18
---
19
---
18
---
19
---
19
---
22
---
27
---
Reading in 117.csv
Shape of the data is: (3690000, 6)
21
---
19
---
26
---
20
---
Reading in 121.csv
Shape of the data is: (5400000, 6)
21
---
31
---
19
---
21
---
Reading in 122.csv
Shape of the data is: (5400000, 6)
20
---
21
---
19
---
18


In [None]:
# persist the assembled modeling table as CSV (row index omitted)
output_path = '/content/drive/MyDrive/DISC_Files/data_for_modeling_v1.csv'
modeling_data.to_csv(output_path, index=False)