In [None]:
# This file should be ran from "AnalyzeDataSet2" folder

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import glob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import neighbors

## Reading files for each cell type

In [2]:
# Reading combined CSV files
all_A_files = glob.glob(".\\A\\*.csv")
len(all_A_files)
Cell_A_sequence = list()
for file in all_A_files:
    temp_df = pd.read_csv(file)
    Cell_A_sequence.append(temp_df.values)

In [3]:
# Reading combined CSV files
all_B_files = glob.glob(".\\B\\*.csv")
len(all_B_files)
Cell_B_sequence = list()
for file in all_B_files:
    temp_df = pd.read_csv(file)
    Cell_B_sequence.append(temp_df.values)

In [4]:
# Reading combined CSV files
all_C_files = glob.glob(".\\C\\*.csv")
len(all_C_files)
Cell_C_sequence = list()
for file in all_C_files:
    temp_df = pd.read_csv(file)
    Cell_C_sequence.append(temp_df.values)

In [5]:
len_sequences = []
for one_seq in Cell_A_sequence:
    len_sequences.append(len(one_seq))
len_df_A = pd.Series(len_sequences)
len_df_A.describe()

count     1202.000000
mean       351.399334
std        673.398916
min        129.000000
25%        232.000000
50%        268.000000
75%        315.250000
max      13540.000000
dtype: float64

In [6]:
len_sequences = []
for one_seq in Cell_B_sequence:
    len_sequences.append(len(one_seq))
len_df_B = pd.Series(len_sequences)
len_df_B.describe()

count     1541.000000
mean       252.595717
std        724.538976
min         97.000000
25%        145.000000
50%        160.000000
75%        176.000000
max      16940.000000
dtype: float64

In [8]:
len_sequences = []
for one_seq in Cell_C_sequence:
    len_sequences.append(len(one_seq))
len_df_C = pd.Series(len_sequences)
len_df_C.describe()

count     1379.000000
mean       210.022480
std        768.380435
min         55.000000
25%        100.000000
50%        119.000000
75%        137.000000
max      15360.000000
dtype: float64

## Adding target(Cell Name) and combining values

In [9]:
combined_list = Cell_A_sequence
target = [1] * len(Cell_A_sequence)

In [10]:
for i in range(len(Cell_B_sequence)):
    combined_list.append(Cell_B_sequence[i])
    target.append(2)       # Assign target

In [11]:
for i in range(len(Cell_C_sequence)):
    combined_list.append(Cell_C_sequence[i])
    target.append(3)       # Assign target

In [12]:
final_combined = []
for arr in combined_list:
    final_combined.append(arr[:55])
len(final_combined)

4122

In [13]:
final_combined = np.array(final_combined)
final_combined = final_combined.reshape(final_combined.shape[0],-1)

## Splitting the value to Train and Test

In [106]:
# Performing train test split individually 
X_train, X_test, y_train, y_test = train_test_split(final_combined, target, test_size=0.30, random_state=5)

## Initial Prediction using Random Forest

In [50]:
rf_clf=RandomForestClassifier(n_estimators=20)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
random_forest_predictions = rf_clf.predict(X_test)

In [52]:
print("Random Forest Accuracy: ",metrics.accuracy_score(y_test, random_forest_predictions))

Random Forest Accuracy:  0.9442198868229588


## K Nearest Neighbors implementation

In [35]:

knn_classifier=neighbors.KNeighborsClassifier(n_neighbors=1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [36]:
knn_predictions = knn_classifier.predict(X_test)

In [37]:
print("KNN Accuracy: ",metrics.accuracy_score(y_test, knn_predictions))

KNN Accuracy:  0.7219078415521423


In [55]:
final_rf_clf = RandomForestClassifier(n_estimators=20)
final_rf_clf.fit(final_combined, target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Prediction for cells in Type 1 folder

In [82]:
all_Type1_files = glob.glob(".\\Type1\\*.csv")
Type1_file_names = []
Type_1_sequence = list()
for file in all_Type1_files:
    temp_df = pd.read_csv(file)
    Type_1_sequence.append(temp_df.values[:55])
    Type1_file_names.append(file)

In [83]:
Type_1_seq = np.array(Type_1_sequence)
Type_1_seq = Type_1_seq.reshape(Type_1_seq.shape[0],-1)

In [86]:
type_1_predictions = final_rf_clf.predict(Type_1_seq)

In [88]:
type_1_df = pd.DataFrame(data={"FileName":Type1_file_names ,"Prediction":type_1_predictions})

In [99]:
Pred_dict = {1:"Cell_A", 2:"Cell_B", 3:"Cell_C"}
type_1_df["Prediction"] = type_1_df["Prediction"].map(Pred_dict)

In [100]:
type_1_df["Prediction"].value_counts()

Cell_A    1797
Cell_B     489
Cell_C      28
Name: Prediction, dtype: int64

In [101]:
type_1_df.to_csv("Type_1_results.csv",index=False)

## Prediction for cells in Type 2 folder

In [91]:
all_Type2_files = glob.glob(".\\Type2\\*.csv")
Type2_file_names = []
Type_2_sequence = list()
for file in all_Type2_files:
    temp_df = pd.read_csv(file)
    Type_2_sequence.append(temp_df.values[:55])
    Type2_file_names.append(file)

In [92]:
Type_2_seq = np.array(Type_2_sequence)
Type_2_seq = Type_2_seq.reshape(Type_2_seq.shape[0],-1)

In [93]:
type_2_predictions = final_rf_clf.predict(Type_2_seq)

In [94]:
type_2_df = pd.DataFrame(data={"FileName":Type2_file_names ,"Prediction":type_2_predictions})

In [102]:
type_2_df["Prediction"] = type_2_df["Prediction"].map(Pred_dict)

In [103]:
type_2_df["Prediction"].value_counts()

Cell_A    7912
Cell_B     699
Cell_C      50
Name: Prediction, dtype: int64

In [104]:
type_2_df.to_csv("Type_2_results.csv",index=False)