# Imports

In [73]:
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt

# Preprocess data

1. Read "BF_cells_on_grid.txt" (Brightfield features) and "SSC.txt" (Darkfield features) as pandas DataFrames.
2. Based on the folder each image comes from, determine the ground-truth label for each image (feature vector).
    + Use only 5 different classes. G1/G2/S as class 0, Prophase as class 1, Metaphase as class 2, Anaphase as class 3 and Telophase as class 4
3. Only some of the columns are useful for classification. Only keep the useful ones.
4. Concat brightfield and darkfield features and append the ground truth as the last column.
5. After merging, delete all rows that contain any "nan" value.
6. To make sure that you didn't make a mistake, check that the final dataframe has 1007 rows and 214 columns.

In [74]:
# Load the two per-cell feature tables: brightfield first, darkfield (SSC) second.
bf_df, ssc_df = (
    pd.read_csv(csv_name) for csv_name in ("BF_cells_on_grid.csv", "SSC.csv")
)

In [75]:
# Class id for each ImageNumber 1..7, stored at list position ImageNumber - 1.
# Class ids: 0 = G1/G2/S, 1 = Prophase, 2 = Metaphase, 3 = Anaphase, 4 = Telophase.
# NOTE(review): the ordering presumably follows the image folders sorted
# alphabetically (Anaphase, G1, G2, Metaphase, Prophase, S, Telophase) - confirm.
labels = [3, 0, 0, 2, 1, 0, 4]

# Convert the whole ImageNumber column to 0-based list positions in one step
# instead of looking rows up one at a time by index label.
image_positions = (bf_df["ImageNumber"] - 1).astype(int).to_numpy()

# Fail loudly on image numbers outside 1..len(labels): a plain list lookup would
# let position -1 (ImageNumber 0) wrap around to the last class without warning.
out_of_range = (image_positions < 0) | (image_positions >= len(labels))
if out_of_range.any():
    bad_numbers = np.unique(image_positions[out_of_range] + 1).tolist()
    raise ValueError(
        f"ImageNumber must be between 1 and {len(labels)}; found {bad_numbers}"
    )

# Plain list of Python ints, one entry per row of bf_df, in row order.
gtruth_labels = np.asarray(labels)[image_positions].tolist()

In [76]:
# Positional indices of the columns that are dropped before classification.
# Brightfield table: a handful of individual columns plus positions 70-76.
brf_exclude_cols = [0, 1, 3, 4, 7, 17, 19, *range(70, 77)]
# Darkfield table: the first 50 columns plus positions 70-76.
daf_exclude_cols = [*range(50), *range(70, 77)]

In [77]:
# Remove the excluded columns from each table. The exclusion lists hold column
# positions, which are translated to column labels before dropping.
# NOTE: the positions refer to the tables as loaded, so running this cell again
# without reloading the CSV files would drop additional columns.
bf_df = bf_df.drop(columns=bf_df.columns[brf_exclude_cols])
ssc_df = ssc_df.drop(columns=ssc_df.columns[daf_exclude_cols])

In [78]:
# Build the modelling table in one pass:
#   1. place the brightfield and darkfield features side by side,
#   2. append the ground-truth class as the last column ('label'),
#   3. discard every row that still contains a missing value.
dataframe = (
    pd.concat([bf_df, ssc_df], axis=1)
    .assign(label=gtruth_labels)
    .dropna()
)

# Sanity check required by the preprocessing steps: the cleaned table must have
# 1007 rows and 214 columns. Stop here if an earlier step went wrong.
assert dataframe.shape == (1007, 214), f"unexpected shape: {dataframe.shape}"
dataframe.shape

(1007, 214)

# Fit a classifier

1. Split the dataset into train/test.
2. Fit a classifier on the train data.
    + No need to implement any complex classifier. You can use the available classifiers in sklearn such as decision trees.
3. Evaluate the model on the test set.
4. Report accuracy and plot the confusion matrix.

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [80]:
# Feature matrix: every column except the ground-truth 'label'.
data = np.array(dataframe.drop(columns='label'))
# Target vector: the ground-truth class of each row.
labels = np.array(dataframe['label'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [81]:
# Gini-impurity decision tree, limited to depth 8 and at least 5 samples per
# leaf; the seed makes the fitted tree repeatable.
clf_model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=8,
    min_samples_leaf=5,
    random_state=42,
)
# fit() returns the estimator, so the cell output shows the fitted model.
clf_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=8, min_samples_leaf=5, random_state=42)

In [82]:
# Predict the class of every held-out sample, then compare against the truth.
predicted_labels = clf_model.predict(X_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predicted_labels)
acc_score = accuracy_score(y_true=y_test, y_pred=predicted_labels)

In [83]:
# Report the test-set accuracy and the raw confusion matrix
# (rows = true class, columns = predicted class).
print(acc_score)
print(conf_matrix)

# Plot the confusion matrix as an annotated heatmap.
# confusion_matrix() orders its rows/columns by the sorted labels that occur in
# y_test or predicted_labels, so rebuild that ordering for the tick labels.
class_ids = np.unique(np.concatenate([y_test, predicted_labels]))
tick_positions = np.arange(len(class_ids))

fig, ax = plt.subplots(figsize=(6, 5))
heatmap = ax.imshow(conf_matrix, cmap="Blues")
fig.colorbar(heatmap, ax=ax, label="Number of test samples")

ax.set_xticks(tick_positions)
ax.set_xticklabels(class_ids)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_ids)
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title(f"Decision tree confusion matrix (accuracy = {acc_score:.3f})")

# Write each count inside its cell; use white text on the darker cells so the
# numbers stay readable.
text_threshold = conf_matrix.max() / 2
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(
        col,
        row,
        str(count),
        ha="center",
        va="center",
        color="white" if count > text_threshold else "black",
    )

plt.show()

0.7277227722772277
[[115  23   1   0   0]
 [ 16  22   3   0   0]
 [  6   5   8   0   0]
 [  0   1   0   0   0]
 [  0   0   0   0   2]]
