# Supervised learning on Listmode data

In [16]:
import os 
from collections import Counter
import pandas as pd
from sklearn.metrics import classification_report
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTEENN


# The directory in which you have placed the following code
os.chdir('W:/Bureau/these/planktonPipeline/extract_Listmode')

from from_cytoclus_to_files import extract_features
from from_files_to_clusters import particle_clustering


# Where to look the data at and where to write treated data: Change with yours
data_source = 'W:/Bureau/these/donnees_oceano/old_process_FLR6'
data_destination = 'W:/Bureau/these/data'

seed = 42

In [None]:
extract_features(data_source, data_destination, flr_num = 6)

## Train a supervised RandomForest on the features

### Using classical undersampling strategy

Building the the training and testing sets (2/3 of the whole dataset) used to determine the best model by cross-validation.

In [15]:
files_titles = os.listdir(data_destination + '/features')
train_valid = pd.DataFrame()
for i in range(int(len(files_titles)*2/3)):
    df = pd.read_csv(data_destination + '/features/' + files_titles[i], sep = ',', engine = 'python')
    train_valid = train_valid.append(df)

train_valid.set_index(['Particle ID', 'date'], inplace = True)
train_valid = train_valid.dropna(how = 'any')

X_train_valid = train_valid.iloc[:, :-1]
y_train_valid = train_valid.iloc[:, -1]
        
rus = RandomUnderSampler(random_state = seed)
X_train_valid_rus, y_train_valid_rus = rus.fit_sample(X_train_valid, y_train_valid)

print('Resampled dataset shape: \n %s' % dict(Counter(y_train_valid_rus)))

Resampled dataset shape: 
 {'cryptophytes': 135, 'microphytoplancton': 135, 'nanoeucaryote': 135, 'noise': 135, 'picoeucaryote': 135, 'prochlorococcus': 135, 'synechococcus': 135}


Finding the best tuning for the Random Forest

In [5]:
rf = RandomForestClassifier(random_state = 0, n_jobs = -1)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator = rf, param_grid = param_grid, cv= 5)
CV_rfc.fit(X_train_valid_rus, y_train_valid_rus)
CV_rfc.best_params_

best_rf = RandomForestClassifier(**CV_rfc.best_params_)
best_rf.fit(X_train_valid_rus, y_train_valid_rus)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Building the the validation set (1/3 of the whole dataset)

In [6]:
test = pd.DataFrame()
for i in range(int(len(files_titles)*2/3), len(files_titles)):
    df = pd.read_csv(data_destination + '/features/' + files_titles[i], sep = ',', engine = 'python')
    test = test.append(df)
    
test.set_index(['Particle ID', 'date'], inplace = True)
test = test.dropna(how = 'any')

X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

In [8]:
y_pred_test = best_rf.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, y_pred_test))

Accuracy: 94.61%
                    precision    recall  f1-score   support

      cryptophytes       0.21      0.91      0.34       153
microphytoplancton       0.65      1.00      0.79        52
     nanoeucaryote       0.71      0.96      0.82      1228
             noise       1.00      0.78      0.88     13013
     picoeucaryote       0.59      0.83      0.69      1745
   prochlorococcus       0.35      1.00      0.52       378
     synechococcus       0.98      0.99      0.99     48254

       avg / total       0.97      0.95      0.95     64823



### Using SMOTEEN sampling strategy

In [17]:
sm = SMOTEENN(random_state = seed)
X_train_valid_smote, y_train_valid_smote = sm.fit_sample(X_train_valid, y_train_valid)
print('Resampled dataset shape: \n %s' % dict(Counter(y_train_valid_smote)))

Resampled dataset shape: 
 {'cryptophytes': 229303, 'microphytoplancton': 229636, 'nanoeucaryote': 228579, 'noise': 211732, 'picoeucaryote': 229070, 'prochlorococcus': 190108, 'synechococcus': 223451}


In [23]:
# Keep only 7 * 5,000 observations to keep it manageable
balance_dict = {k: 5000 for k in set(y_train_valid_smote)}

rus = RandomUnderSampler(random_state = seed, ratio = balance_dict)
X_train_valid_smote, y_train_valid_smote = rus.fit_sample(X_train_valid_smote, y_train_valid_smote)

In [24]:
rf = RandomForestClassifier(random_state = 0, n_jobs = -1)

CV_rfc = GridSearchCV(estimator = rf, param_grid = param_grid, cv= 5)
CV_rfc.fit(X_train_valid_smote, y_train_valid_smote)
CV_rfc.best_params_

best_rf = RandomForestClassifier(**CV_rfc.best_params_)
best_rf.fit(X_train_valid_smote, y_train_valid_smote)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
y_pred_test = best_rf.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, y_pred_test))

Accuracy: 97.65%
                    precision    recall  f1-score   support

      cryptophytes       0.45      0.92      0.60       153
microphytoplancton       0.75      0.94      0.84        52
     nanoeucaryote       0.78      0.97      0.87      1228
             noise       1.00      0.91      0.95     13013
     picoeucaryote       0.88      0.87      0.87      1745
   prochlorococcus       0.79      1.00      0.88       378
     synechococcus       0.99      1.00      0.99     48254

       avg / total       0.98      0.98      0.98     64823



Results seem to be better with SMOTEEN.

Note that the total average precision of 98% only stems from the fact that the test set is very imbalanced and that the model performs good on very represented classes