# Supervised learning on Listmode data

In [None]:
import os 
from collections import Counter
import pandas as pd
#import numpy as np 
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
#from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV


# The directory in which you have placed the following code
os.chdir('W:/Bureau/these/planktonPipeline/extract_Listmode')

from from_cytoclus_to_files import extract_features
from from_files_to_clusters import particle_clustering


# Where to look the data at and where to write treated data: Change with yours
data_source = 'W:/Bureau/these/donnees_oceano/old_process_FLR6'
data_destination = 'W:/Bureau/these/data'

seed = 42

In [None]:
extract_features(data_source, data_destination, flr_num = 6)

## Train a supervised RandomForest on the features

### Using classical undersampling

Building the the training and testing sets (2/3 of the whole dataset) used to determine the best model by cross-validation.

In [6]:
files_titles = os.listdir(data_destination + '/features')
train_valid = pd.DataFrame()
for i in range(int(len(files_titles)*2/3)):
    df = pd.read_csv(data_destination + '/features/' + files_titles[i], sep = ',', engine = 'python')
    train_valid = train_valid.append(df)

train_valid.set_index(['Particle ID', 'date'], inplace = True)
train_valid = train_valid.dropna(how = 'any')

X_train_valid = train.iloc[:, :-1]
y_train_valid = train.iloc[:, -1]
        
## Label Encoding: Turns the labels into numbers
le = LabelEncoder()
le.fit(list(set(y_train_valid)))
y_train = le.transform(y_train_valid)

rus = RandomUnderSampler(random_state = seed)
X_train_valid, y_train_valid = rus.fit_sample(X_train_valid, y_train_valid)

#sm = SMOTE(random_state = seed)
#X_res, y_res = sm.fit_sample(X, y)
print('Resampled dataset shape %s' % Counter(y_train_valid))

Resampled dataset shape Counter({'cryptophytes': 77, 'microphytoplancton': 77, 'nanoeucaryote': 77, 'noise': 77, 'picoeucaryote': 77, 'prochlorococcus': 77, 'synechococcus': 77})


Finding the best tuning for the Random Forest

In [3]:
rf = RandomForestClassifier(random_state = 0, n_jobs = -1)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator = rf, param_grid = param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

best_rf = RandomForestClassifier(**CV_rfc.best_params_)
best_rf.fit(X_train_valid, y_train_valid)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Building the the validation set (1/3 of the whole dataset)

In [None]:
test = pd.DataFrame()
for i in range(int(len(files_titles)*2/3), len(files_titles)):
    df = pd.read_csv(data_destination + '/features/' + files_titles[i], sep = ',', engine = 'python')
    test = test.append(df)
    
test.set_index(['Particle ID', 'date'], inplace = True)
test = test.dropna(how = 'any')

X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

In [None]:
# Label Encoding: Turns the labels into numbers
y_test = le.transform(y_test)
y_pred_test = best_rf.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_valid, y_pred_valid)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_valid, y_pred_valid, target_names=le.classes_))