## Data Cleaning

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import numpy as np

#Load file
dataframe = pd.read_csv("data.csv", delimiter=";")

print(f"We got {len(dataframe)} rows and {len(dataframe.columns)} columns")

#Extract Data
features = pd.DataFrame()

# CRP cells written as "<x" are replaced by 0 so the column can be parsed as a number.
# na=False makes a missing CRP cell count as "does not start with '<'"; without it the
# mask contains NaN and .loc rejects it with a ValueError.
dataframe.loc[dataframe['CRP    '].str.startswith('<', na=False),'CRP    ']=0
# Keep only the 20 columns used below, selected by their position in data.csv.
dataframe=dataframe.iloc[:,[2,4,6,8,10,12,14,16,18,21,24,26,28,30,33,35,37,41,42,43]]
#Treat string data as NaN
pd.options.mode.chained_assignment = None

dataframe = dataframe.apply(pd.to_numeric, errors='coerce')


def _logistic(values, midpoint):
    """Squash `values` into (0, 1) with a logistic curve centred on `midpoint`."""
    return 1 / (1 + np.exp(-(values - midpoint)))


# One entry per kept column, in positional order: (feature name, logistic midpoint).
# A midpoint of None means the imputed value is copied unchanged.
FEATURE_SPECS = [
    ('GB', 10),
    ('GR', 10),
    ('Epi', 5),
    ('Nepi', 5),
    ('Levure', 10),
    ('Bacterie', 148),
    ('CylH', 5),
    ('CylP', 3),
    ('Crystaux', 3),
    ('ERY', None),
    ('LEU', None),
    ('NIT', 60.2),
    ('KET', None),
    ('GLU', None),
    ('PRO', None),
    ('Prote', None),
    ('PH', None),
    ('SG', None),
    ('CRP', None),
    ('target', None),
]

#Replace NaN by the column median
# pd.to_numeric(errors='coerce') marks unparseable cells with np.nan, so np.nan is the
# marker the imputer has to look for.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
dataframe=pd.DataFrame(imp.fit_transform(dataframe))

# SimpleImputer drops columns that are entirely missing; that would silently shift every
# positional lookup below, so fail loudly instead.
if dataframe.shape[1] != len(FEATURE_SPECS):
    raise ValueError(
        f"Expected {len(FEATURE_SPECS)} columns after imputation, got {dataframe.shape[1]}"
    )

for position, (name, midpoint) in enumerate(FEATURE_SPECS):
    column = dataframe.iloc[:, position]
    features[name] = column if midpoint is None else _logistic(column, midpoint)

# Standardized copy of every column of `features` (the target column included).
scaledFeatures = pd.DataFrame(preprocessing.scale(features),columns = features.columns)
features

## Feature selection technique

The problem space has more than 10 dimensions, so we run into the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality).

In [None]:
#Correlation matrix of every column of `features`, drawn as an annotated heatmap
import seaborn as sns

fig, ax = plt.subplots(figsize=(20,12))
sns.heatmap(features.corr(), annot=True, ax=ax)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter (threshold 0.01) on every column of `features` and show the
# resulting boolean support mask, one entry per column.
v_Treshold = VarianceThreshold(threshold=0.01).fit(features)
v_Treshold.get_support()

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import LinearSVC
# LinearSVC was picked by following the scikit-learn estimator map:
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Predictors are the first 19 standardized columns; labels are the unscaled target.
predictors = scaledFeatures.iloc[:, 0:19].values
labels = features.iloc[:,19].values.ravel()

ffs = SequentialFeatureSelector(LinearSVC(max_iter=10000), n_features_to_select=8)
ffs.fit(predictors, labels)

# Positions of the 8 selected columns, used to slice the standardized frame.
selected_idx = ffs.get_support(indices=True)
reducedFeatures = scaledFeatures.iloc[:, selected_idx]

In [None]:
# Naive Bayes (GaussianNB) variant of the feature selection, kept for reference but disabled: it was less effective

# from sklearn.feature_selection import SequentialFeatureSelector
# from sklearn.naive_bayes  import GaussianNB

# ffs = SequentialFeatureSelector(GaussianNB(), n_features_to_select=2)
# ffs.fit(features.iloc[:, 0:19].values,features.iloc[:,19].values.ravel())
# ffs.get_support(indices=True)
#reducedFeatures = features.iloc[:, ffs.get_support(indices=True)]

In [None]:
#ExhaustiveFeatureSelector: evaluates every subset of 1 or 2 features, so it takes a very long time

from mlxtend.feature_selection import ExhaustiveFeatureSelector
efs=ExhaustiveFeatureSelector(LinearSVC(max_iter=10000),min_features=1, max_features=2)
efs = efs.fit(scaledFeatures.iloc[:, 0:19], features.iloc[:,19].values.ravel())

# Only the last expression of a cell is rendered, so the best subset is printed
# explicitly instead of being left as bare (invisible) expressions.
print(f"Best score: {efs.best_score_}")
print(f"Best feature indices: {efs.best_idx_}")
print(f"Best feature names: {efs.best_feature_names_}")

# One row per evaluated subset, best average score first.
df = pd.DataFrame.from_dict(efs.get_metric_dict()).T.sort_values('avg_score', ascending=False)
df

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.naive_bayes  import GaussianNB

# Columns picked by the exhaustive search, plus the target used for colouring.
# .assign builds a new frame instead of writing into a slice of `features`.
reducedFeaturesForPlot = features.iloc[:, list(efs.best_idx_)].assign(target=features.iloc[:,19])

#Scatter plot of the first two columns of that frame, coloured by target
fig = plt.figure()
ax = fig.add_subplot()
# Take the axis labels from the plotted columns so they stay correct whatever the
# selector picked (with a single selected feature the second column is the target).
x_name, y_name = reducedFeaturesForPlot.columns[:2]
ax.set_xlabel(x_name, fontsize = 15)
ax.set_ylabel(y_name, fontsize = 15)
ax.set_title('Classification', fontsize = 20)

cond = reducedFeaturesForPlot['target'] == 0

subset_a = reducedFeaturesForPlot[cond]
subset_b = reducedFeaturesForPlot[~cond]
ax.scatter(subset_a.iloc[:,0], subset_a.iloc[:,1], s=60, c='b', label='Sterile')
ax.scatter(subset_b.iloc[:,0], subset_b.iloc[:,1], s=60, c='r', label='Infected')
ax.legend()
ax.grid()

## Model fitting

In [None]:
#Let's try to classify anyway, using https://medium.com/thrive-in-ai/classification-algorithms-in-python-5f58a7a27b88
#https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.svm import LinearSVC

# Hold out 20% of the rows for scoring. The seed is fixed so the split, and therefore
# the reported accuracy, is the same on every run.
X_train, X_test, Y_train,Y_test = train_test_split(reducedFeatures, features.iloc[:,19], test_size=0.2, random_state=42)
lr_clf = LinearSVC(max_iter=10000, random_state=42).fit(X_train,Y_train.values.ravel())
# Mean accuracy on the held-out rows.
lr_clf.score(X_test,Y_test)

In [None]:
from sklearn.naive_bayes  import GaussianNB

# Hold out 20% of the rows for scoring. The seed is fixed so the split, and therefore
# the reported accuracy, is the same on every run.
X_train, X_test, Y_train,Y_test = train_test_split(reducedFeatures, features.iloc[:,19], test_size=0.2, random_state=42)
lr_clf = GaussianNB().fit(X_train,Y_train.values.ravel())
# Mean accuracy on the held-out rows.
lr_clf.score(X_test,Y_test)

In [None]:
# from sklearn import tree
# X_train, X_test, Y_train,Y_test = train_test_split(features.iloc[:,0:19], features.iloc[:,19], test_size=0.2)
# lr_clf = tree.DecisionTreeClassifier().fit(preprocessing.scale(X_train),Y_train.values.ravel())
# lr_clf.score(X_test,Y_test)
# plt.figure(figsize=(18,18))
# tree.plot_tree(lr_clf, feature_names=features.iloc[:,0:19].columns, fontsize=10)

## PCA for fun

In [None]:
#Let's find the most relevant dimension using PCA

#The data were already standardized into scaledFeatures in the data-cleaning cell

pca = PCA()

# Fit on the 19 predictor columns only: column 19 of scaledFeatures is the target, and
# including it would let the label shape the components it is later plotted against.
principalComponents = pca.fit_transform(scaledFeatures.iloc[:, 0:19])

print (f"percentage of precision with the number of components :\n {pca.explained_variance_ratio_.cumsum()}")

#relationship from data to component
#print(pd.DataFrame(pca.components_,columns=x.columns))
principalDf = pd.DataFrame(data = principalComponents)

# Components side by side with the unscaled target as the last column.
finalDf = pd.concat([principalDf, features.iloc[:,19]], axis = 1)

In [None]:
#plot the first 2 principal components
fig = plt.figure()
ax = fig.add_subplot()
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

# The target is the last column of finalDf; rows where it equals 0 form the first group.
cond = finalDf.iloc[:,-1] == 0
subset_a = finalDf[cond]
subset_b = finalDf[~cond]
# Columns 0 and 1 hold the first and second principal components, matching the axis
# labels (columns 1 and 2 would be the second and third).
ax.scatter(subset_a.iloc[:,0], subset_a.iloc[:,1], s=60, c='b', label='Sterile')
ax.scatter(subset_b.iloc[:,0], subset_b.iloc[:,1], s=60, c='r', label='Infected')
ax.legend()
ax.grid()