In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron

# Load the training and test tables as pandas data frames; the first CSV
# column is used as the row index of each frame.
train_df, test_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("ML-A5-2022_train.csv", "ML-A5-2022_test.csv")
)

# Function K-Fold validation
def KF_validation(X, Y, model, K):
    """Return the mean BCR of `model` over a shuffled K-fold split of (X, Y).

    Parameters
    ----------
    X, Y : objects supporting positional `.iloc` row selection
        Features and labels; rows of X and Y are selected with the same indices.
    model : estimator exposing `fit(X, y)` (returning the fitted estimator)
        and `predict(X)`. The same instance is re-fitted on every fold.
    K : int
        Number of folds.

    Returns
    -------
    The sum of the per-fold `BCR_score` values divided by K.
    """
    # Fixed seed so that every model is scored on the same folds.
    splitter = KFold(n_splits=K, shuffle=True, random_state=8)
    fold_scores = []
    for fit_rows, eval_rows in splitter.split(X):
        fitted = model.fit(X.iloc[fit_rows], Y.iloc[fit_rows])
        predicted = fitted.predict(X.iloc[eval_rows])
        fold_scores.append(BCR_score(Y.iloc[eval_rows], predicted))
    return sum(fold_scores) / K

# Same as  balanced_accuracy_score(y_true, y_pred), Compute the BCR
def BCR_score(y_true, y_pred):
    """Compute the balanced classification rate (BCR) of a binary prediction.

    The value -1 is treated as one class and every other value as the second
    class. The score is the mean of the two per-class recalls; the recall of
    a class with no true sample counts as 0.

    Parameters
    ----------
    y_true : array-like (list, NumPy array, pandas Series, ...)
        True labels.
    y_pred : array-like of the same length
        Predicted labels.

    Returns
    -------
    float
        0.5 * (recall of class -1 + recall of the other class).

    Raises
    ------
    ValueError
        If `y_true` and `y_pred` do not have the same number of elements.
    """
    # Work on plain arrays so that elements are always paired by position.
    # Indexing a pandas Series with `series[i]` is label-based for integer
    # indexes, which fails or pairs the wrong rows once the data is shuffled.
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError(
            "y_true and y_pred must have the same length, got %d and %d"
            % (true_arr.shape[0], pred_arr.shape[0])
        )

    true_is_neg1 = true_arr == -1
    pred_is_neg1 = pred_arr == -1

    TP = int(np.count_nonzero(true_is_neg1 & pred_is_neg1))
    FN = int(np.count_nonzero(true_is_neg1 & ~pred_is_neg1))
    FP = int(np.count_nonzero(~true_is_neg1 & pred_is_neg1))
    TN = int(np.count_nonzero(~true_is_neg1 & ~pred_is_neg1))

    # Guard against a class that is absent from y_true (division by zero).
    recall1 = TP / (TP + FN) if TP + FN != 0 else 0
    recall2 = TN / (FP + TN) if FP + TN != 0 else 0
    return (1 / 2) * (recall1 + recall2)

In [3]:
# Preprocessing
# Goal: every feature is a float (the label stays in {-1, 1}) and no missing
# value remains.
# NOTE(review): LabelEncoder numbers its classes in sorted order, so the codes
# should be "NaN"=0, "high"=1, "low"=2, "medium"=3 -- the low < medium < high
# intensity order is NOT represented. The encoding is kept unchanged because
# the scores reported further down were obtained with it.
label_encoder = preprocessing.LabelEncoder().fit(["low", "medium", "high", "NaN"])
for feature in train_df.columns:
    if train_df[feature].dtypes == "object":
        # String column: replace each category by its numeric code, as float.
        train_df[feature] = label_encoder.transform(train_df[feature]).astype("float")
        test_df[feature] = label_encoder.transform(test_df[feature]).astype("float")
# Remaining missing values are replaced by 0.0.
train_df = train_df.fillna(0.0)
test_df = test_df.fillna(0.0)

# Drop the features (columns) that are not correlated enough with the label.
# Only the correlation of each column with "label" is needed, so it is computed
# directly with corrwith() instead of building the full feature-by-feature
# matrix with train_df.corr() (noted as taking 31 minutes).
# NOTE(review): the selection uses every training row, so the cross-validated
# scores computed afterwards on X_train have already "seen" their validation
# folds during feature selection and may be optimistic.
cor_thresh = 0.1
label_cor = train_df.corrwith(train_df["label"])
# Boolean-mask selection. A NaN correlation compares False against the
# threshold, so such a column is kept (same outcome as the element-wise test).
bad_feature = label_cor.index[label_cor.abs() < cor_thresh].tolist()
train_df = train_df.drop(bad_feature, axis=1)
test_df = test_df.drop(bad_feature, axis=1)

# Outlier removal: does not work, it deletes every entry because there are too many features
# new_train_df = train_df[(np.abs(scipy.stats.zscore(train_df)) < 3).all(axis=1)]

# Weighting / data augmentation: done inside the model

# Split the training set into features X and label Y
Y_train = train_df["label"]
X_train = train_df.drop("label", axis=1)

Unnamed: 0,A3GALT2,A4GNT,AADAC,AADACL2,AADAT,AARSP1,ABCA9-AS1,ABCB10P3,ABCB10P4,ABCB4,...,ZNRF3-AS1,ZP2,ZP4,ZPLD1,ZSCAN5C,ZSWIM2,ZSWIM5P1,ZSWIM5P2,ZSWIM5P3,label
C-1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,-1
C-2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,-1
C-3,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,-1
C-4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,-1
C-5,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1


In [39]:
# Results of the different models after their optimization

# Gradient Boosting Classifier (boosting): 0.65; 0.61 with a threshold of 0.1
# print("GBC:", KF_validation(X_train, Y_train, GradientBoostingClassifier(random_state=0, loss='deviance', learning_rate=10, n_estimators=500, max_depth=22, max_features=None), 10))

# Random Forest: a 150-minute search gave nothing better than 0.57, so it was dropped
# print("RF:", KF_validation(X_train, Y_train, RandomForestRegressor(max_depth=2, random_state=1), 10))

# KNN
# best score: 0.6276460781515677 with the best params: {'weights': 'uniform', 'algorithm': 'ball_tree', 'p': 2, 'leaf_size': 5, 'n_neighbors': 5}
# print("KNN:", KF_validation(X_train, Y_train, KNeighborsRegressor(n_neighbors=2), 10))
# 0.6258795345856644 by default, 0.6023951507943568 with a threshold of 0.1

# RidgeClassifier 0.5896929196757389 {'alpha': 0.001, 'fit_intercept': True, 'normalize': True, 'class_weight': {-1: 0.2, 1: 0.8}}

# LinearRegression 0.5006329113924051
# print("Linear:", KF_validation(X_train, Y_train, LinearRegression(), 10))

# Naive Bayes: 0.810820296628652 with a threshold of 0.1
# print("Gaussian Naive Bayes:", KF_validation(X_train, Y_train, GaussianNB(), 10))

# Perceptron
# 0.664 {'penalty': None, 'max_iter': 200, 'tol': 0.0001, 'eta0': 1.0}
# print("Perceptron:", KF_validation(X_train, Y_train, Perceptron(n_jobs=-1, random_state=0, class_weight={-1: 0.2, 1: 0.8}, penalty=None, max_iter=200, tol=0.0001, eta0=1.0), 10))

# SVM:
# 0.728 with a threshold of 0.1
# print("SVM", KF_validation(X_train, Y_train, SVC(kernel='rbf', degree=16, C=5.0, gamma='auto', coef0=5.0, tol=0.1, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, class_weight={-1: 0.211, 1: 0.789}), 10))

# LOGISTIC REGRESSION
# print("Logistic regression:", KF_validation(X_train, Y_train, LogisticRegression(penalty='l2', tol=0.0001, C=1.0, solver='lbfgs', max_iter=20, n_jobs=-1, class_weight={-1: 0.211, 1: 0.789}), 10))
# 0.7706854543951407 with a threshold of 0.1


Logistic regression: 0.7706854543951407


In [38]:
# Estimate the BCR expected on the test set: the mean of one hold-out score
# and the 10-fold cross-validated score of the same logistic regression setup.
logreg_params = dict(penalty='l2', tol=0.0001, C=1.0, solver='lbfgs', max_iter=20,
                     n_jobs=-1, class_weight={-1: 0.211, 1: 0.789})

KF = KFold(n_splits=3, random_state=8, shuffle=True)
# Only the last of the three shuffled splits is used as the hold-out split.
train_index, test_index = list(KF.split(train_df))[-1]
X_train1, X_test1 = X_train.iloc[train_index], X_train.iloc[test_index]
Y_train1, Y_test1 = Y_train.iloc[train_index], Y_train.iloc[test_index]

# Hold-out score: fit on the training part, score on the held-out part.
model = LogisticRegression(**logreg_params)
model.fit(X_train1, Y_train1)
y_pred = model.predict(X_test1)
pred_BCR = BCR_score(Y_test1, y_pred)

# Cross-validated score on the whole training set, with a fresh estimator.
theo_BCR = KF_validation(X_train, Y_train, LogisticRegression(**logreg_params), 10)

BCR_prediction = (pred_BCR + theo_BCR) / 2
print(BCR_prediction)

0.7643896091082294


In [40]:
# Prediction on the test set: fit the selected model on the whole training
# set, predict the test rows and save the predictions to 'y_test.csv'.
model = LogisticRegression(penalty='l2', tol=0.0001, C=1.0, solver='lbfgs', max_iter=20, n_jobs=-1, class_weight={-1: 0.211, 1: 0.789})
model.fit(X_train, Y_train)
# Select the test columns by the training column names so that the features
# reach the model in exactly the order it was fitted on.
y_pred = model.predict(test_df[X_train.columns])
# Keep the DataFrame itself: to_csv() returns None when given a path, so
# chaining it onto the constructor left `prediction` bound to None.
prediction = pd.DataFrame(y_pred, index=test_df.index, columns=["Prediction"])
prediction.to_csv('y_test.csv')