In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA



In [88]:
# train and test data frame
train_df = pd.read_csv("ML-A5-2022_train.csv", index_col=0)
test_df = pd.read_csv("ML-A5-2022_test.csv", index_col=0)


x = train_df.drop("label", axis=1)
y = train_df["label"]

print(train_df.shape)
print(test_df.shape)

(1000, 34980)
(500, 34979)


### DATA EXPLORATION ###

In [89]:
y.head()

C-1   -1
C-2   -1
C-3   -1
C-4   -1
C-5    1
Name: label, dtype: int64

In [90]:
x.head()

Unnamed: 0,A1BG,A1CF,A2M-AS1,A2ML1,A2ML1-AS1,A2ML1-AS2,A3GALT2,A4GALT,A4GNT,AACS,...,ZSWIM8-AS1,ZSWIM9,ZUP1,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZZEF1,hsa-mir-423
C-1,0.0,0.0,low,,low,,,0.0,0.0,0.0,...,0.0,,low,0.0,0.0,0.0,0.0,0.0,medium,0.0
C-2,0.0,0.0,high,0.0,low,0.0,0.0,0.0,,0.0,...,0.0,,low,0.0,,,0.0,0.0,low,
C-3,0.0,0.0,low,0.0,low,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,low,19.0,1.0,0.0,0.0,0.0,low,0.0
C-4,0.0,,low,0.0,low,0.0,0.0,0.0,0.0,0.0,...,0.0,,low,,0.0,0.0,0.0,,low,0.0
C-5,23.0,0.0,low,0.0,low,,0.0,0.0,,0.0,...,0.0,,low,0.0,0.0,0.0,0.0,0.0,low,0.0


In [91]:
x.describe()

Unnamed: 0,A1BG,A1CF,A2ML1,A2ML1-AS2,A3GALT2,A4GALT,A4GNT,AACS,AADAC,AADACL2,...,ZSWIM7,ZSWIM8,ZSWIM8-AS1,ZSWIM9,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,hsa-mir-423
count,995.0,937.0,743.0,709.0,792.0,677.0,792.0,651.0,536.0,948.0,...,498.0,614.0,688.0,503.0,774.0,656.0,689.0,957.0,948.0,906.0
mean,0.537688,0.078975,0.034993,0.015515,0.0,0.039882,0.0,5.697389,0.0,0.0,...,8.415663,8.568404,0.031977,0.393638,0.905685,2.53811,2.865022,0.00627,3.081224,0.00883
std,4.765036,1.774148,0.820139,0.208673,0.0,0.723571,0.0,35.401812,0.0,0.0,...,34.778024,37.449098,0.298703,5.888753,7.899606,21.349046,19.365384,0.111861,26.116722,0.169267
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,120.0,53.0,22.0,4.0,0.0,17.0,0.0,518.0,0.0,0.0,...,391.0,423.0,5.0,127.0,114.0,417.0,344.0,3.0,548.0,4.0


In [92]:
print(f"Numerical features: {list(x.select_dtypes(exclude=object).columns)}")
print(f"LEN Numerical features: {len(list(x.select_dtypes(exclude=object).columns))}")

print(f"Categorical features: {list(x.select_dtypes("object").columns)}")
print(f"LEN Categorical features: {len(list(x.select_dtypes("object").columns))}")

print(f'Total number of features = {len(list(x.select_dtypes(exclude=object).columns)) + len(list(x.select_dtypes("object").columns)) }')


Numerical features: ['A1BG', 'A1CF', 'A2ML1', 'A2ML1-AS2', 'A3GALT2', 'A4GALT', 'A4GNT', 'AACS', 'AADAC', 'AADACL2', 'AADACL3', 'AADACL4', 'AADAT', 'AAK1', 'AAMP', 'AANAT', 'AARD', 'AARS2', 'AARSD1', 'AARSP1', 'AATBC', 'AATK', 'ABBA01000935.2', 'ABCA10', 'ABCA12', 'ABCA13', 'ABCA17P', 'ABCA2', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', 'ABCA7', 'ABCA8', 'ABCA9', 'ABCB10P1', 'ABCB10P3', 'ABCB10P4', 'ABCB4', 'ABCB5', 'ABCB6', 'ABCB8', 'ABCC1', 'ABCC10', 'ABCC12', 'ABCC2', 'ABCC3', 'ABCC4', 'ABCC5', 'ABCC5-AS1', 'ABCC6P1', 'ABCC6P2', 'ABCC8', 'ABCC9', 'ABCD1', 'ABCD1P2', 'ABCD1P4', 'ABCD2', 'ABCD3', 'ABCF2P1', 'ABCF3', 'ABCG2', 'ABCG4', 'ABCG5', 'ABCG8', 'ABHD1', 'ABHD11', 'ABHD11-AS1', 'ABHD12B', 'ABHD13', 'ABHD14A-ACY1', 'ABHD17A', 'ABHD17AP1', 'ABHD17AP3', 'ABHD17AP4', 'ABHD17AP6', 'ABHD17B', 'ABHD17C', 'ABHD5', 'ABHD6', 'ABHD8', 'ABI1P1', 'ABI3BP', 'ABITRAMP1', 'ABL1', 'ABLIM3', 'ABR', 'ABRA', 'ABRAXAS2', 'ABT1', 'AC000035.1', 'AC000036.1', 'AC000058.1', 'AC000061.1', 'AC000065.2', 'AC000068

In [93]:
x_int = x.select_dtypes(exclude=object)
x_object = x.select_dtypes("object")


In [94]:
object_elem = {"low"   : 0,
               "medium": 1,
               "high"  : 2}

def mapping_object(df, object_elem):

    df = df.replace(object_elem)

    return df

In [95]:
x_object = mapping_object(x_object, object_elem)
x_int = pd.DataFrame(x_int, columns=x_int.columns)

  df = df.replace(object_elem)


In [96]:
count_na = (x_int.shape[0] - x_int.count()) 
for i in range(len(count_na.index)):
    print(f"Feature: {count_na.index[i]} => has {count_na.iloc[i] } NaN values.")

Feature: A1BG => has 5 NaN values.
Feature: A1CF => has 63 NaN values.
Feature: A2ML1 => has 257 NaN values.
Feature: A2ML1-AS2 => has 291 NaN values.
Feature: A3GALT2 => has 208 NaN values.
Feature: A4GALT => has 323 NaN values.
Feature: A4GNT => has 208 NaN values.
Feature: AACS => has 349 NaN values.
Feature: AADAC => has 464 NaN values.
Feature: AADACL2 => has 52 NaN values.
Feature: AADACL3 => has 74 NaN values.
Feature: AADACL4 => has 311 NaN values.
Feature: AADAT => has 245 NaN values.
Feature: AAK1 => has 369 NaN values.
Feature: AAMP => has 51 NaN values.
Feature: AANAT => has 56 NaN values.
Feature: AARD => has 660 NaN values.
Feature: AARS2 => has 201 NaN values.
Feature: AARSD1 => has 581 NaN values.
Feature: AARSP1 => has 311 NaN values.
Feature: AATBC => has 470 NaN values.
Feature: AATK => has 298 NaN values.
Feature: ABBA01000935.2 => has 547 NaN values.
Feature: ABCA10 => has 10 NaN values.
Feature: ABCA12 => has 498 NaN values.
Feature: ABCA13 => has 311 NaN values.


In [97]:
def median_NA_values(df, columns_name):
    #replace NaN value by mean of the feature column
    my_imputer = SimpleImputer()
    df = pd.DataFrame(my_imputer.fit_transform(df), columns=columns_name)
    return df

In [98]:
x_int = median_NA_values(x_int, x_int.columns)

In [99]:
X = pd.concat([x_int.reset_index(drop=True), x_object.reset_index(drop=True)], axis=1)

In [100]:
def mean(score_list):
    return sum(score_list)/len(score_list)

In [101]:
def make_model(classifier, x, y, random_state=42, cv_fold=10):
    clf = classifier(random_state=random_state)
    cv = cross_val_score(clf, X, y, cv=cv_fold)
    m_cv = mean(cv)
    return clf, m_cv

### BASIC MODEL SELECTION ###

In [None]:

sv, m_sv = make_model(SVC, X, y)
rm, m_rm = make_model(RandomForestClassifier, X, y)
gb, m_gb = make_model(GradientBoostingClassifier, X, y)
ada, m_ada = make_model(AdaBoostClassifier, X, y)
dtc, m_dtc = make_model(DecisionTreeClassifier, X, y)

In [103]:
print(f"SVC cross-val score: {m_sv:.3f}")
print(f"RandomForestClassifier cross-val score: {m_rm:.3f}")
print(f"GradientBoostingClassifer cross-val score: {m_gb:.3f}")
print(f"AdaBoostClassifier cross-val score: {m_ada:.3f}")
print(f"DecisionTreeClassifier cross-val score: {m_dtc:.3f}")

SVC cross-val score: 0.732
RandomForestClassifier cross-val score: 0.740
GradientBoostingClassifer cross-val score: 0.840
AdaBoostClassifier cross-val score: 0.766
DecisionTreeClassifier cross-val score: 0.744


### HYPER PARAMETERS TUNING ###

In [None]:
tuned_parameters = [
    {"random_state" : [42],
     "learning_rate" : [0.1, 0.01, 0.001],
     "n_estimators" : [10, 25, 50, 100, 150, 200, 230, ],
     "max_depth": [3, 5, 10],
     "verbose": [10]},
]
scores = ["balanced_accuracy", "accuracy"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gb = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, scoring=score, verbose=10)

    gb.fit(X, y)
    print("Best parameters set found on development set:")
    print()
    print(gb.best_params_)

In [112]:

gclf = GradientBoostingClassifier(random_state=42,learning_rate=0.01, n_estimators=1000,  verbose=10)
cv = cross_val_score(gclf, X, y, cv=5)
m_gb = mean(cv)
print(f"GradientBoostingClassifer cross-val score: {m_gb:.3f}")

      Iter       Train Loss   Remaining Time 
         1           1.1650           14.54m
         2           1.1586           13.39m
         3           1.1524           12.84m
         4           1.1463           12.41m
         5           1.1404           12.16m
         6           1.1346           11.93m
         7           1.1289           11.87m
         8           1.1230           11.89m
         9           1.1173           11.95m
        10           1.1116           12.16m
        11           1.1064           12.14m
        12           1.1010           12.28m
        13           1.0957           12.35m
        14           1.0907           12.26m
        15           1.0855           12.32m
        16           1.0806           12.37m
        17           1.0756           12.35m
        18           1.0707           12.32m
        19           1.0660           12.23m
        20           1.0613           12.13m
        21           1.0567           12.04m
        2