In [54]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import VarianceThreshold

In [55]:
# Load the training and test descriptor tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./smile_description_train.csv",
        "./smile_description_test.csv",
    )
)

In [56]:
# Impute every missing value with 0 in both the training and the test frame.
train_data, test_data = train_data.fillna(0), test_data.fillna(0)

In [57]:
# Separate the target column ("label") from the predictor columns.
y = train_data["label"]
X = train_data.drop(columns="label")

In [103]:
# Feature selection with variance threshold: drop low-variance features
# (threshold=80), fitting the selector on the training features only.
selector = VarianceThreshold(threshold=80)
features_train = selector.fit_transform(X)
# Pick the surviving features by *name* rather than by position, so the test
# frame receives exactly the columns chosen on the training frame even when
# its column layout differs from X. A selected column that is missing from the
# test frame raises a KeyError instead of silently selecting a different one.
selected_columns = X.columns[selector.get_support()]
X = X[selected_columns]
test_data = test_data[selected_columns]

In [59]:
# Display the feature matrix as it stands at this point in the notebook.
X

Unnamed: 0,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,BertzCT,Ipc,Kappa3,LabuteASA,PEOE_VSA6,PEOE_VSA7,...,SlogP_VSA2,SlogP_VSA5,SlogP_VSA6,TPSA,EState_VSA1,EState_VSA7,EState_VSA8,VSA_EState1,MolMR,Assay_id
0,317.599,306.511,315.982463,100.0,498.898914,1.538606e+04,3.470908,125.716648,59.068288,35.392371,...,14.005591,17.044809,48.530937,40.46,9.710442,0.000000,0.000000,-1.533785,78.3466,1644
1,156.269,136.109,156.151415,66.0,97.045900,2.408959e+02,9.670000,70.185252,39.027845,13.344559,...,5.783245,58.793226,0.000000,17.07,0.000000,0.000000,6.923737,0.000000,48.6740,2451
2,362.086,313.702,361.347528,148.0,206.009804,8.463568e+04,20.294944,160.525934,90.897334,25.683286,...,31.667888,116.580620,0.000000,0.00,0.000000,0.000000,27.942818,1.241736,107.0624,1384
3,255.665,245.585,255.052302,90.0,447.204365,9.370081e+03,3.035300,102.236385,23.093098,11.629819,...,33.965269,5.563451,23.321982,83.66,5.032314,6.066367,15.402175,0.000000,62.0891,16
4,149.894,149.894,149.894242,8.0,2.000000,0.000000e+00,0.308035,52.925670,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0000,1856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,230.245,220.165,230.036128,82.0,452.034943,1.764694e+03,2.844935,87.086883,0.000000,24.265468,...,21.620835,0.000000,29.160952,98.49,16.116531,0.000000,4.736863,28.809354,53.5809,33
75379,313.747,296.611,313.041677,104.0,430.692393,8.130291e+03,4.172492,116.836709,0.000000,39.295889,...,27.978227,33.736790,0.000000,58.40,6.718607,27.694949,10.082660,17.649703,73.5820,1632
75380,167.258,162.218,166.986341,50.0,362.898353,3.408015e+02,0.874151,67.331859,12.132734,24.350608,...,4.983979,0.000000,24.265468,15.79,0.000000,18.199101,11.050346,2.077500,47.0097,1373
75381,128.215,112.087,128.120115,54.0,68.967887,9.931601e+01,4.191512,57.455368,26.689118,12.841643,...,6.286161,39.530761,0.000000,17.07,0.000000,0.000000,13.847474,0.000000,39.3700,2


In [60]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the original label column rather than on `y` itself.
# `y` is overwritten with the encoded values below, so fitting on `y` would,
# when this cell is re-run, learn the already-encoded classes and make
# `label_encoder.inverse_transform` return encoded values instead of the
# original labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_data["label"])

In [61]:
def create_submission(predict,filename,sample_path="./data/sample_submission.csv"):
    """Write a submission CSV based on the sample-submission template.

    Parameters
    ----------
    predict : values assigned to the template's ``Predicted`` column.
    filename : path of the CSV file to write (written without the index).
    sample_path : path of the sample-submission CSV used as the template;
        defaults to the location the notebook originally hard-coded.

    Prints a confirmation line once the file has been written.
    """
    sub_file = pd.read_csv(sample_path)
    sub_file["Predicted"] = predict
    sub_file.to_csv(filename,index=False)
    print(filename," Created")

In [62]:
# Initial XGBoost classifier: 600 boosting rounds, fixed seed.
# NOTE(review): `xgb_c` is reassigned in a later cell before it is fitted.
xgb_c = xgb.XGBClassifier(
    n_estimators=600,
    random_state=42,
)

In [63]:
def f1_score_xg_boost(X,y,learning_rate=None,n_estimators=900,n_splits=5):
    """Cross-validate an XGBoost classifier and print its mean F1 and accuracy.

    Parameters
    ----------
    X : feature matrix handed to ``cross_validate``.
    y : target values aligned with ``X``; precision, recall and F1 are scored
        with scikit-learn's default settings.
    learning_rate : forwarded to ``XGBClassifier`` as ``eta``.
    n_estimators : number of boosting rounds (default 900).
    n_splits : number of stratified folds (default 5).

    Returns
    -------
    The dictionary produced by ``cross_validate`` (fit/score times plus the
    per-fold ``test_accuracy``, ``test_precision``, ``test_recall`` and
    ``test_f1_score`` arrays).
    """
    # Hand the splitter object itself to cross_validate. `get_n_splits()` only
    # returns the fold count, and an integer `cv` makes cross_validate build
    # its own splitter, which drops the shuffle and random_state set here.
    skf = StratifiedKFold(n_splits=n_splits,random_state=42,shuffle=True)
    scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}
    model = xgb.XGBClassifier(random_state=42,n_estimators=n_estimators,eta=learning_rate)
    cv_results = cross_validate(model, X, y, cv=skf,scoring=scoring,verbose=3,n_jobs=-1)
    # Average over however many folds were actually run instead of assuming 5.
    print("F1 score with ",cv_results["test_f1_score"].mean())
    print("Accuracy score with ",cv_results["test_accuracy"].mean())
    return cv_results

In [64]:
import numpy as np
def get_count_of_ones_and_twos(predict):
    """Print how many entries of ``predict`` equal 1 and how many equal 2.

    Parameters
    ----------
    predict : array-like of predicted labels (NumPy array, pandas Series,
        list, ...).
    """
    # Coerce to an array first: comparing a plain Python list with `== 1`
    # yields a single False rather than an element-wise mask, which would
    # silently report zero for both counts.
    labels = np.asarray(predict)
    print("Number of predicted ones",np.count_nonzero(labels==1))
    print("Number of predicted twos",np.count_nonzero(labels==2))

In [None]:
# Cross-validate with a learning rate of 0.35.
# The string literal below appears to be a hand-kept log of earlier runs.
f1_score_xg_boost(X, y, learning_rate=0.35)
"""
n_estimators = 700
learning_rate = 0.35
F1 score with  0.9483140858093458
Accuracy score with  0.9100460248164841
n_estimators = 900
learning_rate = 0.4
F1 score with  0.9483140858093458
Accuracy score with  0.9100460248164841
"""

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [105]:
# Classifier used for the final fit: 700 boosting rounds, eta 0.35, fixed seed.
xgb_c = xgb.XGBClassifier(
    n_estimators=700,
    eta=0.35,
    random_state=42,
)

In [106]:
# Fit on the full training set, predict the test set, then map the encoded
# predictions back to the original label values.
predict = xgb_c.fit(X, y).predict(test_data)
predict_real = label_encoder.inverse_transform(predict)

In [107]:
# Report how many test predictions fall into each of the two original labels.
get_count_of_ones_and_twos(predict=predict_real)

Number of predicted ones 1329
Number of predicted twos 9665
