In [1]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./smile_description_train.csv", "./smile_description_test.csv")
)

In [3]:
# Missing values in both the train and test tables are replaced by 0.
train_data, test_data = (frame.fillna(0) for frame in (train_data, test_data))

In [4]:
# Separate the target column ("label") from the feature columns.
y = train_data["label"]
X = train_data.drop(columns=["label"])

In [5]:
from sklearn.preprocessing import LabelEncoder
# Encode the raw label values as consecutive integers starting at 0. The fitted
# encoder is kept so that predictions can later be mapped back to the original
# label values with inverse_transform.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [6]:
def create_submission(predict,filename):
    """Fill the sample-submission template with predictions and save it.

    Reads ``./data/sample_submission.csv``, sets its ``Predicted`` column to
    ``predict``, writes the result to ``filename`` without the index column,
    and prints a confirmation line.

    Parameters
    ----------
    predict : values assigned to the ``Predicted`` column.
    filename : path of the CSV file to write.
    """
    submission = pd.read_csv("./data/sample_submission.csv").assign(Predicted=predict)
    submission.to_csv(filename, index=False)
    print(filename," Created")

In [52]:
def f1_score_xg_boost(X,y,learning_rate=None,n_estimators=500,n_splits=5):
    """Cross-validate an XGBoost classifier and print mean F1 and accuracy.

    Parameters
    ----------
    X : feature matrix passed to ``cross_validate``.
    y : target labels; the default binary precision/recall/F1 scorers are used.
    learning_rate : value forwarded to ``XGBClassifier`` as ``eta``.
    n_estimators : number of boosting rounds (default 500, the value that was
        previously hard-coded).
    n_splits : number of stratified folds (default 5, as before).

    Prints the mean test F1 score and mean test accuracy across the folds.
    Returns nothing.
    """
    # Pass the splitter object itself. The previous code passed
    # skf.get_n_splits(...), which is just the integer fold count, so
    # cross_validate fell back to unshuffled stratified folds and the
    # shuffle/random_state settings were silently ignored.
    skf = StratifiedKFold(n_splits=n_splits,random_state=42,shuffle=True)
    scoring = {'accuracy' : make_scorer(accuracy_score),
               'precision' : make_scorer(precision_score),
               'recall' : make_scorer(recall_score),
               'f1_score' : make_scorer(f1_score)}
    model = xgb.XGBClassifier(random_state=42,n_estimators=n_estimators,eta=learning_rate)
    cv_results = cross_validate(model, X, y, cv=skf,scoring=scoring,verbose=3,n_jobs=-1)
    # Average over however many folds were actually run instead of a fixed 5.
    print("F1 score with ",cv_results["test_f1_score"].mean())
    print("Accuracy score with ",cv_results["test_accuracy"].mean())

In [61]:
import numpy as np
def get_count_of_ones_and_twos(predict):
    """Print how many predictions equal 1 and how many equal 2.

    Parameters
    ----------
    predict : array-like of predicted labels (NumPy array, pandas Series,
        list, or tuple).
    """
    # Convert to an array first: comparing a plain list with ``== 1`` yields a
    # single False rather than an element-wise mask, which made both counts 0.
    labels = np.asarray(predict)
    print("Number of predicted ones",np.count_nonzero(labels==1))
    print("Number of predicted twos",np.count_nonzero(labels==2))

In [10]:
# The cross-validation call is kept commented out; the string below records the
# scores from earlier manual runs on the full feature set.
# NOTE(review): the recorded n_estimators values (700, 900) differ from the 500
# that f1_score_xg_boost(X,y,0.35) would use, so these numbers presumably came
# from edited versions of that function -- confirm before relying on them.
# f1_score_xg_boost(X,y,0.35)
"""
n_estimators = 700
learning_rate = 0.35
F1 score with  0.9483140858093458
Accuracy score with  0.9100460248164841
n_estimators = 900
learning_rate = 0.4
F1 score with  0.9472493912457081
Accuracy score with  0.9082816785922123
"""

'\nn_estimators = 700\nlearning_rate = 0.35\nF1 score with  0.9483140858093458\nAccuracy score with  0.9100460248164841\nn_estimators = 900\nlearning_rate = 0.4\nF1 score with  0.9472493912457081\nAccuracy score with  0.9082816785922123\n'

In [8]:
# Model on the full feature set. n_estimators=700 and eta=0.35 match the
# better-scoring of the two configurations recorded in the cross-validation notes.
xgb_c = xgb.XGBClassifier(random_state=42,n_estimators=700,eta=0.35)

In [9]:
xgb_c.fit(X,y)

In [19]:
len(xgb_c.feature_importances_)

209

In [20]:
# Per-feature importance from the underlying booster; 'weight' is the number of
# times a feature is used to split the data across all trees. Features that are
# never used in a split are omitted from the returned dict, which is why its
# length can be smaller than len(xgb_c.feature_importances_).
boost_scores = xgb_c.get_booster().get_score(importance_type='weight')
print(len(boost_scores))

186


In [30]:
# Split the importance dict into parallel lists: feature names and their scores.
column_names = list(boost_scores)
scores = [boost_scores[feature] for feature in column_names]

In [33]:
column_names

['MaxEStateIndex',
 'MinEStateIndex',
 'MinAbsEStateIndex',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',
 'SMR_VSA7',
 'SMR_VSA9',
 'SlogP_VSA1',


In [44]:
# Importance table: one row per feature, ordered from highest to lowest score.
booster_scres = (
    pd.DataFrame({"columns": column_names, "values": scores})
    .sort_values(by="values", ascending=False)
)

In [None]:
booster_scres

In [92]:
# Keep the first 50 rows of the importance-sorted table, i.e. the 50 features
# with the highest scores. .iloc makes the slice explicitly positional: after
# sorting, the integer index labels are out of order, so a bare [0:50] is easy
# to misread as a label-based selection.
important_columns = booster_scres["columns"].iloc[:50]

In [93]:
important_columns

185               Assay_id
3                      qed
0           MaxEStateIndex
2        MinAbsEStateIndex
1           MinEStateIndex
117                MolLogP
42                  Kappa3
23            BCUT2D_MRLOW
97             VSA_EState8
55               PEOE_VSA7
13        FpDensityMorgan1
10        MinPartialCharge
24                BalabanJ
17            BCUT2D_MWLOW
93             VSA_EState4
22             BCUT2D_MRHI
94             VSA_EState5
15        FpDensityMorgan3
21          BCUT2D_LOGPLOW
96             VSA_EState7
14        FpDensityMorgan2
56               PEOE_VSA8
19            BCUT2D_CHGLO
41                  Kappa2
83             EState_VSA4
20           BCUT2D_LOGPHI
9         MaxPartialCharge
18            BCUT2D_CHGHI
82             EState_VSA3
54               PEOE_VSA6
92             VSA_EState3
25                 BertzCT
16             BCUT2D_MWHI
11     MaxAbsPartialCharge
84             EState_VSA5
33                   Chi2v
71              SlogP_VSA2
9

In [94]:
# Restrict both the train and test tables to the selected feature columns.
train_new, test_new = (frame[important_columns] for frame in (train_data, test_data))

In [95]:
# Features: the reduced training table. Target: integer-encoded labels.
X = train_new
y = label_encoder.fit_transform(train_data["label"])

In [96]:
# The cross-validation call is kept commented out; the string below records the
# scores from an earlier manual run on a reduced feature set.
# NOTE(review): the note says "top -20 features", while important_columns in
# this notebook is built from the first 50 rows of the importance table --
# confirm which feature count produced these scores.
# f1_score_xg_boost(X,y,0.3)
"""
eta=0.3
n_est = 500
F1 score with  0.9478507545289233
Accuracy score with  0.9091439648453232
top -20 features
-------

"""

'\neta=0.3\nn_est = 500\nF1 score with  0.9478507545289233\nAccuracy score with  0.9091439648453232\ntop -20 features\n-------\n\n'

In [101]:
# Refit a fresh classifier on the selected feature columns only.
# NOTE(review): eta=0.35 is used here, whereas the recorded cross-validation
# note for the reduced feature set lists eta=0.3 -- confirm which is intended.
xgb_c = xgb.XGBClassifier(random_state=42,n_estimators=500,eta=0.35)
xgb_c.fit(X,y)

In [102]:
# Predict encoded classes for the test table, then map them back to the
# original label values with the fitted encoder.
predict = xgb_c.predict(test_new)
predict_real = label_encoder.inverse_transform(predict)

In [103]:
get_count_of_ones_and_twos(predict_real)

Number of predicted ones 1293
Number of predicted twos 9701


In [100]:
# NOTE(review): this cell's execution count (100) is lower than those of the
# fit/predict cells before it (101-103), so the saved file may have been written
# from an earlier predict_real -- re-run top to bottom before submitting.
create_submission(predict_real,"submission_3_mar_19.csv")

submission_3_mar_19.csv  Created
