# Making imports

In [10]:
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
pd.set_option('display.max_rows', 500) # Set max output 500
pd.set_option('display.max_columns',500)
import matplotlib.pyplot as plt
np.random.seed(50) # Setting random seed for reproducable results 

In [12]:
train_data = pd.read_csv("./smile_description_train.csv")
test_data = pd.read_csv("./smile_description_test.csv")
auc_scores = pd.read_csv("./AUC_VALUES.csv")

# Defining all global function

In [4]:
def get_data_info(data):
    print(data.info(verbose=-1))

In [7]:
def get_missing_values(data):
    print(test_data.isna().sum())

In [14]:
def drop_columns_and_rows_with_no_null_values(data):
    for col in data:
        missing_values = data[col].isna().sum()
        if(missing_values!=0 and col!='label'):
            data = data.drop(col,axis=1);
    data = data.dropna()
    return data;

In [18]:
def get_count_of_ones_and_twos(predict):
    print("Number of predicted ones",np.count_nonzero(predict==1))
    print("Number of predicted twos",np.count_nonzero(predict==2))

In [56]:
def create_submission(predict,filename):
    sub_file = pd.read_csv("./data/sample_submission.csv")
    sub_file["Predicted"] = predict
    sub_file.to_csv(filename,index=False)
    print(filename," Created")

# Analysing Data

In [5]:
# Getting training data info
get_data_info(train_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75383 entries, 0 to 75382
Data columns (total 210 columns):
 #    Column                    Dtype  
---   ------                    -----  
 0    MaxEStateIndex            float64
 1    MinEStateIndex            float64
 2    MaxAbsEStateIndex         float64
 3    MinAbsEStateIndex         float64
 4    qed                       float64
 5    MolWt                     float64
 6    HeavyAtomMolWt            float64
 7    ExactMolWt                float64
 8    NumValenceElectrons       float64
 9    NumRadicalElectrons       float64
 10   MaxPartialCharge          float64
 11   MinPartialCharge          float64
 12   MaxAbsPartialCharge       float64
 13   MinAbsPartialCharge       float64
 14   FpDensityMorgan1          float64
 15   FpDensityMorgan2          float64
 16   FpDensityMorgan3          float64
 17   BCUT2D_MWHI               float64
 18   BCUT2D_MWLOW              float64
 19   BCUT2D_CHGHI              float64
 20   BCUT

In [6]:
# Getting testing data info
get_data_info(test_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10994 entries, 0 to 10993
Data columns (total 209 columns):
 #    Column                    Dtype  
---   ------                    -----  
 0    MaxEStateIndex            float64
 1    MinEStateIndex            float64
 2    MaxAbsEStateIndex         float64
 3    MinAbsEStateIndex         float64
 4    qed                       float64
 5    MolWt                     float64
 6    HeavyAtomMolWt            float64
 7    ExactMolWt                float64
 8    NumValenceElectrons       float64
 9    NumRadicalElectrons       float64
 10   MaxPartialCharge          float64
 11   MinPartialCharge          float64
 12   MaxAbsPartialCharge       float64
 13   MinAbsPartialCharge       float64
 14   FpDensityMorgan1          float64
 15   FpDensityMorgan2          float64
 16   FpDensityMorgan3          float64
 17   BCUT2D_MWHI               float64
 18   BCUT2D_MWLOW              float64
 19   BCUT2D_CHGHI              float64
 20   BCUT

In [9]:
# Getting missing values of traning data
get_missing_values(train_data)

MaxEStateIndex                0
MinEStateIndex                0
MaxAbsEStateIndex             0
MinAbsEStateIndex             0
qed                           0
MolWt                         0
HeavyAtomMolWt                0
ExactMolWt                    0
NumValenceElectrons           0
NumRadicalElectrons           0
MaxPartialCharge             48
MinPartialCharge             48
MaxAbsPartialCharge          48
MinAbsPartialCharge          48
FpDensityMorgan1              0
FpDensityMorgan2              0
FpDensityMorgan3              0
BCUT2D_MWHI                 493
BCUT2D_MWLOW                493
BCUT2D_CHGHI                493
BCUT2D_CHGLO                493
BCUT2D_LOGPHI               493
BCUT2D_LOGPLOW              493
BCUT2D_MRHI                 493
BCUT2D_MRLOW                493
BalabanJ                      0
BertzCT                       0
Chi0                          0
Chi0n                         0
Chi0v                         0
Chi1                          0
Chi1n   

In [13]:
# Printing AUC values greater than 0.7
auc_scores[auc_scores["AUC_VALUES"] > 0.7]

Unnamed: 0.1,Unnamed: 0,Columns,AUC_VALUES
0,208,label,1.0
1,209,Assay_id,0.854489
2,19,BCUT2D_CHGHI,0.79178
3,21,BCUT2D_LOGPHI,0.791692
4,23,BCUT2D_MRHI,0.791675
5,20,BCUT2D_CHGLO,0.791218
6,34,Chi2v,0.790972
7,24,BCUT2D_MRLOW,0.790303
8,22,BCUT2D_LOGPLOW,0.790149
9,36,Chi3v,0.789833


In [15]:
train_data = drop_columns_and_rows_with_no_null_values(train_data)

In [106]:
model1 = RandomForestClassifier(n_estimators=200,random_state=50)
model2 = GradientBoostingClassifier()

# Select features and use model for training

In [107]:
X = train_data[["Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt"]]
y = train_data["label"]

In [108]:
model1.fit(X,y)

In [104]:
predict = model1.predict(test_data[["Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt","C]])

In [105]:
get_count_of_ones_and_twos(predict)

Number of predicted ones 1312
Number of predicted twos 9682


Submission 18 using 100 estimators Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt (0.73255)

Submission 19 using kappa3 (0.72833)


Submission 20 using Kappa1 but increasing estimators to 150 and adding "Chi3v","qed" (0.70201)

Submission 21 increasing estimators to 200 and adding "Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt (0.73411)

Submission 22 increasing estimators to 250 and adding "Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt (0.73411)

In [93]:
create_submission(predict,"submission_3_feb_5.csv")

submission_3_feb_4.csv  Created


In [94]:
X

Unnamed: 0,Assay_id,MaxAbsEStateIndex,MolWt,Kappa1,HeavyAtomMolWt
0,1644,9.316200,317.599,14.321871,306.511
1,2451,10.532611,156.269,10.670000,136.109
2,1384,2.433032,362.086,26.478759,313.702
3,16,10.355080,255.665,11.591174,245.585
4,1856,0.000000,149.894,9.288422,149.894
...,...,...,...,...,...
75378,33,11.460021,230.245,11.315529,220.165
75379,1632,5.928972,313.747,15.856180,296.611
75380,1373,4.975926,167.258,6.066958,162.218
75381,2,10.241948,128.215,8.670000,112.087


## Feature scaling using standard scalar

In [133]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([
        ('somename', StandardScaler(), ["MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt"])
    ], remainder='passthrough')
scaler = StandardScaler()

In [134]:
X = train_data[["Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt"]]
y = train_data["label"]

In [135]:
scaled_X_data = ct.fit_transform(X)

In [136]:
pd.DataFrame(scaled_test_data)

Unnamed: 0,0,1,2,3,4
0,-0.202886,-0.891132,-0.773340,-0.930403,1682.0
1,0.764461,1.077345,0.920223,1.118381,1656.0
2,1.496441,3.028199,2.563758,2.978078,36.0
3,0.066909,-0.618584,-0.428041,-0.552565,1850.0
4,0.897854,0.982473,1.163326,0.856346,30.0
...,...,...,...,...,...
10989,0.893030,0.907345,1.008143,0.925260,38.0
10990,0.473451,0.258819,0.130786,0.270994,34.0
10991,0.741731,0.708394,0.607431,0.748440,1640.0
10992,0.272781,-0.473220,-0.473379,-0.423972,28.0


In [137]:
scaled_test_data = ct.fit_transform(test_data[["Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt"]])

In [138]:
model1.fit(scaled_trained_data,y)

In [139]:
predict = model1.predict(scaled_test_data)

In [141]:
get_count_of_ones_and_twos(predict)

Number of predicted ones 943
Number of predicted twos 10051


In [140]:
create_submission(predict,"submission_3_feb_5.csv")
# Scaling data has reduced accuracy to 0.66949 submission 22

submission_3_feb_5.csv  Created


# Working with Random forest parameter tuning

- Submission 23
    - Number of estimators increases to 300 submission 22 accuracy 73.411%
- Submission 24
- Submission 25
- Submission 26
- Submission 27

In [235]:
X = train_data[["Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt"]]
y = train_data["label"]

In [244]:
model = RandomForestClassifier(n_estimators=300,random_state=50)

In [245]:
model.fit(X, y)

In [246]:
predict = model.predict(test_data[["Assay_id","MaxAbsEStateIndex","MolWt","Kappa1","HeavyAtomMolWt"]])

In [247]:
get_count_of_ones_and_twos(predict)

Number of predicted ones 1312
Number of predicted twos 9682


In [248]:
create_submission(predict,"submission_3_feb_6.csv")
# Number of estimators increases to 300 submission 22 accuracy 73.411%

submission_3_feb_6.csv  Created


# Try other features and take a break 