In [1]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [2]:
train_data = pd.read_csv("./smile_description_train.csv")
test_data = pd.read_csv("./smile_description_test.csv")

In [3]:
# Replacing Null values of train and test data with 0 
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [4]:
# Split into X and y
X = train_data.drop("label",axis=1)
y = train_data["label"]

In [5]:
def f1_score_hist_boost(X,y,learning_rate):
    skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
    cv =  skf.get_n_splits(X, y)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    cv_results = cross_validate(HistGradientBoostingClassifier(random_state=42,learning_rate=learning_rate), X, y, cv=cv,scoring=scoring)
    print("F1 score with ",sum(cv_results["test_f1_score"])/5)
    print("Accuracy score with ",sum(cv_results["test_accuracy"])/5)

In [7]:
f1_score_hist_boost(X,y,0.1)

F1 score with  0.518855471222955
Accuracy score with  0.89615692685569


In [14]:
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X,y)

In [15]:
predict = model.predict(test_data)

In [23]:
def create_submission(predict,filename):
    sub_file = pd.read_csv("./data/sample_submission.csv")
    sub_file["Predicted"] = predict
    sub_file.to_csv(filename,index=False)
    print(filename," Created")

In [17]:
create_submission(predict,"submission_2_feb_18.csv")
# F1 score with  0.518855471222955
# Accuracy score with  0.89615692685569
# Learning rate 0.1

submission_1_feb_19.csv  Created


In [30]:
f1_score_hist_boost(X,y,0.5)

F1 score with  0.5790989464768236
Accuracy score with  0.8975498006492295


In [34]:
model = HistGradientBoostingClassifier(random_state=42,learning_rate=0.5)
model.fit(X,y)

In [35]:
predict = model.predict(test_data)

In [36]:
create_submission(predict,"submission_3_feb_18.csv")
# F1 score with  0.5790989464768236
# Accuracy score with  0.8975498006492295

submission_3_feb_18.csv  Created


In [38]:
f1_score_hist_boost(X,y,0.4)

F1 score with  0.5953632304384991
Accuracy score with  0.9021264557336547


In [39]:
model = HistGradientBoostingClassifier(random_state=42,learning_rate=0.4)
model.fit(X,y)
# F1 score with  0.5953632304384991
# Accuracy score with  0.9021264557336547

In [40]:
predict = model.predict(test_data)

In [41]:
create_submission(predict,"submission_4_feb_18.csv")

submission_4_feb_18.csv  Created


# Working with max_itr parameter keeping learning rate as 0.4

In [7]:
def f1_score_hist_boost(X,y,iters):
    skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
    cv =  skf.get_n_splits(X, y)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    cv_results = cross_validate(HistGradientBoostingClassifier(random_state=42,learning_rate=0.4,max_iter=iters), X, y, cv=cv,scoring=scoring)
    print("F1 score with ",sum(cv_results["test_f1_score"])/5)
    print("Accuracy score with ",sum(cv_results["test_accuracy"])/5)
"""
With max itr 300
F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
"""
"""
With max itr 400
F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
"""
"""
With max itr 500
F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
"""

## Working with loss parameter keeping learning rate as 0.4 and max_itr=300

In [12]:
def f1_score_hist_boost(X,y,depth):
    skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
    cv =  skf.get_n_splits(X, y)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    cv_results = cross_validate(HistGradientBoostingClassifier(random_state=42,learning_rate=0.4,max_iter=300,max_depth=depth), X, y, cv=cv,scoring=scoring)
    print("F1 score with ",sum(cv_results["test_f1_score"])/5)
    print("Accuracy score with ",sum(cv_results["test_accuracy"])/5)
"""
Max depth =5
F1 score with  0.5842531441884814
Accuracy score with  0.9004019272219888
Max depth = 10
F1 score with  0.5968416781477343
Accuracy score with  0.9025243873700923
Max depth = 20
F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
Max depth = 21
F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
Max depth = 25
F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
"""

In [20]:
f1_score_hist_boost(X,y,21)

F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808


In [21]:
model = HistGradientBoostingClassifier(random_state=42,learning_rate=0.4,max_iter=300,max_depth=5)
model.fit(X,y)
predict = model.predict(test_data)

In [24]:
create_submission(predict,"submission_5_feb_18.csv")

submission_5_feb_18.csv  Created


# Trying One vs all classifier

In [25]:
from sklearn.multiclass import OneVsRestClassifier

In [26]:
def f1_score_hist_boost_one_vs_app(X,y,depth):
    skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
    cv =  skf.get_n_splits(X, y)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    cv_results = cross_validate(OneVsRestClassifier(HistGradientBoostingClassifier(random_state=42,learning_rate=0.4,max_iter=300,max_depth=depth)), X, y, cv=cv,scoring=scoring)
    print("F1 score with ",sum(cv_results["test_f1_score"])/5)
    print("Accuracy score with ",sum(cv_results["test_accuracy"])/5)

In [27]:
f1_score_hist_boost_one_vs_app(X,y,21) # No Improvement 

F1 score with  0.5989334155314646
Accuracy score with  0.9025774861591808
