In [1]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [2]:
train_data = pd.read_csv("./smile_description_train.csv")
test_data = pd.read_csv("./smile_description_test.csv")

In [3]:
# Replacing Null values of train and test data with 0 
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [4]:
# Split into X and y
X = train_data.drop("label",axis=1)
y = train_data["label"]

In [5]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [6]:
def create_submission(predict,filename):
    sub_file = pd.read_csv("./data/sample_submission.csv")
    sub_file["Predicted"] = predict
    sub_file.to_csv(filename,index=False)
    print(filename," Created")

In [7]:
xgb_c = xgb.XGBClassifier(random_state=42,n_estimators=600)

In [16]:
def f1_score_hist_boost(X,y,learning_rate=None):
    skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
    cv =  skf.get_n_splits(X, y)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    cv_results = cross_validate(xgb.XGBClassifier(random_state=42,n_estimators=600), X, y, cv=cv,scoring=scoring,verbose=1,n_jobs=5)
    print("F1 score with ",sum(cv_results["test_f1_score"])/5)
    print("Accuracy score with ",sum(cv_results["test_accuracy"])/5)

In [17]:
f1_score_hist_boost(X,y)
"""
estimators = 600
F1 score with  0.9484400592750294
Accuracy score with  0.9101786631226529
"""

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 30.7min remaining: 46.0min


F1 score with  0.9484400592750294
Accuracy score with  0.9101786631226529


[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 30.7min finished


'\nlearning_rate = 0.01\nF1 score with  0.9291648737368355\nAccuracy score with  0.8697982319925487\nlearning_rate = 0.01 max depth\nF1 score with  0.9291648737368355\nAccuracy score with  0.8697982319925487\nlearning_rate = 0.1\nF1 score with  0.9291648737368355\nAccuracy score with  0.8697982319925487\nlearning_rate = 0.4\nF1 score with  0.9291648737368355\nAccuracy score with  0.8697982319925487\nestimators = 600\nF1 score with  0.9291931191638572\nAccuracy score with  0.8698380268279857\n'

In [18]:
xgb_c.fit(X,y)

In [19]:
predict = xgb_c.predict(test_data)

In [20]:
predict

array([1, 1, 1, ..., 1, 1, 0])

In [21]:
predict_real = label_encoder.inverse_transform(predict)

In [22]:
predict_real

array([2, 2, 2, ..., 2, 2, 1])

In [23]:
import numpy as np
def get_count_of_ones_and_twos(predict):
    print("Number of predicted ones",np.count_nonzero(predict==1))
    print("Number of predicted twos",np.count_nonzero(predict==2))

In [25]:
get_count_of_ones_and_twos(predict_real)

Number of predicted ones 1307
Number of predicted twos 9687


In [26]:
create_submission(predict_real,"submission_2_feb_26.csv")

submission_2_feb_26.csv  Created


# Altering learning rate 

In [31]:
def f1_score_xgb_boost(X,y,learning_rate=None):
    skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
    cv =  skf.get_n_splits(X, y)
    scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
    cv_results = cross_validate(xgb.XGBClassifier(random_state=42,n_estimators=600,eta=learning_rate), X, y, cv=cv,scoring=scoring,verbose=1,n_jobs=-1)
    print("F1 score with ",sum(cv_results["test_f1_score"])/5)
    print("Accuracy score with ",sum(cv_results["test_accuracy"])/5)

In [33]:
f1_score_xgb_boost(X,y,0.2)
"""
estimators = 600
learning rate = 0.2
F1 score with  0.9490000332123933
Accuracy score with  0.9109347790742113
"""

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 10.7min remaining: 16.0min


F1 score with  0.9490000332123933
Accuracy score with  0.9109347790742113


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.7min finished


'\nestimators = 600\nlearning rate = 0.2\nF1 score with  0.9484400592750294\nAccuracy score with  0.9101786631226529\n'

In [34]:
xgb_c = xgb.XGBClassifier(random_state=42,n_estimators=600,eta=0.2)

In [35]:
xgb_c.fit(X,y)

In [36]:
predict = xgb_c.predict(test_data)

In [37]:
predict_real = label_encoder.inverse_transform(predict)

In [38]:
get_count_of_ones_and_twos(predict_real)

Number of predicted ones 1234
Number of predicted twos 9760


In [39]:
create_submission(predict_real,"submission_1_feb_27.csv")

submission_1_feb_27.csv  Created
