In [1]:
import pandas as pd
import re
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
from sklearn import tree
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint
from pprint import pprint
#from google.colab import drive
#drive.mount('/content/drive')
from dataset_tool import load_data

# Data loading

In [3]:
# Locations of the feature (X) and label (Y) CSV files.
x_path, y_path = "X_train_update.csv", "Y_train_CVw08PX.csv"

# load_data is imported from the project-local dataset_tool module; its four
# return values are unpacked here as train/test features and train/test labels.
# NOTE(review): any preprocessing and the split ratio live in load_data -- confirm there.
X_train, X_test, y_train, y_test = load_data(x_path, y_path)

In [0]:
# Registry of results: model name -> weighted F1 score on the test split.
weighted_f1_scores = dict()

# Models

## Decision tree

Weighted F1 score on test set: 0.69


In [0]:
# Baseline: a single decision tree with default hyperparameters.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree (and its score) can change from
# one run to the next.
clf = tree.DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)

# Evaluate on the held-out split and record the weighted F1 score.
y_pred = clf.predict(X_test)
weighted_f1_scores["decision tree"] = f1_score(y_test, y_pred, average="weighted")

## Bagging
Weighted F1 score: 0.70

In [0]:
# Bagging ensemble of decision trees; each base tree is fitted on a random
# draw of 50% of the training samples (max_samples=0.5).
# random_state is fixed so that the resampling -- and therefore the reported
# score -- is reproducible across runs.
clf_bag = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    max_samples=0.5,
    random_state=1,
)
clf_bag.fit(X_train, y_train)

# Evaluate on the held-out split and record the weighted F1 score.
y_pred_bag = clf_bag.predict(X_test)
weighted_f1_scores["bagging"] = f1_score(y_test, y_pred_bag, average="weighted")

## Random forest
Weighted F1 score: 0.73 (without hyperparameter search)


In [0]:
# Sampling distributions for the random-forest hyperparameter search.
model_params = dict(
    # Integers from 4 to 199 (randint's upper bound is exclusive).
    n_estimators=randint(4, 200),
    # truncnorm's a/b are clip points in units of `scale` relative to `loc`,
    # so this samples fractions in [0.25, 0.35].
    # NOTE(review): if the intent was a normal(0.25, 0.1) clipped to [0, 1],
    # a and b would have to be (0 - loc) / scale and (1 - loc) / scale -- confirm.
    max_features=truncnorm(a=0, b=1, loc=0.25, scale=0.1),
    # uniform(loc, scale) covers [loc, loc + scale], i.e. [0.01, 0.209].
    min_samples_split=uniform(loc=0.01, scale=0.199),
)

In [0]:
# Randomized hyperparameter search for a random forest: 10 candidates drawn
# from model_params, each evaluated with 5-fold cross-validation on all cores.
# - The forest itself is seeded as well: random_state on the search only fixes
#   which candidates are sampled, not how each forest is grown.
# - Candidates are ranked by weighted F1 (scoring="f1_weighted") so that model
#   selection uses the same metric that is reported on the test set, instead
#   of the estimator's default accuracy score.
rf_model = RandomForestClassifier(random_state=1)
clf_rf = RandomizedSearchCV(
    rf_model,
    model_params,
    n_iter=10,
    cv=5,
    scoring="f1_weighted",
    random_state=1,
    n_jobs=-1,
)
# fit() returns the search object itself, so `model` is an alias of clf_rf.
model = clf_rf.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [0]:
# Show the hyperparameters of the best forest found by the random search.
# They are read from clf_rf rather than `model`: `model` is rebound to the
# AdaBoost classifier further down, which has no best_estimator_ attribute,
# so this cell would fail if it were re-run after the AdaBoost cells.
pprint(clf_rf.best_estimator_.get_params())

In [0]:
# Score the tuned random forest on the held-out split and record the result.
y_pred_rf = clf_rf.predict(X_test)
weighted_f1_scores["random forest classifier"] = f1_score(
    y_true=y_test,
    y_pred=y_pred_rf,
    average="weighted",
)

## AdaBoost classifier

In [0]:
# AdaBoost configuration.
estimators = 70  # maximum number of boosting rounds
alpha = 0.01     # learning rate: weight applied to each classifier's contribution
random = 1       # fixed seed (was None) so the fitted ensemble is reproducible

# NOTE: this rebinds `model`, which previously referred to the random-forest
# search result.
model = AdaBoostClassifier(
    n_estimators=estimators,
    learning_rate=alpha,
    random_state=random,
)

In [0]:
# Train the AdaBoost classifier; the value returned by fit() is kept in `res`.
res = model.fit(X=X_train, y=y_train)

In [0]:
# Score of the fitted AdaBoost model on the training data itself
# (shown as the cell output; not a held-out estimate).
model.score(X=X_train, y=y_train)

In [0]:
# Evaluate AdaBoost on the held-out split.
y_pred = model.predict(X_test)

# Record the score alongside the other models (it was previously only
# displayed and never stored), then show it as the cell output.
weighted_f1_scores["adaboost classifier"] = f1_score(y_test, y_pred, average="weighted")
weighted_f1_scores["adaboost classifier"]

In [0]:
# Search space for AdaBoost: n_estimators is drawn from the integers 100..199
# (randint's upper bound is exclusive); learning_rate is picked from a fixed list.
param_dist = {
    'n_estimators': randint(100, 200),
    'learning_rate': [0.1, 0.3, 1],
}

# Randomized search: 5 candidates, 3-fold cross-validation, all cores.
# - Both the estimator and the search are seeded so the sampled candidates and
#   the fitted ensembles are reproducible.
# - Candidates are ranked by weighted F1 so that model selection matches the
#   metric reported on the test set, instead of the default accuracy score.
pre_gs_inst = RandomizedSearchCV(
    AdaBoostClassifier(random_state=1),
    param_distributions=param_dist,
    cv=3,
    n_iter=5,
    scoring="f1_weighted",
    random_state=1,
    n_jobs=-1,
)

In [0]:
# Run the AdaBoost randomized search on the training split.
pre_gs_inst.fit(X=X_train, y=y_train)

KeyboardInterrupt: ignored