In [2]:
from preprocess import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

input_features = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']

X_train = training_df[input_features]
y_train = training_df['sentiment']

X_test = test_df[input_features]
y_test = test_df['sentiment']

# Parameter distribution for randomized search over Gradient Boosting parameters
param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform randomized search
random_search = RandomizedSearchCV(GradientBoostingClassifier(), param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print('RandomizedSearch CV best score : {:.4f}\n\n'.format(random_search.best_score_))
print('Parameters that give the best results :','\n\n', (random_search.best_params_))
print('\n\nEstimator that was chosen by the search :','\n\n', (random_search.best_estimator_))

# Create the final model using the best parameters
final_model = GradientBoostingClassifier(n_estimators=random_search.best_params_['n_estimators'],
                                         learning_rate=random_search.best_params_['learning_rate'],
                                         max_depth=random_search.best_params_['max_depth'],
                                         min_samples_split=random_search.best_params_['min_samples_split'],
                                         min_samples_leaf=random_search.best_params_['min_samples_leaf'])
final_model.fit(X_train, y_train)

# Evaluate final model
y_pred = final_model.predict(X_test)
print(accuracy_score(y_test, y_pred))


RandomizedSearch CV best score : 0.8140


Parameters that give the best results : 

 {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.1}


Estimator that was chosen by the search : 

 GradientBoostingClassifier(max_depth=5, min_samples_leaf=2,
                           min_samples_split=10, n_estimators=50)
0.811625


In [3]:
import pickle

with open('exported_models/gradient_boosting.pkl', 'wb') as f:
    pickle.dump(final_model, f)