In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from scipy.stats import uniform

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge


from sklearn.preprocessing import LabelEncoder


In [None]:
# Randomized hyperparameter search for a gradient-boosting classifier on the
# 2020 data, followed by a hold-out AUC and a feature-importance plot.
SEED = 42  # one seed for the split, the estimator and the search so reruns match

# Features: drop the first CSV column, then one-hot encode what remains.
# NOTE(review): the first column is presumably a saved index -- confirm.
X = pd.read_csv("X_2020.csv")
X = X.drop(columns=X.columns[0])
X = pd.get_dummies(data=X, drop_first=True)

# Target: same treatment, then flattened to a 1-D array for scikit-learn.
y = pd.read_csv("y_2020.csv")
y = y.drop(columns=y.columns[0])
y = pd.get_dummies(data=y, drop_first=True)
y = y.values.ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Early stopping (n_iter_no_change) holds out a random validation fraction of
# the training data, so the estimator needs its own seed to be repeatable.
gbc = GradientBoostingClassifier(n_iter_no_change=10, tol=0.001, random_state=SEED)
params = {
    'learning_rate': [0.01, 0.1],
    # NOTE: 'deviance' was renamed to 'log_loss' in scikit-learn 1.1 and removed
    # in 1.3; switch the spelling when running on a newer release.
    'loss': ['deviance', 'exponential'],
    'max_depth': [3, 7, 15],
    'subsample': [1.0, 0.5],
    'min_samples_split': [5, 10, 100],
    'min_samples_leaf': [2, 5, 10],
    'n_estimators': [100, 200, 500],
}
grid_search = RandomizedSearchCV(
    estimator=gbc,
    param_distributions=params,
    n_iter=500,
    scoring="roc_auc",
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=SEED,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been re-trained on
# all of X_train. It is deliberately NOT fitted again here: a second fit would
# make the feature importances below describe a different model than the one
# that produced the AUC.
bestest = grid_search.best_estimator_
preds = bestest.predict_proba(X_test)
print("AUC: " + str(roc_auc_score(y_test, preds[:, 1])))

feat_importances = pd.Series(bestest.feature_importances_, index=X.columns)
ax = feat_importances.nlargest(20).plot(kind='barh')
ax.set_title("Gradient Boosting Classifier Feature Importance 2020")


Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [8]:
from sklearn.model_selection import cross_val_score
def auc_cv(model, my_X, my_y, cv=20):
    """Return the per-fold ROC AUC scores of `model` under cross-validation.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator; it is passed to `cross_val_score`.
    my_X : array-like
        Feature matrix.
    my_y : array-like
        Target vector.
    cv : int or cross-validation splitter, default 20
        Forwarded to `cross_val_score`. The default keeps the original
        20-fold behaviour.

    Returns
    -------
    The array of scores returned by `cross_val_score`, one entry per fold.
    """
    return cross_val_score(model, my_X, my_y, scoring="roc_auc", cv=cv)

In [10]:
# Cross-validated ROC AUC (via auc_cv) on the full 2020 data with fixed
# hyperparameters.
# NOTE(review): the hyperparameters are presumably the winners of the
# randomized search -- confirm.

# Features: drop the first CSV column, then one-hot encode.
X = pd.read_csv("X_2020.csv")
X = pd.get_dummies(data=X.drop(columns=X.columns[0]), drop_first=True)

# Target: same treatment, flattened to a 1-D array.
y = pd.read_csv("y_2020.csv")
y = pd.get_dummies(data=y.drop(columns=y.columns[0]), drop_first=True).values.ravel()

gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    loss='exponential',
    max_depth=7,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=200,
    subsample=1.0,
    n_iter_no_change=10,
    tol=0.001,
)

score = auc_cv(gbc, X, y)
print("Average is:", score.mean())
print(score)

Average is: 0.8562348329841216
[0.84863772 0.84647503 0.84241974 0.86920459 0.86532452 0.85831175
 0.86894805 0.80511379 0.85390698 0.86872151 0.84848282 0.86284623
 0.87493659 0.85730115 0.8536944  0.8417246  0.8581981  0.87241539
 0.8766924  0.85134132]


In [14]:
import statistics
# Spread of the per-fold AUC values held in `score` (assigned by the
# cross-validation cell). statistics.stdev is the *sample* standard deviation
# (n - 1 denominator), not the population one.
print("Standard deviation is:",statistics.stdev(score))

Standard deviation is: 0.016083861443667972


In [15]:
# Cross-validated ROC AUC (via auc_cv) on the full 1994 data, using the same
# fixed hyperparameters as the 2020 run so the two results are comparable.

# Features: drop the first CSV column, then one-hot encode.
X = pd.read_csv("X_1994.csv")
X = pd.get_dummies(data=X.drop(columns=X.columns[0]), drop_first=True)

# Target: same treatment, flattened to a 1-D array.
y = pd.read_csv("y_1994.csv")
y = pd.get_dummies(data=y.drop(columns=y.columns[0]), drop_first=True).values.ravel()

gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    loss='exponential',
    max_depth=7,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=200,
    subsample=1.0,
    n_iter_no_change=10,
    tol=0.001,
)

score = auc_cv(gbc, X, y)
print("Average is:", score.mean())
print(score)
# Sample standard deviation (n - 1 denominator) of the fold scores.
print("Standard deviation is:", statistics.stdev(score))

Average is: 0.9268456213860293
[0.9265332  0.92776085 0.92502415 0.9238027  0.92201279 0.9256535
 0.9283095  0.92373919 0.92289567 0.93425454 0.92097453 0.92533646
 0.92997921 0.92866801 0.92837213 0.92756213 0.92944637 0.93505085
 0.92443483 0.92710181]
Standard deviation is: 0.0036510666759140896
