In [1]:
import warnings
warnings.filterwarnings("ignore")

import xgboost
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, f1_score, auc, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_score

from majority import MajorityVoteClassifier

from pipelines import extra_features, pipe, pipe2


In [2]:
data = pd.read_csv('train.csv')
X = data.drop(['cost_category'], axis=1)
y = data['cost_category']

<IPython.core.display.Javascript object>

In [3]:
X = pipe2(X)

le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
svm = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LinearSVC(penalty='l2', loss='squared_hinge', 
                     C=0.1, class_weight='balanced',
                     multi_class='ovr', random_state=123))
])


fr  = RandomForestClassifier(n_estimators=400,
                             criterion='gini', 
                             random_state=123,
                             max_depth=6,
                             n_jobs=-1)


xgb = XGBClassifier(n_estimators=500,
                          max_depth=8, n_jobs=-1)

lr = Pipeline([
    ('sc', StandardScaler()),
    ('lr', LogisticRegression(penalty='l2', C=0.1,
                              solver='lbfgs', multi_class='ovr',
                              max_iter=10000))
])

sg = Pipeline([
    ('scaler', StandardScaler()),
    ('sg', SGDClassifier(loss='log_loss',
                         eta0=0.01,
                         learning_rate='optimal'))
])

cat = CatBoostClassifier(iterations=100,
                        max_depth=8,
                        learning_rate=0.1,
                        logging_level='Silent'
                        )


vt = VotingClassifier(estimators=[('sg', sg), ('fore', fr), ('cat', cat), ('boost', xgb), ('reg' , lr)], 
                     voting = 'soft')

mv = MajorityVoteClassifier(classifiers=[ fr, cat, xgb, lr, sg],
                             vote='probability',
                             weights=[ 0.5, 0.67, 0.7, 0.43, 0.6])


In [9]:
labels = ['Linear SVC', 'Random Forest', "CatBoost", 'XGBoost', 'Logistic Regression', 'SGD Classifier', 'Majority','Ensemble']
for clf, label in zip ([svm, fr, cat, xgb, lr, sg, mv, vt], labels):
    
    fold = StratifiedKFold(n_splits=3)
    auc_scores = []
    f1_scores = []
    
    for f, (train_idx, valid_idx) in enumerate(fold.split(X, y)):
        X_train, X_test = X[train_idx], X[valid_idx]
        y_train, y_test = y[train_idx], y[valid_idx]
        
        clf.fit(X_train, y_train)
        
        if clf != svm:
            preds = clf.predict_proba(X_test)
            auc = roc_auc_score(y_test, preds, multi_class='ovr')
            auc_scores.append(auc)
            
       
        y_pred =clf.predict(X_test)
        f1s = f1_score(y_test, y_pred, average='weighted')
        f1_scores.append(f1s)
       
    
    print("ROC AUC: %.3f (+/- %.3f) [%s]" % (np.mean(auc_scores), 
                                            np.std(auc_scores), label))
              
    print("F1 Score: %.3f (+/- %.3f) [%s]" % (np.mean(f1_scores), 
                                            np.std(f1_scores), label))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: nan (+/- nan) [Linear SVC]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.487 (+/- 0.002) [Linear SVC]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.808 (+/- 0.005) [Random Forest]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.407 (+/- 0.008) [Random Forest]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.807 (+/- 0.002) [CatBoost]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.352 (+/- 0.094) [CatBoost]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.717 (+/- 0.001) [XGBoost]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.210 (+/- 0.049) [XGBoost]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.807 (+/- 0.004) [Logistic Regression]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.486 (+/- 0.003) [Logistic Regression]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.781 (+/- 0.004) [SGD Classifier]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.457 (+/- 0.012) [SGD Classifier]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.806 (+/- 0.011) [Majority]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.364 (+/- 0.079) [Majority]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ROC AUC: 0.808 (+/- 0.010) [Ensemble]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1 Score: 0.419 (+/- 0.050) [Ensemble]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, 
                                                    stratify=y, random_state=42)

print("10-Fold ROC-AUC Cross Validation Scores:\n ")
for clf, label in zip ([svm, fr, cat, xgb, lr, sg, mv, vt], labels):
    if clf != svm:
        scores = cross_val_score(clf,
                                 X_train, y_train,
                                 cv=10,
                                 scoring='roc_auc_ovr')
        
        print("ROC AUC: %.3f (+/- %.3f) [%s]" % (scores.mean(), 
                                            scores.std(), label))


10-Fold ROC-AUC Cross Validation Scores:
 
ROC AUC: 0.810 (+/- 0.008) [Random Forest]
ROC AUC: 0.818 (+/- 0.009) [CatBoost]
ROC AUC: 0.781 (+/- 0.009) [XGBoost]
ROC AUC: 0.809 (+/- 0.009) [Logistic Regression]
ROC AUC: 0.778 (+/- 0.010) [SGD Classifier]
ROC AUC: 0.816 (+/- 0.008) [Majority]
ROC AUC: 0.816 (+/- 0.008) [Ensemble]


In [12]:
print("10-Fold Accuracy Cross Validation Scores:\n ")
for clf, label in zip ([svm, fr, cat, xgb, lr, sg, vt], labels):
   
    scores = cross_val_score(clf,
                             X_train, y_train,
                             cv=10,
                             scoring='accuracy')

    
    print("ROC AUC: %.3f (+/- %.3f) [ %s ]" % (scores.mean(), 
                                        scores.std(), label))

10-Fold Accuracy Cross Validation Scores:
 
ROC AUC: 0.511 (+/- 0.012) [ Linear SVC ]
ROC AUC: 0.498 (+/- 0.010) [ Random Forest ]
ROC AUC: 0.540 (+/- 0.015) [ CatBoost ]
ROC AUC: 0.480 (+/- 0.008) [ XGBoost ]
ROC AUC: 0.535 (+/- 0.009) [ Logistic Regression ]
ROC AUC: 0.506 (+/- 0.015) [ SGD Classifier ]
ROC AUC: 0.534 (+/- 0.007) [ Majority ]


In [13]:
clfs = [fr, cat, xgb, lr, sg, mv, vt] 
labels = ['Random Forest', 'XGBoost', 'Logistic Regression', 'SGD Classifier', 'Ensemble']
colors = ['black', 'orange', 'blue', 'red', 'brown']
lins = [':', '--', '-*', '-', '-.']

for clf, l, clc, ls in zip(clfs, labels, colors, lins):
    clf.fit(X_train, y_train)
    
    preds = clf.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_score(y_test, y_score=preds, multi_class='ovr')
    roc_auc = auc(x=fpr, y=tpr)
    
    plt.plot(x=fpr, y=tpr, 
             linestyle=ls, 
             color=clc,
             label=' %s [auc = %.3f]' % (label, roc_auc))

plt.plot([0, 1], [0, 1],
        color='gray', linestyle='--',
        linewidth=3, label='Random Guessing [auc = 0.50]')

plt.legend(loc='lower right')
plt.xlim([-1, 1.1])
plt.ylim([-1, 1.1])
plt.grid(alpha=0.35)
plt.xlabel('False Positive Rate [FPR]')
plt.ylabel('True Positive Rate [TPR]')
plt.show();

NameError: name 'roc_score' is not defined

In [28]:
roc_curve??