In [102]:
import pandas as pd
import csv
import numpy as np
from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import xgboost as xgb
#from deepstack.ensemble import StackEnsemble
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

In [103]:
def get_col_types(data, max_nominal_index=83):
    """Split the feature columns of ``data`` into nominal, binary and numeric groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified. A column named ``'CLASS'`` is
        never placed in the binary or numeric group.
    max_nominal_index : int, default 83
        Columns named ``'A1'`` .. ``'A<max_nominal_index>'`` are treated as
        nominal when they are present in ``data``.

    Returns
    -------
    nominal_cols : set of str
        Present columns named ``'A<k>'`` with ``1 <= k <= max_nominal_index``.
    nominal_cols_indexes : set of int
        ``k - 1`` for every nominal column ``'A<k>'``. These are derived from
        the column *name*, not from the column's position in ``data``.
    binary_cols : set of str
        Non-``CLASS`` columns whose non-missing values are all 0 or 1. A
        nominal column can also appear here; a column with no non-missing
        values qualifies as well (the check is vacuously true).
    numeric_cols : set of str
        Every remaining non-``CLASS`` column.
    """
    nominal_cols = set()
    nominal_cols_indexes = set()
    for index in range(max_nominal_index):
        column_name = 'A' + str(index + 1)
        if column_name in data.columns:
            nominal_cols.add(column_name)
            nominal_cols_indexes.add(index)

    feature_cols = [col for col in data.columns if col != 'CLASS']
    binary_cols = {
        col for col in feature_cols
        if np.isin(data[col].dropna().unique(), [0, 1]).all()
    }
    # Whatever is neither nominal nor binary is handled as numeric.
    numeric_cols = set(feature_cols) - nominal_cols - binary_cols
    return nominal_cols, nominal_cols_indexes, binary_cols, numeric_cols

def fill_missing_data(data, nominal_cols, binary_cols, numeric_cols, dist_dict):
    """Impute the missing values of ``data`` in place.

    Nominal and binary columns: each missing entry is replaced by a value
    drawn at random (``np.random.choice``) according to the relative
    frequencies of the column's observed values.
    Numeric columns: missing entries are replaced by the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; modified in place.
    nominal_cols, binary_cols : set
        Column names imputed by sampling.
    numeric_cols : iterable
        Column names imputed with the mean.
    dist_dict : dict
        Cache of per-column imputation statistics, modified in place. A column
        not yet present gets its statistic computed from ``data`` and stored;
        a column already present reuses the stored statistic, so a later call
        on another frame is imputed with the earlier frame's statistics.

    Returns
    -------
    None
    """
    # NOTE(review): a cached entry is a frequency Series for sampled columns and
    # a scalar mean for numeric ones; this assumes a column falls in the same
    # group on every call that shares ``dist_dict`` - confirm with the caller.
    for column_name in nominal_cols.union(binary_cols):
        if column_name not in dist_dict:
            dist_dict[column_name] = data[column_name].dropna().value_counts(normalize=True)
        distribution = dist_dict[column_name]
        missing = data[column_name].isnull()
        missing_count = int(missing.sum())
        # Nothing to sample for a complete column.
        if missing_count:
            data.loc[missing, column_name] = np.random.choice(
                distribution.index, size=missing_count, p=distribution.values)
    for column_name in numeric_cols:
        if column_name not in dist_dict:
            dist_dict[column_name] = data[column_name].mean()
        # Assign the result back: ``data[col].fillna(..., inplace=True)`` works on
        # a temporary Series and does not update ``data`` under Copy-on-Write.
        data[column_name] = data[column_name].fillna(dist_dict[column_name])
        
def transform_categorical_columns(data, nominal_cols, dummies=False, labelencoder=False):
    """Encode the nominal feature columns and the ``CLASS`` target of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. When ``dummies`` is False it is modified in place and
        returned; when ``dummies`` is True a new frame is returned.
    nominal_cols : iterable of str
        Names of the nominal columns to encode.
    dummies : bool, default False
        One-hot encode ``nominal_cols`` with ``pd.get_dummies``.
    labelencoder : bool, default False
        Replace each nominal column by its ``LabelEncoder`` codes. Has no
        effect when ``dummies`` is True, because the nominal columns have
        already been replaced by their indicator columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. If ``data`` has a ``CLASS`` column it is
        label-encoded and present in the result.

    Notes
    -----
    The encoder is re-fitted on the data of each call, so the integer codes
    of two frames agree only if both contain the same set of values.
    """
    target_encoder = LabelEncoder()
    if dummies:
        # ``columns`` must be an ordered list-like (a set is not accepted as an
        # indexer by recent pandas); sorting keeps the output order stable.
        new_Data = pd.get_dummies(
            data.drop(columns="CLASS", errors="ignore"),
            columns=sorted(nominal_cols),
        )
    else:
        new_Data = data
    if labelencoder and not dummies:
        for column_name in nominal_cols:
            # Assign the raw array: wrapping it in a Series would align on a new
            # 0..n-1 index and produce NaN for frames with any other index.
            new_Data[column_name] = target_encoder.fit_transform(new_Data[column_name])
    if 'CLASS' in data.columns:
        # Read the target from ``data``: it is not part of the one-hot frame.
        new_Data["CLASS"] = target_encoder.fit_transform(data["CLASS"])
    return new_Data
    
def read_data(filename, dist_dict, dummies=False, labelencoder=False,
              data_dir='/kaggle/input/bgutreatmentoutcome/'):
    """Load a CSV file, impute its missing values and encode its categorical columns.

    Parameters
    ----------
    filename : str
        File name appended to ``data_dir``.
    dist_dict : dict
        Imputation cache shared between calls (updated in place by
        ``fill_missing_data``). When it is empty, columns that are entirely
        missing are dropped. When it is non-empty, every column that has no
        entry in it is dropped instead, so the frame is restricted to columns
        for which statistics were already recorded.
    dummies, labelencoder : bool, default False
        Forwarded to ``transform_categorical_columns``.
    data_dir : str, default '/kaggle/input/bgutreatmentoutcome/'
        Directory prefix; it is concatenated with ``filename`` as-is, so it
        must end with a path separator.

    Returns
    -------
    data : pandas.DataFrame
        The imputed and encoded frame.
    nominal_cols_indexes : set of int
        Second value returned by ``get_col_types`` for the loaded frame.
    """
    data = pd.read_csv(data_dir + filename)
    if len(dist_dict) > 0:
        # Keep only the columns that already have recorded statistics.
        unknown_columns = [name for name in data.columns if name not in dist_dict]
        data = data.drop(columns=unknown_columns)
    else:
        data = data.dropna(how='all', axis=1)
    nominal_cols, nominal_cols_indexes, binary_cols, numeric_cols = get_col_types(data)
    fill_missing_data(data, nominal_cols, binary_cols, numeric_cols, dist_dict)
    data = transform_categorical_columns(data, nominal_cols, dummies, labelencoder)
    return data, nominal_cols_indexes

In [104]:
def write_prediction(pred):
    """Write predictions to a timestamped CSV file in the working directory.

    The file is named ``test_pred_<dd_mm_YYYY_HH_MM_SS>.csv`` and holds a
    header row ``Id,ProbToYes`` followed by one row per prediction, with ids
    numbered from 1 in iteration order.

    Parameters
    ----------
    pred : iterable
        Predicted values, one per output row.
    """
    current_time = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    with open('test_pred_' + current_time + '.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Id', 'ProbToYes'])
        # enumerate walks the values by position; ``pred[index]`` would be a
        # label lookup on a pandas Series whose index is not 0..n-1.
        writer.writerows([row_id, value] for row_id, value in enumerate(pred, start=1))

In [105]:
def calculate_auc(pred, actual):
    """Return the area under the ROC curve of ``pred`` scored against ``actual``.

    The curve is built with ``metrics.roc_curve`` using label 1 as the
    positive class and integrated with ``metrics.auc``.
    """
    roc_points = metrics.roc_curve(actual, pred, pos_label=1)
    false_positive_rate, true_positive_rate = roc_points[0], roc_points[1]
    return metrics.auc(false_positive_rate, true_positive_rate)

In [106]:
def calculateAUC(label, classifier, X, Y):
    """Print evaluation figures of a fitted ``classifier`` on ``(X, Y)``.

    Printed, in order: ``label`` followed by a colon, ``classifier.score``,
    the ROC AUC of the hard predictions, and the ROC AUC of the second
    column of ``classifier.predict_proba``.

    Returns
    -------
    The output of ``classifier.predict(X)``.
    """
    predictions = classifier.predict(X)
    print(label + ':')
    score_value = classifier.score(X, Y)
    print('\tScore: ' + str(score_value))
    auc_of_labels = roc_auc_score(Y, predictions)
    print('\tAUC: ' + str(auc_of_labels))
    positive_scores = classifier.predict_proba(X)[:, 1]
    auc_of_scores = roc_auc_score(Y, positive_scores)
    print('\tAUC PROB: ' + str(auc_of_scores))
    return predictions

In [107]:
# Candidate models. Every stochastic scikit-learn estimator gets an explicit
# random_state so that repeated runs give the same scores and predictions.
RANDOM_STATE = 42

gradientBoostingClassifier = GradientBoostingClassifier(
    n_estimators=3000,
    max_leaf_nodes=4,
    max_depth=None,
    random_state=2,
    min_samples_split=200,
    learning_rate=0.01,
    subsample=0.5,
)
adaBoostClassifier = AdaBoostClassifier(random_state=RANDOM_STATE)
randomForestClassifier = RandomForestClassifier(random_state=RANDOM_STATE)
decisionTreeClassifier = DecisionTreeClassifier(min_samples_split=200, random_state=RANDOM_STATE)

# Hyper-parameter grid for the XGBoost search below.
# The search is exhaustive: 6 * 8 * 4 * 5 * 4 = 3840 combinations, each fitted 3 times (cv=3).
parameters = {
    "eta": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}

xgbClassifier = GridSearchCV(
    xgb.XGBClassifier(),
    parameters,
    n_jobs=4,
    scoring="neg_log_loss",
    cv=3,
)

catBoostClassifier = CatBoostClassifier(
    iterations=10000, learning_rate=0.01, depth=2, subsample=0.5, verbose=False
)

# probability=True is needed so that the SVC exposes predict_proba.
svc = SVC(
    kernel='rbf',
    class_weight='balanced',
    gamma=0.01,
    C=1e3,
    probability=True,
    random_state=RANDOM_STATE,
)

# Base learners of the stacked model; their outputs feed a logistic regression.
estimators = [
    ('rf', RandomForestClassifier(verbose=0, n_estimators=200, max_depth=15, n_jobs=20,
                                  min_samples_split=30, random_state=RANDOM_STATE)),
    ('etr', ExtraTreesClassifier(verbose=0, n_estimators=200, max_depth=10, n_jobs=20,
                                 min_samples_split=20, random_state=RANDOM_STATE)),
    ('gbm', gradientBoostingClassifier),
    ('catbm', catBoostClassifier),
]

stackingClassifier = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

In [108]:
# Model used by the cells below; assign a different estimator here to switch.
# Earlier alternative, kept for reference:
#classifier = xgb.XGBClassifier(n_estimators= 2000, max_leaf_nodes= 4, random_state= 2, min_samples_split= 500, learning_rate= 0.03, subsample= 0.5)
classifier = stackingClassifier

In [109]:
# dist_dict starts empty, is filled while reading the training file, and is then
# reused for the test file: the test frame is restricted to the columns recorded
# for training and imputed with the training statistics.
dist_dict = {}
train_data, nominal_cols_indexes = read_data('train.CSV', dist_dict, labelencoder=True)
test_data, _ = read_data('test.CSV', dist_dict, labelencoder=True)

# Hold out 30% of the labelled rows for validation (fixed seed for a repeatable split).
X_train, X_validate, Y_train, Y_validate = train_test_split(
    train_data.loc[:, train_data.columns != 'CLASS'],
    train_data["CLASS"],
    test_size=0.3,
    random_state=42,
)

  if (await self.run_code(code, result,  async_=asy)):


In [110]:
# Cross-validation is currently switched off. The commented-out lines below, when
# enabled, score `classifier` with 5-fold ROC-AUC cross-validation on X_train / Y_train.
print('not doing cross val')
#num_of_folds = 5
#scores = cross_val_score(classifier, X_train, Y_train, cv=num_of_folds, scoring='roc_auc')
#print(str(num_of_folds) + ' - fold scores:')
#print(scores)
#print('avg = ' +str(np.mean(scores)))

doing cross val


KeyboardInterrupt: 

Fit the chosen classifier on the training split, report its train and validation scores, and write the predictions for the test set:

In [None]:
# Fit on the training split only, so the validation figures are measured on rows
# the model has not seen.
classifier.fit(X_train, Y_train)
calculateAUC('Train', classifier, X_train, Y_train)
calculateAUC('Validate', classifier, X_validate, Y_validate)

# Present the test features in exactly the column order used for fitting; a
# column missing from the test frame fails here with a KeyError.
test_pred = classifier.predict_proba(test_data[X_train.columns])[:, 1]
write_prediction(test_pred)