In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import time
from joblib import dump, load

In [None]:
def train_and_test(X, y, X_test, y_test, model, save = False):
    """Fit ``model`` on the training data and print hold-out diagnostics.

    Parameters
    ----------
    X, y
        Training features and labels; passed unchanged to ``model.fit``.
    X_test, y_test
        Evaluation features and labels.
    model
        Estimator exposing ``fit`` and ``predict``.
    save : bool, default False
        When true, the fitted model is written to
        '../server/test_rf.joblib' with ``joblib.dump``.

    Returns
    -------
    None
        Accuracy, feature importances (when the estimator has them), the
        confusion matrices and the elapsed time are printed.
    """
    start_time = time.time()
    model.fit(X, y)
    test_pred = np.asarray(model.predict(X_test))

    # A column-vector y_test of shape (n, 1) would broadcast against 1-D
    # predictions into an (n, n) comparison, so flatten that case first.
    y_true = np.asarray(y_test)
    if y_true.ndim == 2 and y_true.shape[1] == 1 and test_pred.ndim == 1:
        y_true = y_true.ravel()

    # Accuracy is computed once with numpy; the previous version printed the
    # same number three times and relied on `accuracy_score`, which is only
    # imported in a later cell (NameError on a fresh kernel).
    accuracy = float(np.mean(test_pred == y_true))
    print('acc: ' + str(accuracy))

    # Not every estimator exposes importances (e.g. DummyClassifier).
    if hasattr(model, 'feature_importances_'):
        print('Feature importances', model.feature_importances_)

    print('Confusion Matrix:', multilabel_confusion_matrix(y_true, test_pred))
    if save:
        dump(model, '../server/test_rf.joblib')
    print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# NOTE(review): `train` is not defined in any cell visible in this notebook,
# so this line raises NameError on a fresh kernel unless it is defined
# elsewhere. Later cells also reuse the name `train` as a loop variable.
X, y, X_test, y_test = train()

In [None]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent training label.
dm = DummyClassifier(strategy="most_frequent")
dm.fit(X, y)
test_pred = dm.predict(X_test)

# Flatten both sides before comparing so an (n, 1) label array cannot
# broadcast against (n,) predictions into an (n, n) mask.
matches = np.ravel(test_pred) == np.ravel(y_test)
acc = np.ravel(test_pred)[matches]

print("BASELINE: " + str(len(acc) / len(test_pred)))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees run: 70 trees of depth <= 12, fixed seed, all CPU cores.
# NOTE(review): the variable is called `model_dt` although it holds an
# ExtraTreesClassifier; the next cell rebinds the same name.
model_dt = ExtraTreesClassifier(n_estimators=70, max_depth=12,
                                 random_state=0, n_jobs = -1)
# Labels are flattened with .ravel() before being handed to the helper.
train_and_test(X, y.ravel(), X_test, y_test.ravel(), model_dt)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest run with the same hyper-parameters as the extra-trees cell.
model_dt = RandomForestClassifier(n_estimators=70, max_depth=12,
                                 random_state=0, n_jobs = -1)
# Flatten the labels exactly like the extra-trees cell does, so both runs see
# identically shaped targets. The trailing True persists the fitted forest
# to '../server/test_rf.joblib'.
train_and_test(X, y.ravel(), X_test, y_test.ravel(), model_dt, True)

In [None]:
from sklearn.metrics import accuracy_score

# From here on: automated model comparison for the poster

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import random
%matplotlib inline
import matplotlib.pyplot as plt
import time

# Widen the notebook page container to the full browser width.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Read Data

In [None]:
# Load the combined train data set.
# NOTE(review): the path ends in .csv but is read with compression='zip';
# presumably the file is a zipped CSV -- confirm.
path =  'data/combinedData/alltrains2019.csv'
dataset = pd.read_csv(path, index_col=False, compression='zip', engine='c')
# Last expression of the cell: shows the available column names.
dataset.columns

### Add Dates

In [None]:
# Drop the weather columns; they are not used as features below.
# Rebinding instead of `inplace=True`, plus errors='ignore', keeps the cell
# re-runnable: a second execution no longer fails on already-removed columns.
dataset = dataset.drop(
    columns=['temperature_c',
             'air_pressure_hpa', 'relative_humidity', 'dew_point_c',
             'wind_speed_kmh', 'weather_condition'],
    errors='ignore',
)

# Parse the date column. `pd.to_datetime` replaces
# `astype('datetime64[D]')`, whose day resolution newer pandas releases do
# not support; only month and weekday are derived, so day precision suffices.
date = pd.to_datetime(dataset['date'])
dataset['month'] = date.dt.month
dataset['dayofweek'] = date.dt.dayofweek
# NOTE(review): 'zeit' is copied verbatim as the hour feature -- presumably it
# already holds the hour of day; confirm against the data.
dataset['hour'] = dataset['zeit']
# Remove every row that still has a missing value in any column.
dataset = dataset.dropna()


In [None]:
def balance(dset, label, random_state):
    """Return a class-balanced, shuffled copy of ``dset``.

    Rows are split by whether ``dset[label]`` equals True; the larger group is
    randomly down-sampled to the size of the smaller one, both groups are
    concatenated and the result is shuffled.

    Parameters
    ----------
    dset : pandas.DataFrame
        Data to balance. It is not modified.
    label : str
        Name of the boolean target column.
    random_state : int
        Seed used for both the down-sampling and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        Balanced rows with a fresh 0..n-1 index.
    """
    # The masks must come from `dset` itself. The previous version indexed the
    # global `dataset`, which is wrong (or fails) for any other frame.
    is_positive = dset[label] == True
    minor = dset[is_positive]
    major = dset[dset[label] == False]

    # Down-sample whichever group is larger. When the True group is the
    # minority (the expected case) this is exactly the original behaviour;
    # the other branch avoids sampling more rows than exist.
    if len(minor) <= len(major):
        major = major.sample(n=len(minor), random_state=random_state)
    else:
        minor = minor.sample(n=len(major), random_state=random_state)

    balancedset = pd.concat([minor, major], ignore_index=True, sort=False)
    # Sampling every row without replacement is a shuffle.
    balancedset = balancedset.sample(n=len(balancedset), random_state=random_state).reset_index(drop=True)
    return balancedset

## Split Data

In [None]:
# A fresh seed per run; it is stored in `infos` later so a run can be repeated.
random_state = random.randint(1, 1000)
label = 'isadelay'
# Features used by every model. Deliberately excluded: 'relative_humidity',
# 'dew_point_c', 'air_pressure_hpa', 'temperature_c', 'trainno',
# 'weather_condition', 'type', 'bhf', 'wind_speed_kmh'.
feat_labels = [
    'month',
    'dayofweek',
    'hour',
    'time_since_first_station',
    'station_number',
    'lat',
    'lon',
    'stay_time',
    'time_since_last_station',
    'total_time',
    'delta_lon',
    'delta_lat',
]

# The previous version also built a shuffled copy of the whole data set that
# was never used and immediately deleted; train_test_split shuffles anyway.
X = dataset[feat_labels]
# NOTE(review): the target here is "arrival delay > 5", while the balanced
# split below uses the `label` column -- confirm both mean the same thing.
y = dataset['adelay'] > 5 #dataset[label]
print('SplitDataset')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

# Same 90/10 split on the class-balanced data.
dataset_bal = balance(dataset, label, random_state)
X_bal = dataset_bal[feat_labels]
y_bal = dataset_bal[label]
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_bal, y_bal, test_size=0.1, random_state=random_state)

### Hold out one group (e.g. one station or train) as the test set

In [None]:
def held_out_label(df, feat_labels, label, held_out,held_out_var):
    """Split ``df`` into train/test parts by holding out one group.

    Rows whose ``held_out_var`` column equals ``held_out`` form the test set;
    rows where it differs form the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    feat_labels : list of str
        Feature columns to return.
    label : str
        Target column to return.
    held_out
        Value of ``held_out_var`` that marks the test rows.
    held_out_var : str
        Column that defines the groups.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Filter each side once instead of twice; the unused full-frame `X`/`y`
    # selections of the previous version are gone.
    test_rows = df[df[held_out_var] == held_out]
    train_rows = df[df[held_out_var] != held_out]
    return (train_rows[feat_labels], test_rows[feat_labels],
            train_rows[label], test_rows[label])

In [None]:
# 51.517899 = Dortmund Hbf
# The station is identified by exact equality on its latitude value.
held_out = 51.517899
held_out_var = 'lat'
# These assignments REPLACE the random 90/10 splits created in the previous
# cell: from here on X_train/X_test (and the *_bal variants) mean
# "all other stations" / "the held-out station".
# NOTE(review): the target is now the `label` column, whereas `y` from the
# split cell is `adelay > 5` -- confirm the two definitions agree.
X_train, X_test, y_train, y_test = held_out_label(dataset, feat_labels, label, held_out, held_out_var)
X_train_bal, X_test_bal, y_train_bal,y_test_bal = held_out_label(dataset_bal, feat_labels, label, held_out, held_out_var)

## ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

def plot_roc_auc(actual, preds):
    """Plot the ROC curve and show the AUC in the title.

    Parameters
    ----------
    actual
        True binary labels.
    preds
        2-D array of per-class scores; column 1 is used as the score of the
        positive class.
    """
    fpr, tpr, thresholds = roc_curve(actual, preds[:,1])
    plt.plot(fpr, tpr ,'r')
    # Diagonal reference line (fpr == tpr).
    plt.plot([0,1],[0,1],'b')
    # Axis labels previously read "Posetive" (typo).
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title('AUC: {}'.format(auc(fpr,tpr)))
    plt.show()

## Testing

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier

'''Define your models here'''
# Registry of estimators to compare, keyed by display name. All share
# max_depth=12 and the notebook-wide random_state.
models = {}
#models['RandomForest'] = RandomForestClassifier(n_estimators=len(feat_labels)-2, max_depth=12, n_jobs=-1,random_state=random_state),
models['DecisionTree'] = DecisionTreeClassifier(max_depth = 12, random_state=random_state)
models['RandomForest'] = RandomForestClassifier(n_estimators=64, max_depth = 12, n_jobs=-1,random_state=random_state)
models['ExtraTrees'] = ExtraTreesClassifier(n_estimators=64,max_depth = 12, n_jobs=-1,random_state=random_state)

'''Calculate Baseline Infos'''
# ZeroR = always predict the most frequent training class; computed for the
# current (unbalanced) split and for the balanced split.
zeroR = DummyClassifier(strategy="most_frequent").fit( X_train,y_train).predict(X_test)
zeroR_bal = DummyClassifier(strategy="most_frequent").fit( X_train_bal,y_train_bal).predict(X_test_bal)
# `infos` collects every metric of this run; accuracy is 1 - error rate.
infos = {'random_state': random_state,
         'ZeroR': (1 - ((y_test != zeroR).sum() / X_test.shape[0])),
         'ZeroR_bal': (1 - ((y_test_bal != zeroR_bal).sum() / X_test_bal.shape[0]))
        }

In [None]:
# Evaluate every registered model: 5-fold cross-validation on (X, y), then a
# fit/predict on the current train/test split, then the same on the balanced
# split. All results are stored in `infos` under keys derived from the name.
for model in models:
    print(model)
    
    # Disabled: stratified cross-validation variant.
    if False:
        scores = cross_val_score(models[model], X, y, cv=StratifiedKFold(n_splits=5))
        infos[model + "_cross_val_strat"] = "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
        
    t0 = time.time()    
    # NOTE(review): cross-validation uses X/y from the split cell, while the
    # fit below uses X_train/y_train, which the held-out cell may have
    # replaced -- the two numbers may describe different targets.
    scores = cross_val_score(models[model], X, y, cv=5)    
    infos[model + "_cross_val"] = "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
    print(scores)
    models[model].fit(X_train ,y_train)
    
    # The printed duration covers cross-validation plus the single fit.
    t1 = time.time()
    print(t1-t0)
    
    model_classified = models[model].predict(X_test)
    
    # Accuracy as 1 - error rate.
    infos[model] = 1 - ((y_test != model_classified).sum() / X_test.shape[0])

    infos[model + '_matrix'] = confusion_matrix(y_test, model_classified, labels=[0,1])
    

    '''Balanced'''
    
    # Disabled: cross-validation on the balanced data.
    if False:
        scores = cross_val_score(models[model], X_bal, y_bal, cv=StratifiedKFold(n_splits=5))
        infos[model + "_cross_val_strat_bal"] = "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
        
        scores = cross_val_score(models[model], X_bal, y_bal, cv=5)    
        infos[model + "_cross_val_bal"] = "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
    
    # The same estimator object is re-fitted here, so after the loop every
    # entry in `models` is left trained on the BALANCED training data.
    models[model].fit(X_train_bal ,y_train_bal)
    model_classified = models[model].predict(X_test_bal)
    infos[model + "_bal"] = 1 - ((y_test_bal != model_classified).sum() / X_test_bal.shape[0])

    infos[model + '_matrix_bal'] = confusion_matrix(y_test_bal, model_classified, labels=[0,1])
    
    #for feature in zip(feat_labels, models[model].feature_importances_):
    #   print(feature)

In [None]:
# Display every metric collected so far.
infos

In [None]:
# NOTE(review): exact duplicate of the previous cell; candidate for removal.
infos

In [None]:
# ROC curves on the current test split: majority-class baseline first, then
# the two ensembles.
# NOTE(review): the evaluation loop last fitted each estimator on the balanced
# training data, so these curves score balanced-trained models on X_test;
# the DecisionTree model is not plotted.
plot_roc_auc(y_test, DummyClassifier(strategy="most_frequent").fit( X_train,y_train).predict_proba(X_test))
plot_roc_auc(y_test, models['RandomForest'].predict_proba(X_test))
plot_roc_auc(y_test, models['ExtraTrees'].predict_proba(X_test))

In [None]:
# Leave-one-train-out evaluation: for every train number, fit on all other
# trains and record the accuracy on the held-out one.
infos['train_acc']={}
for model in models:
    print(model)
    for train in dataset['trainno'].unique():
        print(train,end=', ')
        # Overwrites the notebook-wide X_train/X_test/y_train/y_test.
        X_train, X_test, y_train, y_test = held_out_label(dataset, feat_labels, label, train, 'trainno')
        models[model].fit(X_train ,y_train)
        model_classified = models[model].predict(X_test)
        # NOTE(review): the key is only the train number, so each model
        # overwrites the previous model's result; after the loop only the
        # last model's accuracies remain.
        infos['train_acc'][train] = 1 - ((y_test != model_classified).sum() / X_test.shape[0])
    
    #infos[train + '_matrix'] = confusion_matrix(y_test, model_classified, labels=[0,1])

In [None]:
# Per-train ratio of delayed to non-delayed rows, as a reference for the
# held-out accuracies.
# Fixes over the previous version:
#  * it reset infos['train_acc'], discarding the accuracies just computed;
#  * it read from `df`, which the split cell deletes;
#  * it filtered on held_out_var/held_out instead of the loop's train number,
#    so every train would have received the same value.
# NOTE(review): the 'isadelay5' column is assumed to exist -- confirm. A train
# with no non-delayed rows makes the division fail.
train_base={}
for train in dataset['trainno'].unique():
    print(train,end=', ')
    test = dataset[dataset['trainno'] == train]
    train_base[train] = len(test[test['isadelay5'] == True]) / len(test[test['isadelay5'] == False])

In [None]:
# Mean held-out accuracy over all trains. `train_acc` was never defined as a
# standalone name; the values live in infos['train_acc'].
train_acc = infos['train_acc']
np.mean(list(train_acc.values()))

In [None]:
# Mean of the per-train reference ratios stored in `train_base`.
sum(list(train_base.values())) / len(list(train_base.values()))

In [None]:
# Number of distinct train numbers in the data set.
len(dataset['trainno'].unique())

In [None]:
# Variance of the held-out accuracies across trains. Read them from `infos`;
# a standalone `train_acc` name is not defined anywhere else.
np.var(list(infos['train_acc'].values()))

In [None]:
# Leave-one-station-out evaluation ('bhf' column): fit on all other stations
# and record the accuracy on the held-out one.
infos['bhf_acc']={}
for model in models:
    print(model)
    # The loop variable is still called `train` but holds a 'bhf' value here.
    for train in dataset['bhf'].unique():
        print(train,end=', ')
        # Overwrites the notebook-wide X_train/X_test/y_train/y_test.
        X_train, X_test, y_train, y_test = held_out_label(dataset, feat_labels, label, train, 'bhf')
        models[model].fit(X_train ,y_train)
        model_classified = models[model].predict(X_test)
        # NOTE(review): keyed by station only, so each model overwrites the
        # previous model's result.
        infos['bhf_acc'][train] = 1 - ((y_test != model_classified).sum() / X_test.shape[0])

In [None]:
# Mean held-out accuracy over all stations. `bhf_acc` was never defined as a
# standalone name; the values live in infos['bhf_acc'].
bhf_acc = infos['bhf_acc']
np.mean(list(bhf_acc.values()))

In [None]:
# Show the first rows of the prepared data set.
dataset.head()

In [None]:
# Correlation matrix of the features (plus start/destination coordinates),
# shown with German display names and a coolwarm colour gradient.
# Fixes: the labels for 'delta_lon' / 'delta_lat' were swapped (lon is
# "Längengrad", lat is "Breitengrad"), and the mapping listed 'lon' and
# 'total_time' twice.
dataset[['month',
         'dayofweek',
         'hour',
         'time_since_first_station',
         'station_number',
         'lat',
         'lon',
         'stay_time',
         'time_since_last_station',
         'total_time',
         'delta_lon',
         'delta_lat',
         'start_lat',
         'start_lon',
         'destination_lat',
         'destination_lon',
         ]].rename(columns={
             'month': 'Monat',
             'dayofweek': 'Wochentag',
             'hour': 'Uhrzeit',
             'time_since_first_station': 'Gesamte Fahrzeit',
             'station_number': 'Anzahl Halte',
             'lat': 'Breitengrad',
             'lon': 'Längengrad',
             'stay_time': 'Aufenthaltszeit',
             'time_since_last_station': 'Fahrtzeit letzter Halt',
             'total_time': 'Planmäßige Fahrzeit',
             'delta_lon': 'Längengrad Änderung',
             'delta_lat': 'Breitengrad Änderung',
         }).corr().style.background_gradient(cmap='coolwarm')


In [None]:
print(__doc__)

# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets


# Gradient-boosting regularisation comparison on the current train/test
# split: one deviance curve per shrinkage / subsampling setting.
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

plt.figure()

# The loop variable is named `curve_label`, not `label`: the notebook keeps
# the target column name in the global `label`, and the previous version
# overwrote it here, breaking every later held_out_label(..., label, ...) call.
for curve_label, color, setting in [('No shrinkage', 'orange',
                                     {'learning_rate': 1.0, 'subsample': 1.0}),
                                    ('learning_rate=0.1', 'turquoise',
                                     {'learning_rate': 0.1, 'subsample': 1.0}),
                                    ('subsample=0.5', 'blue',
                                     {'learning_rate': 1.0, 'subsample': 0.5}),
                                    ('learning_rate=0.1, subsample=0.5', 'gray',
                                     {'learning_rate': 0.1, 'subsample': 0.5}),
                                    ('learning_rate=0.1, max_features=2', 'magenta',
                                     {'learning_rate': 0.1, 'max_features': 2})]:
    params = dict(original_params)
    params.update(setting)

    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # Test accuracy as 1 - error rate.
    print(1 - ((y_test != clf.predict(X_test)).sum() / X_test.shape[0]))
    # compute test set deviance
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        # NOTE(review): `loss_` is believed to be deprecated/removed in newer
        # scikit-learn releases -- confirm against the installed version.
        test_deviance[i] = clf.loss_(y_test, y_pred)

    # Plot every 5th boosting iteration.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
            '-', color=color, label=curve_label)

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')

plt.show()

In [None]:
import seaborn as sns
# udfs ----

# build a feature-importance table, most important feature first
def imp_df(column_names, importances):
    """Return a DataFrame of features ranked by descending importance.

    Parameters
    ----------
    column_names
        Feature names.
    importances
        One importance value per feature, in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'feature_importance', sorted from highest to
        lowest importance with a fresh 0..n-1 index.
    """
    table = pd.DataFrame({'feature': column_names,
                          'feature_importance': importances})
    ranked = table.sort_values('feature_importance', ascending = False)
    return ranked.reset_index(drop = True)

# plotting a feature importance dataframe (horizontal barchart)
def var_imp_plot(imp_df, title):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    imp_df : pandas.DataFrame
        Two-column frame: feature name, then importance. Note that this
        parameter shadows the `imp_df` function inside this body.
    title : str
        Title shown above the chart.
    """
    # Rename the columns on a copy; the previous version assigned to
    # `imp_df.columns` and thereby modified the caller's DataFrame.
    plot_data = imp_df.copy()
    plot_data.columns = ['feature', 'feature_importance']
    a4_dims = (11.7, 6)
    fig, ax = plt.subplots(figsize=a4_dims)
    sns.barplot(ax=ax ,x = 'feature_importance', y = 'feature', data = plot_data, orient = 'h', color = 'royalblue') \
       .set_title(title, fontsize = 20)

In [None]:
# Impurity-based importances of the fitted random forest, listed under German
# display names.
# Fixes: the labels for 'delta_lon' / 'delta_lat' were swapped (lon is
# "Längengrad", lat is "Breitengrad"), and the mapping listed 'lon' and
# 'total_time' twice.
# NOTE(review): assumes models['RandomForest'] was last fitted on exactly the
# columns of the current X_train, in the same order.
base_imp = imp_df(
    X_train.rename(columns={
        'month': 'Monat',
        'dayofweek': 'Wochentag',
        'hour': 'Uhrzeit',
        'time_since_first_station': 'Gesamte Fahrzeit',
        'station_number': 'Anzahl Halte',
        'lat': 'Breitengrad',
        'lon': 'Längengrad',
        'stay_time': 'Aufenthaltszeit',
        'time_since_last_station': 'Fahrtzeit letzter Halt',
        'total_time': 'Planmäßige Fahrzeit',
        'delta_lon': 'Längengrad Änderung',
        'delta_lat': 'Breitengrad Änderung',
    }).columns,
    models['RandomForest'].feature_importances_,
)
base_imp

In [None]:
# Bar chart of the random-forest importances computed above.
var_imp_plot(base_imp, 'Feature Importance')

In [None]:
from sklearn.base import clone 

def drop_col_feat_imp(model, X_train, y_train, random_state = 42):
    """Drop-column feature importance.

    A clone of ``model`` is fitted on all columns to obtain a benchmark score.
    For every column, another clone is fitted without that column; the
    importance is ``benchmark_score - score_without_column``. All scores are
    computed on the training data itself.

    Parameters
    ----------
    model
        Estimator used as the template; it is cloned, never fitted itself.
    X_train : pandas.DataFrame
        Training features.
    y_train
        Training labels.
    random_state : int, default 42
        Assigned to every clone so that the fits are comparable.

    Returns
    -------
    pandas.DataFrame
        Output of ``imp_df``: one row per column, sorted by importance.
    """
    def _fit_and_score(features):
        # Fresh clone with the template's specification and a fixed seed.
        candidate = clone(model)
        candidate.random_state = random_state
        candidate.fit(features, y_train)
        return candidate.score(features, y_train)

    benchmark_score = _fit_and_score(X_train)

    importances = []
    for col in X_train.columns:
        print(col)
        # Build the reduced frame once; it was previously created twice per
        # column (once for fit, once for score).
        reduced = X_train.drop(col, axis = 1)
        importances.append(benchmark_score - _fit_and_score(reduced))

    importances_df = imp_df(X_train.columns, importances)
    return importances_df

In [None]:
# Drop-column importances for the random forest, using whatever
# X_train/y_train currently hold; one model is re-fitted per feature.
drop_imp = drop_col_feat_imp(models['RandomForest'], X_train, y_train)
var_imp_plot(drop_imp, 'Drop Column feature importance')

In [None]:
# NOTE(review): exact duplicate of the previous cell; it repeats the same
# per-feature re-fits. Candidate for removal.
drop_imp = drop_col_feat_imp(models['RandomForest'], X_train, y_train)
var_imp_plot(drop_imp, 'Drop Column feature importance')

In [None]:
# Display the full prepared DataFrame.
dataset

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import random
import json
import sys



# Dump the metrics gathered so far. The JSON export below is disabled, so
# nothing is actually written to disk.
print("Writeing to file #1")
print(infos)
#with open('info.json', 'w') as fp:
#   json.dump(infos, fp)
print("Done")

# Leave-one-group-out evaluation, first per train number, then per station.
# Results are stored as infos[<key>][<model name>][<group value>] = accuracy.
for info_key, held_out_column in (('train_acc', 'trainno'), ('bhf_acc', 'bhf')):
    infos[info_key] = {}
    for model in models:
        print("\n")
        print(model)
        infos[info_key][model] = {}
        for train in dataset[held_out_column].unique():
            print(train, end=', ')
            # Fit on every other group, score on the held-out one.
            X_train, X_test, y_train, y_test = held_out_label(dataset, feat_labels, label, train, held_out_column)
            models[model].fit(X_train, y_train)
            model_classified = models[model].predict(X_test)
            infos[info_key][model][train] = 1 - ((y_test != model_classified).sum() / X_test.shape[0])
        # Progress snapshot after each model.
        print(str(infos))

    print("\n\n\n\n INFOS:")
    print(str(infos))
    print("Done\n\n\n\n\n\n")
