In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as pl
import scipy.stats as stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import seaborn as sns

In [None]:
# Load the raw incident export; the path is relative to the notebook's working directory.
df = pd.read_csv('sap_storing_data_hu_project.csv')

In [None]:
def column_outlier(strength, dataframe, columns):
    """Return a copy of ``dataframe`` without IQR outliers in ``columns``.

    Parameters
    ----------
    strength : {'s', 'a'} or number
        ``'s'`` uses an IQR multiplier of 3, ``'a'`` uses 1.5. A numeric value
        is used directly as the multiplier.
    dataframe : pandas.DataFrame
        Input frame; it is copied and never modified.
    columns : iterable of str
        Columns to filter on, applied one after another. Quartiles for each
        column are computed on the rows that survived the previous columns.

    Returns
    -------
    pandas.DataFrame
        Rows whose value in every listed column lies inside
        ``[Q1 - k * IQR, Q3 + k * IQR]``. Rows with NaN in a column are kept,
        because both outlier comparisons evaluate to False for NaN.

    Raises
    ------
    ValueError
        If ``strength`` is a string other than ``'s'`` or ``'a'``.
    """
    multipliers = {'s': 3, 'a': 1.5}
    if isinstance(strength, str):
        # Fail fast with a clear message instead of an obscure TypeError in the
        # arithmetic further down.
        if strength not in multipliers:
            raise ValueError(
                "Invalid strength %r; expected 's', 'a' or a numeric multiplier" % (strength,)
            )
        factor = multipliers[strength]
    else:
        factor = strength

    filtered = dataframe.copy()
    for column in columns:
        q1 = filtered[column].quantile(0.25)
        q3 = filtered[column].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr
        is_outlier = (filtered[column] < lower) | (filtered[column] > upper)
        filtered = filtered[~is_outlier]
    return filtered

In [None]:
# Cast the 'stm_progfh_in_duur' column to floats: every '-' and '*' character is
# removed and surrounding whitespace stripped before the conversion.
df['stm_progfh_in_duur'] = df['stm_progfh_in_duur'].map(
    lambda raw: float(str(raw).replace('*', '').replace('-', '').strip())
)

# delete stm_fh_duur outliers
#df = column_outlier('a', df.copy(), ['stm_fh_duur'])


In [None]:
# Cause-code lookup table (semicolon separated); its 'Code' column is renamed so it
# shares the key name 'stm_oorz_code' used in the merge with the incident data.
o_df = (
    pd.read_csv('Oorzaakcodes.csv', sep=';')
    .rename(columns={'Code': 'stm_oorz_code'})
)

In [None]:
# Attach the cause descriptions to each incident. The outer join keeps keys from
# both frames, so codes that occur on only one side still produce a row.
df = pd.merge(df, o_df, how='outer', on='stm_oorz_code')

In [None]:
# Remove the row limit when displaying frames.
pd.options.display.max_rows = None

In [None]:
# Remove the column limit when displaying frames.
pd.options.display.max_columns = None

In [None]:
# Remove, in place, every row whose 'stm_fh_duur' equals 0.
df.drop(index=df.loc[df.stm_fh_duur.eq(0)].index, inplace=True)

In [None]:
# Summary statistics of 'stm_fh_duur' after the zero-valued rows were dropped.
df.stm_fh_duur.describe()

In [None]:
# Frame with 'stm_fh_duur' outliers removed (1.5 * IQR rule); df itself is left untouched.
new_df_copy = column_outlier(
    strength='a',
    dataframe=df.copy(),
    columns=['stm_fh_duur'],
)


In [None]:
# Peek at the first rows of the merged frame.
df.head()

In [None]:
# Kernel density estimate of 'stm_fh_duur' on the outlier-filtered frame.
sns.kdeplot(new_df_copy.stm_fh_duur)

In [None]:
# Mean 'stm_fh_duur' per cause ('Oorzaak'), computed after removing duration
# outliers with the 1.5 * IQR rule. Used as the lookup table for cause_predict.
mean_dur_df = (
    column_outlier('a', df.copy(), ['stm_fh_duur'])
    .loc[:, ['stm_fh_duur', 'Oorzaak']]
    .groupby('Oorzaak', as_index=False)
    .mean()
)

In [None]:
# Display the per-cause mean durations.
mean_dur_df

In [None]:
def cause_predict(cause, durations=None):
    """Predict a duration for ``cause`` as that cause's mean duration.

    Parameters
    ----------
    cause : str
        Value to look up in the 'Oorzaak' column.
    durations : pandas.DataFrame, optional
        Lookup table with columns 'Oorzaak' and 'stm_fh_duur'. Defaults to the
        notebook-level ``mean_dur_df``.

    Returns
    -------
    float or None
        The 'stm_fh_duur' value of the first row matching ``cause``, or None
        when the cause is not present in the table.
    """
    if durations is None:
        durations = mean_dur_df

    matches = durations.loc[durations.Oorzaak == cause, 'stm_fh_duur']
    if matches.empty:
        return None
    # Select the scalar explicitly: calling float() on a one-element Series is
    # deprecated in pandas.
    return float(matches.iloc[0])

In [None]:
# Example lookup for a single cause.
cause_predict('Applicatie/softwarefout')

In [None]:
# Evaluate cause_predict on a 10% sample of the outlier-filtered frame.
# random_state makes the sample, and therefore the error statistics, reproducible.
# NOTE(review): these rows also contributed to mean_dur_df, so this is an
# in-sample evaluation.
test_df = (
    new_df_copy[['Oorzaak', 'stm_fh_duur']]
    .sample(frac=0.1, random_state=42)
    .dropna()
)

y_pred = [cause_predict(x) for x in test_df.Oorzaak]
y_true = list(test_df.stm_fh_duur)

In [None]:
# Distribution of the absolute error between predicted and actual values.
pd.DataFrame([abs(pred - true) for pred, true in zip(y_pred, y_true)]).describe()


In [None]:
# Baseline: use the 'stm_progfh_in_duur' column itself as the prediction.
prog_df = df[['stm_fh_duur', 'stm_progfh_in_duur']].dropna()
# sample() already returns a new frame, so no explicit copy is needed;
# random_state keeps the evaluation reproducible.
test_prog_df = prog_df.sample(frac=0.1, random_state=42)

y_pred = list(test_prog_df.stm_progfh_in_duur)
y_true = list(test_prog_df.stm_fh_duur)


In [None]:
# Distribution of the absolute error between predicted and actual values.
pd.DataFrame([abs(pred - true) for pred, true in zip(y_pred, y_true)]).describe()

In [None]:
# Width of the bins that the 'stm_fh_duur' column is divided into
# (presumably minutes, judging by the plot axis label later in the notebook).
error_margin = 10

In [None]:
# Rows that have both 'stm_fh_duur' and 'stm_progfh_in_duur' present.
t_df = df.loc[:, ['stm_fh_duur', 'stm_progfh_in_duur']].dropna()

<h2> Wat is de kans dat de prognose in de juiste bin zit? </h2>

In [None]:
# 1 when 'stm_progfh_in_duur' is within error_margin of 'stm_fh_duur', else 0.
t_df['succes'] = np.where(
    (t_df['stm_fh_duur'] - t_df['stm_progfh_in_duur']).abs() < error_margin, 1, 0
)
# The mean of a 0/1 column is the success fraction. Unlike value_counts()[1],
# it does not raise KeyError when there are no successes.
t_df.succes.mean()

In [None]:
# 5% sample for scoring cause_predict; random_state keeps the score reproducible.
p_df = df[['stm_fh_duur', 'Oorzaak']].sample(frac=0.05, random_state=42).dropna()

<h2> Wat is de kans dat de 'cause predict' voorspelling in de juiste bin zit? </h2>

In [None]:
# Predicted duration per row, looked up by cause.
p_df['pred'] = p_df.Oorzaak.apply(cause_predict)
# 1 when the prediction is within error_margin of 'stm_fh_duur', else 0.
p_df['succes'] = np.where(
    (p_df['stm_fh_duur'] - p_df['pred']).abs() < error_margin, 1, 0
)
# The mean of a 0/1 column is the success fraction. Unlike value_counts()[1],
# it does not raise KeyError when there are no successes.
p_df.succes.mean()

In [None]:
# Outlier-filtered frame used for the classification models below
# (column_outlier works on its own copy, so df is not modified).
df_co = column_outlier(
    strength='a',
    dataframe=df,
    columns=['stm_fh_duur'],
)

In [None]:
# Bin edges every error_margin units from 0 up to (not including) 900, and
# integer labels 1..len(bins)-1 for the resulting intervals.
bins = list(range(0, 900, error_margin))
labels = list(range(1, len(bins)))

# Values that fall outside the bin edges get NaN as their bin.
df_co['duration_bin'] = pd.cut(x=df_co['stm_fh_duur'], bins=bins, labels=labels)

In [None]:
# Correlation of every numeric column with 'stm_fh_duur'. numeric_only=True is
# required on pandas >= 2.0, where corr() raises on non-numeric columns such as
# the text column 'Oorzaak'.
df_co.corr(numeric_only=True).stm_fh_duur

In [None]:
# train_test_split is already imported in the notebook's first cell.
# Work on a reproducible 50% sample of the selected columns, without missing values.
df_co = (
    df_co[['stm_equipm_nr_mld', 'stm_prioriteit', 'stm_km_tot_mld','stm_fh_duur','duration_bin', 'stm_rapportage_jaar']]
    .sample(frac=0.5, random_state=42)
    .dropna()
)
X = df_co[['stm_equipm_nr_mld', 'stm_prioriteit', 'stm_km_tot_mld', 'stm_rapportage_jaar']]
y = df_co.duration_bin

# A fixed random_state keeps the split, and every score derived from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

len(X_train)

<h2> MODEL COMPARISON </h2>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.gaussian_process import GaussianProcessClassifier

# SVM, random forests and the Gaussian process classifier take far too long on
# this dataset, so only a decision tree and a 1-nearest-neighbour model are fitted.

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
print('1/3')
clf2 = KNeighborsClassifier(n_neighbors=1)
clf2.fit(X_train, y_train)
print('2/3')

In [None]:
def get_accuracy(r, zip_list):
    """Return the fraction of pairs whose second value is within ``r`` of the first.

    Parameters
    ----------
    r : number
        Allowed absolute difference (inclusive) between the two values.
    zip_list : iterable of (true, predicted) pairs
        Any iterable is accepted, including a ``zip`` object.

    Returns
    -------
    float
        ``hits / number_of_pairs``, or 0.0 when there are no pairs.
    """
    pairs = list(zip_list)
    if not pairs:
        return 0.0
    # Count the hits directly. The denominator is the number of pairs passed in,
    # so the result no longer depends on a notebook-level variable.
    hits = sum(1 for pair in pairs if pair[0] - r <= pair[1] <= pair[0] + r)
    return hits / len(pairs)

In [None]:
# A tolerance of r bins on either side of the true bin spans 2*r + 1 bins, i.e. a
# window of error_margin * (2*r + 1). For error_margin == 10 this is the same as
# the former hard-coded error_margin + 20*r.
tolerances = range(0, 30)
x_ax = [error_margin * (2 * r + 1) for r in tolerances]

# Predict once per model; the predictions do not depend on the tolerance.
tree_pairs = list(zip(list(y_test), list(clf.predict(X_test))))
knn_pairs = list(zip(list(y_test), list(clf2.predict(X_test))))

y1_ax = [get_accuracy(r, tree_pairs) for r in tolerances]
y2_ax = [get_accuracy(r, knn_pairs) for r in tolerances]

plt.plot(x_ax,y1_ax, label = 'Decision Tree')
plt.plot(x_ax,y2_ax, label = 'kNN')

plt.xlabel('bin size in minutes')
plt.ylabel('accuracy score')

plt.legend()
plt.show()

In [None]:
# Decision tree accuracy at tolerance 0 (prediction must hit the exact bin).
y1_ax[0]

In [None]:
# kNN accuracy at tolerance 0 (prediction must hit the exact bin).
y2_ax[0]