In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Load the IC 2004 dataset and preview its first 100 rows.
df = pd.read_csv(
    'newData/IC_2004.csv',
)
df.head(n=100)

## Auswahl der numerischen Spalten aus dem DataFrame

Zudem wollen wir die `X` und `y` Werte festlegen, die wir für unser Modell benötigen.

**Daten**: `track_length_since_start`, `time_since_first_station`, `station_number`, `lat`, `lon`, `track_length` und `zeit`

**Vorhersage**: `isadelay5`

In [None]:
# Earlier, wider feature selection, kept disabled for reference:
#X = df[['bhf_cat', 'dayname_cat', 'trainno_cat', 'zeit', 'temperature_c', 'air_pressure_hpa','relative_humidity', 'dew_point_c', 'wind_speed_kmh', 'time_since_last_station', 'time_since_first_station', 'stay_time', 'track_length' ,'track_length_since_start' , 'weather_condition_cat']]

# Feature columns currently used as model input.
feature_columns = [
    'track_length_since_start',
    'time_since_first_station',
    'station_number',
    'lat',
    'lon',
    'track_length',
    'zeit',
]
X = df[feature_columns]

In [None]:
# Target vector: the `isadelay5` column. The cell displays the number of samples.
y = df.loc[:, 'isadelay5']
len(y)

In [None]:
# Hold out 33 % of the rows as a test set; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Auswahl des Klassifikators

Anhand der nachfolgenden Graphik wählen wir den Klassifikator.

![Sklearn Mindmap](https://scikit-learn.org/stable/_static/ml_map.png)

### Naive Bayes

#### Training


In [None]:
# Baseline: a dummy classifier that always predicts the most frequent
# class value. Real models should be compared against it.
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier(strategy="most_frequent")
dummy_classifier.fit(X_train, y_train)

In [None]:
# confusion_matrix is needed here already; without this import the cell
# raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import confusion_matrix

# Evaluate the majority-class baseline on the held-out test set:
# print the number of misclassified samples, then display the confusion matrix.
y_dummy_classifier = dummy_classifier.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0],(y_test != y_dummy_classifier).sum()))
confusion_matrix(y_test, y_dummy_classifier, labels=[0,1])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the (unscaled) training split.
# fit() returns the estimator itself, so model_nb and gnb are the same object.
gnb = GaussianNB()
model_nb = gnb.fit(X_train, y_train)

#### Auswertung
Durchgang 1: `Number of mislabeled points out of a total 9865 points : 2693` =  **27**% Falsch

In [None]:
# Predict on the test split, report how many samples were misclassified,
# and display the confusion matrix for the classes 0 and 1.
y_pred_nb = model_nb.predict(X_test)
n_wrong_nb = (y_test != y_pred_nb).sum()
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], n_wrong_nb))
confusion_matrix(y_true=y_test, y_pred=y_pred_nb, labels=[0, 1])

### Support Vector Machines

#### Training

In [None]:
from sklearn import svm

# Train a support vector classifier on the (unscaled) training split.
model_svm = svm.SVC(gamma='scale')
model_svm.fit(X=X_train, y=y_train)

#### Auswertung
Durchgang 1: `Number of mislabeled points out of a total 9865 points : 2715` = **27,5**% Falsch

In [None]:
# Predict test-set labels with the SVM trained on unscaled features.
y_pred_svm = model_svm.predict(X_test)

In [None]:
# Report the number of misclassified test samples for the SVM and
# display its confusion matrix.
n_wrong_svm = (y_test != y_pred_svm).sum()
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], n_wrong_svm))
confusion_matrix(y_true=y_test, y_pred=y_pred_svm, labels=[0, 1])

### Entscheidungsbaum
#### Training

Number of mislabeled points out of a total 9865 points : 2682

In [None]:
from sklearn import tree

# Train a decision tree with default hyperparameters on the unscaled
# training split. fit() returns the fitted estimator.
model_dt = tree.DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Evaluate the decision tree on the test split: misclassification count
# followed by the confusion matrix.
y_pred_dt = model_dt.predict(X_test)
n_wrong_dt = (y_test != y_pred_dt).sum()
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], n_wrong_dt))
confusion_matrix(y_true=y_test, y_pred=y_pred_dt, labels=[0, 1])

In [None]:
# Optional visualisation of the fitted decision tree; currently disabled.
# tree.plot_tree(model_dt)

## Mit Scaling

In [None]:
# Mean and standard deviation of the training features before scaling,
# shown to motivate the standardization step that follows.
X_train.mean(), X_train.std()

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same transformation is then applied to the test split, so no
# test-set statistics leak into training.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Sanity check after scaling: mean and standard deviation of the scaled
# training data (expected to be close to 0 and 1 after standardization).
X_train_sc.mean(), X_train_sc.std()

### Naive Bayes mit Scaling
Number of mislabeled points out of a total 9865 points : 2693

In [None]:
# Gaussian Naive Bayes trained on the standardized training features.
gnb_sc = GaussianNB()
model_nb_sc = gnb_sc.fit(X=X_train_sc, y=y_train)

In [None]:
# Evaluate the scaled Naive Bayes model on the standardized test split:
# misclassification count followed by the confusion matrix.
y_pred_nb_sc = model_nb_sc.predict(X_test_sc)
n_wrong_nb_sc = (y_test != y_pred_nb_sc).sum()
print("Number of mislabeled points out of a total %d points : %d" % (X_test_sc.shape[0], n_wrong_nb_sc))
confusion_matrix(y_true=y_test, y_pred=y_pred_nb_sc, labels=[0, 1])

### SVM mit Scaling

In [None]:
from sklearn import svm

# Support vector classifier trained on the standardized training features.
model_svm_sc = svm.SVC(gamma='scale')
model_svm_sc.fit(X=X_train_sc, y=y_train)

In [None]:
# Predict with the SVM that was trained on standardized features, using the
# standardized test split. Using model_svm / X_test here would merely
# reproduce the unscaled SVM result under the "with scaling" heading.
y_pred_svm_sc = model_svm_sc.predict(X_test_sc)

In [None]:
# Report the number of misclassified test samples for the scaled SVM
# predictions and display the confusion matrix.
n_wrong_svm_sc = (y_test != y_pred_svm_sc).sum()
print("Number of mislabeled points out of a total %d points : %d" % (X_test_sc.shape[0], n_wrong_svm_sc))
confusion_matrix(y_true=y_test, y_pred=y_pred_svm_sc, labels=[0, 1])

### Entscheidungsbaum mit Scaling
#### Training

Number of mislabeled points out of a total 9865 points : 2685

In [None]:
model_dt_sc = tree.DecisionTreeClassifier()
model_dt_sc = model_dt_sc.fit(X_train, y_train)

In [None]:
y_pred_dt_sc = model_dt_sc.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0],(y_test != y_pred_dt_sc).sum()))
confusion_matrix(y_test, y_pred_dt_sc, labels=[0,1])

### Nearest Centroid mit Scaling
Number of mislabeled points out of a total 9865 points : 4318

In [None]:
# Import from the public package path: the private module
# `sklearn.neighbors.nearest_centroid` is not available in newer
# scikit-learn releases, while `sklearn.neighbors.NearestCentroid` is stable.
from sklearn.neighbors import NearestCentroid

# Nearest-centroid classifier trained on the standardized features.
model_nnc_sc = NearestCentroid()
model_nnc_sc.fit(X_train_sc, y_train)

In [None]:
# Evaluate the nearest-centroid model on the standardized test split:
# misclassification count followed by the confusion matrix.
y_pred_nnc_sc = model_nnc_sc.predict(X_test_sc)
n_wrong_nnc_sc = (y_test != y_pred_nnc_sc).sum()
print("Number of mislabeled points out of a total %d points : %d" % (X_test_sc.shape[0], n_wrong_nnc_sc))
confusion_matrix(y_true=y_test, y_pred=y_pred_nnc_sc, labels=[0, 1])

## Neuronale Netze mit Keras und Tensorflow

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.regularizers import l2

# Derive the input width from the training data instead of hard-coding it.
# The feature matrix has 7 columns, so the previous input_dim=6 made
# fit() fail with a shape mismatch.
n_features = X_train.shape[1]

# Small fully connected network for binary classification:
# three ReLU hidden layers and one sigmoid output unit, all L2-regularized.
model_nn = Sequential()

model_nn.add(Dense(units=12, activation='relu', kernel_regularizer=l2(0.01), input_dim=n_features))
model_nn.add(Dense(units=6, activation='relu', kernel_regularizer=l2(0.01)))
model_nn.add(Dense(units=6, activation='relu', kernel_regularizer=l2(0.01)))
model_nn.add(Dense(units=1, activation='sigmoid', kernel_regularizer=l2(0.01)))

# NOTE(review): `lr`, `epsilon=None` and `decay` follow the older standalone
# Keras API; newer releases expect `learning_rate` and may reject the other
# arguments. Confirm against the installed Keras version before upgrading.
adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0001, amsgrad=False)

model_nn.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:
# Train the network for 30 epochs with mini-batches of 32 samples.
# NOTE(review): this uses the unscaled features; X_train_sc may be the
# better input for a neural network. Confirm the intent.
model_nn.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=32,
)

In [None]:
# Evaluate the trained network on the test split; the result holds the loss
# and the metrics configured in compile() (here: accuracy).
loss_and_metrics = model_nn.evaluate(X_test, y_test, batch_size=128)

In [None]:
# Raw network outputs for the test split (sigmoid activations, not class labels).
y_pred_nn = model_nn.predict(X_test, batch_size=128)

In [None]:
# Inspect the shape of the prediction array. The cell previously ended in a
# dangling attribute access (`y_pred_nn.`), which is a SyntaxError and
# aborts Restart & Run All.
y_pred_nn.shape

In [None]:
# Turn the sigmoid outputs into class labels with a 0.5 threshold and
# display the confusion matrix for the neural network.
confusion_matrix(y_test, y_pred_nn > 0.5, labels=[0,1])