## Setup

Libraries importieren und Dataframes initialisieren

In [None]:
%matplotlib inline

# Pandas
import pandas as pd

# Plots
import seaborn as sns
import matplotlib.pylab as plt

# Farben von Pyplot
from matplotlib.colors import ListedColormap

# Numpy (performante listen)
import numpy as np

import missingno as mn

# Datasets, Klassifikatoren und Beurteilungshelfer
from sklearn import svm, metrics, neighbors, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix

In [None]:
# Example datasets shipped with seaborn, see https://github.com/mwaskom/seaborn-data
df_iris, df_tips = (sns.load_dataset(name) for name in ('iris', 'tips'))

In [None]:
# Inspect the column types of the iris frame before modelling.
df_iris.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [None]:
# Inspect the column types of the tips frame; non-numeric columns need encoding later.
df_tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

## Regression

In [None]:
def eval_regression(test, predict):
  """Print the mean squared error and the R2 score of a regression result.

  Args:
    test: Reference target values, passed as first argument to both metrics.
    predict: Predicted values, passed as second argument to both metrics.

  Returns:
    None. Each metric is printed on its own line with two decimal places.
  """
  # (output template, metric function) pairs, evaluated and printed in order
  reports = (
    ("MSE: %.2f", mean_squared_error),
    ("R2-Score: %.2f", r2_score),
  )
  for template, metric in reports:
    print(template % metric(test, predict))

In [None]:
# Count how often each (day, time) combination occurs in the tips data.
df_tips[['day', 'time']].value_counts()

day   time  
Sat   Dinner    87
Sun   Dinner    76
Thur  Lunch     61
Fri   Dinner    12
      Lunch      7
Thur  Dinner     1
dtype: int64

In [None]:
# Show the raw tips frame before any feature encoding is applied.
df_tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
# Tips feature engineering: encode the categorical columns as integers.
#
# Each mapping is scoped to its own column. A frame-wide replace() would apply
# every mapping to every column and relies on replace() renaming the categories
# of categorical columns, which pandas 2.2 deprecates.
# NOTE(review): 'day' is encoded ordinally (Thur < Fri < Sat < Sun); a linear
# model will treat that order as meaningful - confirm this is intended.
TIPS_ENCODINGS = {
  'sex': {'Female': 1, 'Male': 0},
  'smoker': {'Yes': 1, 'No': 0},
  'day': {'Thur': 1, 'Fri': 2, 'Sat': 3, 'Sun': 4},
  'time': {'Lunch': 0, 'Dinner': 1},
}

df_tips = df_tips.assign(**{
  # Map via object dtype and cast to int so the result is a plain integer
  # column; an unmapped label becomes NaN and makes the cast fail loudly.
  column: df_tips[column].astype(object).map(codes).astype(int)
  for column, codes in TIPS_ENCODINGS.items()
  # Already-numeric columns were encoded by a previous run of this cell;
  # skipping them keeps the cell safe to re-run.
  if not pd.api.types.is_numeric_dtype(df_tips[column])
})
df_tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,1,0,4,1,2
1,10.34,1.66,0,0,4,1,3
2,21.01,3.5,0,0,4,1,3
3,23.68,3.31,0,0,4,1,2
4,24.59,3.61,1,0,4,1,4


In [None]:
# Split the dataset: every column except 'tip' is a feature, 'tip' is the target.
X_tips = df_tips.drop('tip', axis=1)
y_tips = df_tips['tip']

# 60 % training / 40 % test. The fixed random_state makes the shuffled split,
# and therefore every metric computed from it, reproducible across runs.
X_tips_train, X_tips_test, y_tips_train, y_tips_test = train_test_split(
  X_tips, y_tips, test_size=0.4, shuffle=True, random_state=42)

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator),
# then predict the tips for the held-out test split.
tips_clf_linear = linear_model.LinearRegression().fit(X_tips_train, y_tips_train)
predict_tips_linear = tips_clf_linear.predict(X_tips_test)

In [None]:
# Same procedure with ridge regression, using the estimator's default settings.
tips_clf_ridge = linear_model.Ridge().fit(X_tips_train, y_tips_train)
predict_tips_ridge = tips_clf_ridge.predict(X_tips_test)

In [None]:
# Report MSE and R2 of the linear regression on the test split.
eval_regression(y_tips_test, predict_tips_linear)

MSE: 1.18
R2-Score: 0.32


In [None]:
# Report MSE and R2 of the ridge regression on the same test split for comparison.
eval_regression(y_tips_test, predict_tips_ridge)

MSE: 1.18
R2-Score: 0.32


## Klassifizierung

In [None]:
# Column types of the iris frame: four float features plus the 'species' label.
df_iris.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [None]:
# Show the iris frame that is used for classification.
df_iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [None]:
# Features are the four measurement columns, the target is the species label.
X_iris = df_iris.drop(['species'], axis=1)
y_iris = df_iris['species']

# 60 % training / 40 % test. The fixed random_state makes the shuffled split,
# and therefore the confusion matrix and scores below, reproducible across runs.
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(
  X_iris, y_iris, test_size=0.4, shuffle=True, random_state=42)

In [None]:
# k-nearest-neighbours classifier with k = 2 and equally weighted neighbours.
# NOTE(review): with an even k and uniform weights the two neighbours can
# disagree, leaving a tie to be broken - consider an odd k.
iris_clf_knn = neighbors.KNeighborsClassifier(
  n_neighbors=2, weights='uniform'
).fit(X_iris_train, y_iris_train)
predict_iris_knn = iris_clf_knn.predict(X_iris_test)

In [None]:
# Confusion matrix of the kNN predictions (true labels first, predictions second).
confusion_matrix(y_iris_test, predict_iris_knn)

array([[17,  0,  0],
       [ 0, 19,  2],
       [ 0,  1, 21]])

In [None]:
# Recall of the kNN predictions, macro-averaged over the classes.
recall_score(y_iris_test, predict_iris_knn, average='macro')

0.9531024531024531

In [None]:
# Precision of the kNN predictions, macro-averaged over the classes.
precision_score(y_iris_test, predict_iris_knn, average='macro')

0.9543478260869565

In [None]:
# Accuracy of the kNN predictions on the test split.
accuracy_score(y_iris_test, predict_iris_knn)

0.95