In [1]:
import numpy as np
import pandas as pd

# Fallback-Option: Rechne über dem Iris-Datensatz. 
# Aber eigentlich interessiert uns ja Titanic:
titanic = True

Dieses Notebook: Kleine Ausarbeitung von MLPC Rezept 11.3. Datensatz laden, skalieren, Logit-Regression anwenden.

Zu tun: Alter besser berechnen als stur eine 33.33 einzutragen. Also eignet sich dieses Notebook als Umgebung, um ein kleines Regressions-Projekt zur Alters-Abschätzung einzufügen.

Datensätze laden
----

### Fallback: Iris

Ergebnis des Ladens ist ein ```numpy ndarray```


In [2]:
# Load data 
from sklearn.datasets import load_iris
iris = load_iris()

# Create target vector and feature matrix
features, target = iris.data, iris.target

# Split into training and test set
from sklearn.model_selection import train_test_split

# Iris, mit np.ndarray
features_train_unscaled, features_test_unscaled, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)
type(features_train_unscaled)

numpy.ndarray

### Titanic mit pd.read_csv laden

Ergebnis ist ein ```DataFrame```


In [3]:
# url = 'https://tinyurl.com/titanic-csv'
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'
df = pd.read_csv(url).drop(['Sex', 'Name'], axis=1)


In [4]:
df['PClassNum'] = df['PClass'].replace(["1st", "2nd", "3rd"], [1.,2.,3.])
# alternative Syntax:
# df['PClassNum'] = df['PClass'].replace({"1st":1, "2nd":2, "3rd":3, "*":2})

In [5]:
# prüfen: gibt es noch Werte, die wir nicht ersetzt haben?
df.PClassNum.value_counts()


3.0    711
1.0    322
2.0    279
*        1
Name: PClassNum, dtype: int64

In [6]:
# aha, in einer Reihe wurde ein "*" als Kennzeichen für NaN verwendet.
df['PClassNum'] = df['PClassNum'].replace(["*"], [2])
df.PClassNum.value_counts()

3.0    711
1.0    322
2.0    280
Name: PClassNum, dtype: int64

In [7]:
# gaanz viele NaN im Attribut Alter!
# als baseline möge die Trivial-Strategie genügen (später besser machen)
df['Age'] = df['Age'].replace(np.nan, 33.33)  


In [8]:
# Train-Test-Split Titanic
from sklearn.model_selection import train_test_split

# Titanic, mit DataFrame
titanic_train, titanic_test = train_test_split(df, test_size=0.2,  random_state=42)
titanic_train.head()

Unnamed: 0,PClass,Age,Survived,SexCode,PClassNum
140,1st,16.0,1,1,1.0
903,3rd,32.0,1,0,3.0
363,2nd,44.0,0,1,2.0
199,1st,33.33,0,0,1.0
994,3rd,33.33,0,0,3.0


In [9]:
if titanic:
  print("Titanic")  
  features_train_unscaled = titanic_train[['PClassNum', 'Age', 'SexCode']].values
  features_test_unscaled  = titanic_test [['PClassNum', 'Age', 'SexCode']].values
  target_train            = titanic_train['Survived'].values
  target_test             = titanic_test ['Survived'].values
    
if not titanic:
    # Fallback: Wir verwenden die Features aus dem Iris-Datensatz
    print("Iris")  
    features_train_unscaled, features_test_unscaled, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

Titanic


In [10]:
import pandas as pd
# targets=y: eine einzige Spalte von zugehörigen Klassen 0, 1 oder 2
y_train = pd.DataFrame(target_train)
y_test = pd.DataFrame(target_test)

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

In [11]:
features_train = scaler.fit_transform(features_train_unscaled)

In [12]:
features_train = scaler.fit_transform(features_train_unscaled)
features_test  = scaler.transform(features_test_unscaled)

# X: Features-Matrix, viele Zeilen von einzelnen Messungen
X_train_unscaled = pd.DataFrame(features_train_unscaled)
X_train = pd.DataFrame(features_train)
X_test = pd.DataFrame(features_test)


# X_train_unscaled.describe().round(1)
# hat die Skalierung funktioniert?
X_train.describe().round(1)

Unnamed: 0,0,1,2
count,1050.0,1050.0,1050.0
mean,-0.0,0.0,-0.0
std,1.0,1.0,1.0
min,-1.6,-2.9,-0.7
25%,-0.4,-0.5,-0.7
50%,0.8,0.2,-0.7
75%,0.8,0.2,1.4
max,0.8,3.6,1.4


In [13]:
X_train.head()

Unnamed: 0,0,1,2
0,-1.551499,-1.424282,1.355669
1,0.837173,0.031303,-0.737643
2,-0.357163,1.122991,1.355669
3,-1.551499,0.152298,-0.737643
4,0.837173,0.152298,-0.737643


Sklearn basiert auf Numpy, eigentlich ist kein Pandas erforderlich. Wir zeigen hier an einem Baseline-Classifier, wie man rein mit Sklearn einen Klassifizierer erstellt. Ein- und Ausgabe-Datenformat ist ein numpy.ndarray.



In [14]:
# Create (dummy) classifier
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='uniform', random_state=1)

# "Train" model
dummy.fit(features_train, target_train)

# unbekannte Exemplare klassifizieren: 
# Für jede Zeile in features_test wird eine Klasse errechnet. 
# Ergebnis ist ein Vektor.

#print(dummy.score(features_test, target_test), dummy.predict(features_test))


DummyClassifier(constant=None, random_state=1, strategy='uniform')

Data Wranglig machen wir aber genau genommen mit Pandas. Also bekommt auch Sklearn Pandas-Objekte, also ein pd.DataFrame als Eingabe.

In [15]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=1)
rfc.fit(X_train, y_train[0])
#print(rfc.score(X_test, y_test[0]), rfc.predict(X_test) )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [17]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
from sklearn import linear_model
logit = linear_model.LogisticRegressionCV(Cs=1000, cv=3, multi_class='ovr', solver='saga', max_iter=1000)
logit.fit(X_train, y_train[0])
#print(logit.score(X_test, y_test[0]), logit.predict(X_test))

LogisticRegressionCV(Cs=1000, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000,
           multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='saga', tol=0.0001, verbose=0)

In [18]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
from sklearn import linear_model
sgdEnet = linear_model.SGDClassifier(penalty='elasticnet', max_iter=1000, tol=1e-3)
sgdEnet.fit(X_train, y_train[0])


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=1000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [19]:
y_test['logit']=pd.Series(logit.predict(X_test))
y_test['rfc']= pd.Series(rfc.predict(X_test))
y_test['sgdEnet']= pd.Series(sgdEnet.predict(X_test))
y_test['dummy']=pd.Series(dummy.predict(X_test))

In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Get accuracy score
print("dummy:\n", 
      dummy.score(X_test, y_test[0]), '\n',
      confusion_matrix(y_test[0], y_test['dummy']))
print("rfc:\n", 
      rfc.score(X_test, y_test[0]), '\n', 
     confusion_matrix(y_test[0], y_test['rfc']))
print("logit:\n", 
      logit.score(X_test, y_test[0]), '\n',
     confusion_matrix(y_test[0], y_test['logit']))
print("sgdEnet:\n", 
      sgdEnet.score(X_test, y_test[0]), '\n',
     confusion_matrix(y_test[0], y_test['sgdEnet']))


dummy:
 0.467680608365019 
 [[77 91]
 [49 46]]
rfc:
 0.8060836501901141 
 [[153  15]
 [ 36  59]]
logit:
 0.8212927756653993 
 [[161   7]
 [ 40  55]]
sgdEnet:
 0.7414448669201521 
 [[124  44]
 [ 24  71]]


In [21]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
# from sklearn.metrics import classification_report
# y_true = [0, 1, 2, 2, 2]
# y_pred = [0, 0, 2, 2, 1]
# target_names = ['class 0', 'class 1', 'class 2']
# print(classification_report(y_true, y_pred, target_names=target_names))

print("rfc:\n", classification_report(y_test[0].values, y_test['rfc'].values))
print("logit:\n", classification_report(y_test[0].values, y_test['logit'].values))
print("sgdEnet:\n", classification_report(y_test[0].values, y_test['sgdEnet'].values))

rfc:
               precision    recall  f1-score   support

           0       0.81      0.91      0.86       168
           1       0.80      0.62      0.70        95

   micro avg       0.81      0.81      0.81       263
   macro avg       0.80      0.77      0.78       263
weighted avg       0.81      0.81      0.80       263

logit:
               precision    recall  f1-score   support

           0       0.80      0.96      0.87       168
           1       0.89      0.58      0.70        95

   micro avg       0.82      0.82      0.82       263
   macro avg       0.84      0.77      0.79       263
weighted avg       0.83      0.82      0.81       263

sgdEnet:
               precision    recall  f1-score   support

           0       0.84      0.74      0.78       168
           1       0.62      0.75      0.68        95

   micro avg       0.74      0.74      0.74       263
   macro avg       0.73      0.74      0.73       263
weighted avg       0.76      0.74      0.75       