## Laden der Daten

In [118]:
import requests

import pandas as pd

In [119]:
data = pd.read_html("http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights#Data_Table")

In [120]:
data = data[1]

In [121]:
data.head()

Unnamed: 0,Name,Team,Position,Height(inches),Weight(pounds),Age
0,Adam_Donachie,BAL,Catcher,74,180.0,22.99
1,Paul_Bako,BAL,Catcher,74,215.0,34.69
2,Ramon_Hernandez,BAL,Catcher,72,210.0,30.78
3,Kevin_Millar,BAL,First_Baseman,72,210.0,35.43
4,Chris_Gomez,BAL,First_Baseman,73,188.0,35.71


In [122]:
data.columns

Index(['Name', 'Team', 'Position', 'Height(inches)', 'Weight(pounds)', 'Age'], dtype='object')

## Erkunden der Daten

In [123]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1034 entries, 0 to 1033
Data columns (total 6 columns):
Name              1034 non-null object
Team              1034 non-null object
Position          1034 non-null object
Height(inches)    1034 non-null int64
Weight(pounds)    1033 non-null float64
Age               1034 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 48.6+ KB


In [124]:
data.isna().any()

Name              False
Team              False
Position          False
Height(inches)    False
Weight(pounds)     True
Age               False
dtype: bool

## Impute NAN's

In [125]:
data["Weight(pounds)"] = data["Weight(pounds)"].fillna(data["Weight(pounds)"].mean())

## Umbennen der Spalten

In [126]:
data = data.drop(["Name"], axis = 1)

In [127]:
data = data.rename(columns={"Height(inches)": "Height", "Weight(pounds)": "Weight"})

In [128]:
data["Height"] = round(data.Height * 2.54, 2)
data["Weight"] = round(data.Weight / 2.205, 2)

## Umwandeln der Spalten in cm bzw. kg

In [129]:
data.head()

Unnamed: 0,Team,Position,Height,Weight,Age
0,BAL,Catcher,187.96,81.63,22.99
1,BAL,Catcher,187.96,97.51,34.69
2,BAL,Catcher,182.88,95.24,30.78
3,BAL,First_Baseman,182.88,95.24,35.43
4,BAL,First_Baseman,185.42,85.26,35.71


## Train Test Split und Auswahl der Features

In [146]:
from sklearn.model_selection import train_test_split

In [184]:
input_features = [
       'Height', 'Weight', 'Age' 
]

output_features = [
    'Position'
]

X_train, X_test, y_train, y_test = train_test_split(
    data[input_features],
    data[output_features]
)

## Spalte Position encoden

In [182]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing

## Eigene Selectoren erstellen

In [175]:
class ColumnSelector:
    
    def __init__(self, select_numeric=True):
        self.select_numeric = select_numeric
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.select_numeric:
            return X.select_dtypes(include=["number"])
        elif not self.select_numeric:
            return X.select_dtypes(exclude=["number"])
        

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
class MyLEncoder():

    def transform(self, X, y=None, **fit_params):
        enc = preprocessing.LabelEncoder()
        encc = enc.fit(X)
        enc_data = enc.transform(X)

        return enc_data

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [176]:
num_attribs = data[["Age", "Height", "Weight"]]
cat_attribs = data[["Position", "Team"]]

## Pipelines für kategorische und numerische Variablen erstellen

In [190]:
num_pipeline = Pipeline([
    ('selector', ColumnSelector(select_numeric= True)),
    ('robust_scaler', RobustScaler()),
])

cat_pipeline = Pipeline([
    ('selector', ColumnSelector(select_numeric= False)),
    ('label_encoder', MyLEncoder())
])

In [178]:
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

## Numerische Daten skalieren

In [194]:
num_pipeline.fit(X_train)
X_train_p = num_pipeline.transform(X_train)
X_test_p = num_pipeline.transform(X_test)



## Kategorische Variablen encoden

In [201]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train.values.ravel())
y_train_p = label_encoder.transform(y_train.values.ravel())
label_encoder.fit(y_test.values.ravel())
y_test_p = label_encoder.transform(y_test.values.ravel())

## Klassifikation Logistischer Regression

### Es wird versucht anhand von Größe, Gewicht und Alter die Position zu prognostizieren

In [203]:
from sklearn.linear_model import LogisticRegression

In [206]:
log_model = LogisticRegression(multi_class="multinomial", solver="newton-cg")
log_model.fit(X_train_p, y_train_p)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [207]:
log_prediction = log_model.predict(X_test_p)

In [208]:
log_model.score(X_test_p, y_test_p)

0.305019305019305

## Klassifikation Random Forest

In [210]:
from sklearn.ensemble import RandomForestClassifier

In [226]:
rf_model = RandomForestClassifier(max_depth=5, random_state= 191, n_estimators=10)
rf_model.fit(X_train_p, y_train_p)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=191,
                       verbose=0, warm_start=False)

In [227]:
rf_prediction = rf_model.predict(X_test_p)

In [228]:
rf_model.score(X_test_p, y_test_p)

0.3204633204633205

## KNN Classifier

In [229]:
from sklearn.neighbors import KNeighborsClassifier

In [243]:
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(X_train_p, y_train_p)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

In [244]:
knn_prediction = knn_model.predict(X_test_p)

In [245]:
knn_model.score(X_test_p, y_test_p)

0.29343629343629346

## Decision Tree Classifier

In [246]:
from sklearn.tree import DecisionTreeClassifier

In [254]:
decision_tree = DecisionTreeClassifier(max_depth= 3, min_samples_leaf= 5)
decision_tree.fit(X_train_p, y_train_p)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [255]:
decision_tree_predict = decision_tree.predict(X_test_p)

In [256]:
decision_tree.score(X_test_p, y_test_p)

0.30115830115830117

Wie es zu erwarten war, sind die 3 Features zu wenig, um ein gutes Klassikfikationsergebnis zu erhalten. Um eine bessere Accuracy zu erhalten wären weitere Input-Features notwendig gewesen.