In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import mglearn

from IPython.display import display

%matplotlib inline

# Копирам си модела от лекцията

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a single column by key.

    The column is requested with a one-element list, so for a pandas
    DataFrame the result stays two-dimensional (a one-column frame, not a
    Series), which is the shape the downstream steps are fed in this notebook.
    """

    def __init__(self, key):
        # Name of the column to extract in `transform`.
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; return the instance so calls can be chained."""
        return self

    def transform(self, data_dict):
        """Return `data_dict` restricted to the configured key."""
        wanted_columns = [self.key]
        return data_dict[wanted_columns]
    
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """LabelBinarizer that tolerates the ``(X, y)`` call shape used by Pipeline.

    The parent class is written for targets and takes a single positional
    argument. Each method here accepts an optional ``y`` and ignores it, so
    the binarizer can be used as a feature-encoding step.
    """

    def fit(self, X, y=None):
        """Learn the set of labels present in ``X``.

        ``y`` is ignored. Returns ``self`` so that ``fit(X).transform(X)``
        works, as the estimator API requires.
        """
        super().fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` with the labels learned in ``fit``; ``y`` is ignored."""
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its binarized form; ``y`` is ignored."""
        return super().fit(X).transform(X)
    
class StringImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the most frequent value of each column.

    Inherits from ``BaseEstimator`` (like ``ItemSelector``) so the step
    exposes ``get_params`` / ``set_params`` and can be cloned and tuned like
    any other estimator.
    """

    def fit(self, X, *_):
        """Record the per-column mode of ``X``; extra positional args are ignored."""
        # DataFrame.mode() yields one row per tied mode; the first row gives
        # a single fill value for every column.
        self.modes = X.mode().iloc[0]
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the stored modes."""
        return X.fillna(self.modes)

In [19]:
# Read both CSV files; the paths are relative to the current working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [37]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.
# `SimpleImputer` (available since 0.20) is its replacement; with
# strategy='mean' it fills each column with that column's mean. Fall back to
# the old class only when the new one does not exist, and keep the name
# `Imputer` so the rest of the notebook is unaffected.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Each branch of the union selects one column, cleans it and encodes/scales
# it; the branch outputs are concatenated and fed to the classifier.
model = Pipeline([
    ('union', FeatureUnion([
        # Numeric column with gaps: mean-impute, then standardize.
        ('age', Pipeline([
            ('select', ItemSelector('Age')),
            ('imputer', Imputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ])),
        # Categorical columns: fill gaps with the mode, then binarize.
        ('gender', Pipeline([
            ('select', ItemSelector('Sex')),
            ('imputer', StringImputer()),
            ('encoder', LabelBinarizerPipelineFriendly()),
        ])),
        ('embarked', Pipeline([
            ('select', ItemSelector('Embarked')),
            ('imputer', StringImputer()),
            ('encoder', LabelBinarizerPipelineFriendly()),
        ])),
        # Count columns: standardize only.
        ('sibsp', Pipeline([
            ('select', ItemSelector('SibSp')),
            ('scaler', StandardScaler()),
        ])),
        ('parch', Pipeline([
            ('select', ItemSelector('Parch')),
            ('scaler', StandardScaler()),
        ])),
    ])),
    ('classifier', SVC())
])

# The whole frame (including 'Survived') is passed as X, but only the five
# columns selected above ever reach the classifier.
scores = cross_val_score(model, train, train['Survived'])
print(scores)
print(scores.mean())

[ 0.81144781  0.83164983  0.82491582]
0.822671156004


# Сега ще се опитам да го подобря..
## Ще пусна Grid Search върху 3 алгоритъма (SVC, LogisticRegression, RandomForest)

Използвам същата трансформация на данните

In [39]:
def _scaled_branch(column, impute_mean=False):
    """Build a select -> [mean-impute] -> standardize pipeline for `column`."""
    steps = [('select', ItemSelector(column))]
    if impute_mean:
        steps.append(('imputer', Imputer(strategy='mean')))
    steps.append(('scaler', StandardScaler()))
    return Pipeline(steps)


def _categorical_branch(column):
    """Build a select -> mode-impute -> binarize pipeline for `column`."""
    return Pipeline([
        ('select', ItemSelector(column)),
        ('imputer', StringImputer()),
        ('encoder', LabelBinarizerPipelineFriendly()),
    ])


# Same five branches, in the same order, as the union inside `model`.
feature_union = FeatureUnion([
    ('age', _scaled_branch('Age', impute_mean=True)),
    ('gender', _categorical_branch('Sex')),
    ('embarked', _categorical_branch('Embarked')),
    ('sibsp', _scaled_branch('SibSp')),
    ('parch', _scaled_branch('Parch')),
])

Разделям на трейн и тест и ги трансформирам

In [43]:
from sklearn.model_selection import train_test_split


# Hold out part of the labelled data; random_state pins the shuffle so the
# split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train["Survived"],
    random_state=0,
)

# Fit the preprocessing on the training part only, then apply it to both parts.
transformer = feature_union.fit(X_train)
X_train_transformed, X_test_transformed = (
    transformer.transform(part) for part in (X_train, X_test)
)

Пускам Грид Сърч над 3-те алгоритъма. Нека най-добрият победи.

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Single-step pipeline; the SVC here is only a placeholder, because every
# entry of `grid` replaces the 'classifier' step with its own estimator.
pipeline = Pipeline([
    ('classifier', SVC())
])

# One sub-grid per algorithm. The 'classifier' key swaps the estimator and
# the 'classifier__*' keys tune that estimator's hyper-parameters.
grid = [
    {
        'classifier': [SVC()],
        'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    {
        # Seeded so repeated runs of the search give the same forest scores
        # (same seed as the train/test split).
        'classifier': [RandomForestClassifier(random_state=0)],
        'classifier__max_features': [1, 2, 3, 5],
        'classifier__n_estimators': [10, 50, 100, 200]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
    }
]

In [48]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over every sub-grid in `grid`.
# NOTE(review): the features were transformed before this search, so the
# imputer/scaler statistics were computed on all of X_train, including each
# fold's validation rows. Consider moving the preprocessing into `pipeline`.
search = GridSearchCV(estimator=pipeline, param_grid=grid, cv=5)
search.fit(X_train_transformed, y_train)

best_params_line = "Best params:\n{}\n".format(search.best_params_)
best_score_line = "Best cross-validation score: {:.2f}".format(search.best_score_)
print(best_params_line)
print(best_score_line)

Best params:
{'classifier': SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classifier__C': 1, 'classifier__gamma': 1}

Best cross-validation score: 0.82


# Изглежда, че моделът от лекцията е най-добрият от всички