In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
def read_csv (path):
    train_data = pd.read_csv(path)
    return train_data

train_data =  read_csv('train.csv')
train_data_num = train_data.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])
train_data_text = train_data.drop(columns=['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])
train_data_target = train_data['Survived']
corr_matrix = train_data_num.corr()

train_data_num.hist(figsize=(10, 10), bins=30)
plt.show()
print()

In [58]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OrdinalEncoder

num_attributes = list(train_data_num)
text_attributes = list(train_data_text)

num_pipeline = Pipeline([  
    ('df_selector', DataFrameSelector(num_attributes)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')), 
    ('scaler', StandardScaler())
])

text_pipeline = Pipeline([
    ('df_selector', DataFrameSelector(text_attributes)),
    ('impute_text', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder())
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('text_pipeline', text_pipeline)
])

train_data_prepared = full_pipeline.fit_transform(train_data)

In [60]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,accuracy_score

sgd_reg = SGDRegressor(penalty='l2', max_iter=1000, tol=1e-3, random_state=42,
                       alpha=0.0001, learning_rate='invscaling', eta0=0.01,early_stopping=True,)

scores = cross_val_score(sgd_reg, train_data_prepared, train_data_target, 
                         scoring='neg_mean_squared_error', cv=5)
sgd_scores = np.sqrt(-scores)
print("SGD Regressor scores:", sgd_scores)


SGD Regressor scores: [2.15415613e+14 4.21250625e+14 2.08517648e+14 5.34701501e+14
 7.43674731e+13]
