In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


In [2]:
def load_data():
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    return train, test

train_df, test_df = load_data()
print(train_df.head())
print(test_df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:
def handle_missing_values(dataframe):
    # Impute missing values for 'Age' and 'Fare' with median
    imputer = SimpleImputer(strategy='median')
    dataframe['Age'] = imputer.fit_transform(dataframe[['Age']])
    dataframe['Fare'] = imputer.fit_transform(dataframe[['Fare']])

    # Impute missing values for 'Embarked' with mode
    dataframe['Embarked'].fillna(dataframe['Embarked'].mode()[0], inplace=True)

    # Drop the 'Cabin' column due to high percentage of missing values
    if 'Cabin' in dataframe.columns:
        dataframe.drop('Cabin', axis=1, inplace=True)

    return dataframe

train_df = handle_missing_values(train_df)
test_df = handle_missing_values(test_df)
print(train_df.isnull().sum())
print(test_df.isnull().sum())


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [4]:
def encode_categorical(dataframe):
    # Encode 'Sex' column
    dataframe['Sex'] = dataframe['Sex'].map({'male': 0, 'female': 1})

    # One-hot encode 'Embarked' column
    dataframe = pd.get_dummies(dataframe, columns=['Embarked'], drop_first=True)

    return dataframe

train_df = encode_categorical(train_df)
test_df = encode_categorical(test_df)
print(train_df.head())
print(test_df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare  Embarked_Q  Embarked_S  
0         A/5 21171   7.2500       False        True  
1          PC 17599  71.2833       False       False  
2  STON/O2. 3101282   7.9250       False        True  
3            113803  53.1000       False        True  
4            373450  

In [5]:
def generate_features(dataframe):
    # Create a new feature 'FamilySize'
    dataframe['FamilySize'] = dataframe['SibSp'] + dataframe['Parch'] + 1

    # Create a new feature 'IsAlone'
    dataframe['IsAlone'] = 1
    dataframe['IsAlone'].loc[dataframe['FamilySize'] > 1] = 0

    # Create a new feature 'Title' from 'Name'
    dataframe['Title'] = dataframe['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    # Simplify titles
    dataframe['Title'] = dataframe['Title'].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr',
                                                     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataframe['Title'] = dataframe['Title'].replace('Mlle', 'Miss')
    dataframe['Title'] = dataframe['Title'].replace('Ms', 'Miss')
    dataframe['Title'] = dataframe['Title'].replace('Mme', 'Mrs')
    dataframe = pd.get_dummies(dataframe, columns=['Title'], drop_first=True)

    return dataframe

train_df = generate_features(train_df)
test_df = generate_features(test_df)
print(train_df.head())
print(test_df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare  Embarked_Q  Embarked_S  FamilySize  IsAlone  \
0         A/5 21171   7.2500       False        True           2        0   
1          PC 17599  71.2833       False       False           2        0   
2  STON/O2. 3101282   7.9250       False        True           1 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['IsAlone'].loc[dataframe['FamilySize'] > 1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['IsAlone'].loc[dataframe['FamilySize'] > 1] = 0


In [6]:
def scale_numerical(train_df, test_df):
    scaler = StandardScaler()
    numerical_cols = ['Age', 'Fare', 'FamilySize']

    # Fit scaler on training data and transform both train and test data
    train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
    test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

    return train_df, test_df

train_df, test_df = scale_numerical(train_df, test_df)
print(train_df.head())
print(test_df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex       Age  SibSp  \
0                            Braund, Mr. Owen Harris    0 -0.565736      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  0.663861      1   
2                             Heikkinen, Miss. Laina    1 -0.258337      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  0.433312      1   
4                           Allen, Mr. William Henry    0  0.433312      0   

   Parch            Ticket      Fare  Embarked_Q  Embarked_S  FamilySize  \
0      0         A/5 21171 -0.502445       False        True    0.059160   
1      0          PC 17599  0.786845       False       False    0.059160   
2      0  STON/O2. 3101282 -0.488854       False        True   -0.560975   
3      0  

In [7]:
def prepare_data_for_modeling(train_df, test_df):
    # Drop columns that won't be used for prediction
    train_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    test_df.drop(['Name', 'Ticket'], axis=1, inplace=True)

    # Align test dataset with train dataset
    test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

    # Separate features and target
    X_train = train_df.drop('Survived', axis=1)
    y_train = train_df['Survived']

    X_test = test_df.drop('Survived', axis=1)  # Test set doesn't have 'Survived', but it's aligned

    return X_train, X_test, y_train

X_train, X_test, y_train = prepare_data_for_modeling(train_df, test_df)
print(X_train.shape, X_test.shape, y_train.shape)


(891, 14) (418, 14) (891,)
