In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

path = '../data/train.csv' 
train_data = pd.read_csv(path)

# Mount of samples
print(f"{train_data.shape} \n\n")

# Deleting useless features
train_data = train_data.drop(columns=['Name','Ticket'])

# Checking which columns have empty values
print(f"{train_data.isnull().sum()} \n\n") 

# Because of most samples in the Cabin are missing we decide to delete column 'Cabin'ArithmeticError
train_data = train_data.drop(columns=['Cabin'])

# Filling Age columns with median
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)

# Filling Embarked with the mode (dominanta)
train_data['Embarked'].fillna(train_data['Embarked'].mode(), inplace=True) 

# Mapping Sex
train_data['Sex'] = train_data['Sex'].map({'male': 1, 'female': 0})

# Using one-hot encoding on Embarked
train_data = pd.get_dummies(train_data, columns=['Embarked'], drop_first=True) 
"""
    It creates new binary columns for each unique value in the categorical columns
    In other words I create columns with names: C, Q, S (in this case) and binary values inside of them 
    
    drop first: Prevents multicollinearity (a situation where one feature can be predicted from others, which can cause issues in some machine learning models).
    A passenger who boarded at Cherbourg (C) will have 0 in both Embarked_Q and Embarked_S.
    A passenger who boarded at Queenstown (Q) will have 1 in Embarked_Q and 0 in Embarked_S.
    A passenger who boarded at Southampton (S) will have 0 in Embarked_Q and 1 in Embarked_S."
"""

# Normalizing Fare 
train_data['Fare'] = (train_data['Fare'] - train_data['Fare'].mean()) / train_data['Fare'].std() # Syntax: (value_of_sample - mean_of_samples) / samples standard deviation

print(train_data)
print('\n\n')

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2137)



(891, 12) 


PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64 


     PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch      Fare  \
0              1         0       3    1  22.0      1      0 -0.502163   
1              2         1       1    0  38.0      1      0  0.786404   
2              3         1       3    0  26.0      0      0 -0.488580   
3              4         1       1    0  35.0      1      0  0.420494   
4              5         0       3    1  35.0      0      0 -0.486064   
..           ...       ...     ...  ...   ...    ...    ...       ...   
886          887         0       2    1  27.0      0      0 -0.386454   
887          888         1       1    0  19.0      0      0 -0.044356   
888          889         0       3    0  28.0      1      2 -0.176164   
889          890         1       1    1  26.0      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode(), inplace=True)
