In [8]:
# Load the three CSV files into DataFrames
import pandas as pd

# Each path is read with pandas' default CSV settings; the frames are bound
# in the same order as the paths are listed.
train_df, test_df, gender_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/gender_submission.csv',
    )
)


In [9]:
# Understanding the data
# Only the last expression of a cell is rendered automatically, so the
# intermediate tables are passed to display() explicitly; as bare expressions
# their results would be computed and then discarded.
# display() is the IPython built-in that is available in notebook kernels.
display(train_df.head())      # first 5 rows of the data
train_df.info()               # prints column dtypes and non-null counts
display(train_df.describe())  # statistical summary of the data
train_df.isnull().sum()       # number of null values per column (cell output)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
# Same overview for the test frame.
# Only the last expression of a cell is rendered automatically, so the
# intermediate tables are passed to display() explicitly; as bare expressions
# their results would be computed and then discarded.
# display() is the IPython built-in that is available in notebook kernels.
display(test_df.head())      # first 5 rows of the data
display(test_df.describe())  # statistical summary of the data
test_df.info()               # prints column dtypes and non-null counts
test_df.isnull().sum()       # number of null values per column (cell output)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# Impute missing values in Age, Embarked and Fare.
#
# The fill values are computed once, from the training frame only, and reused
# for the test frame so that no statistic of the test set leaks into the
# preprocessing.
age_median = train_df['Age'].median()
fare_median = train_df['Fare'].median()
embarked_mode = train_df['Embarked'].mode()[0]  # mode() returns a Series; take its first entry

fill_values = {
    'Age': age_median,
    'Embarked': embarked_mode,
    'Fare': fare_median,
}

# Fill through the DataFrame with a column -> value mapping. Calling
# fillna(inplace=True) on a selected column (df['col'].fillna(...)) is chained
# assignment and is not guaranteed to write back into the frame.
# inplace=True modifies the original frames, which the later cells keep using.
train_df.fillna(fill_values, inplace=True)
# The test frame is filled for Fare as well: it is the frame that actually
# contains a missing Fare value.
test_df.fillna(fill_values, inplace=True)




In [17]:
# Drop the columns that are not used further, from both frames
unused_columns = ['Cabin', 'Ticket', 'Name']
for frame in (train_df, test_df):
    # inplace=True removes the columns from the existing frame object
    frame.drop(columns=unused_columns, inplace=True)


In [18]:
# Encode categorical variables as integer codes
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
categorical_columns = ['Sex', 'Embarked']
for col in categorical_columns:
    # The encoder is (re)fitted on the training column, and that same mapping
    # is then applied to both the training and the test column.
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
# LabelEncoder replaces each category with an integer label,
# e.g. female -> 0 and male -> 1 for the Sex column.




In [19]:
# Feature engineering: add a FamilySize column to both frames,
# defined as the sum of the SibSp and Parch columns.
for frame in (train_df, test_df):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])



In [20]:
# train_test_split divides the labelled data into a training part and a
# validation part
from sklearn.model_selection import train_test_split

# Features: everything except the target and the passenger identifier
non_feature_columns = ['Survived', 'PassengerId']
X = train_df.drop(columns=non_feature_columns)
# Target: the Survived column
y = train_df['Survived']

# 20% of the rows are held out for validation; the fixed random_state makes
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
# Train a random-forest classifier, evaluate it on the validation split and
# save the fitted model to disk.
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
# sklearn.ensemble provides the RandomForestClassifier used here
# sklearn.metrics provides accuracy_score for evaluating the classifier
# joblib is used to serialise the fitted model

# X_train, X_val, y_train and y_val come from the train/validation split cell;
# the split is not repeated here because the same arguments and random_state
# would only reproduce the identical partition.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Make sure the target directory exists before writing; joblib.dump does not
# create missing parent directories.
model_path = Path('../models/titanic_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))


Validation Accuracy: 0.8100558659217877


In [24]:

# Predict on the test frame.
# PassengerId is removed first so that only the remaining columns are passed
# to the model.
X_test = test_df.drop(columns=['PassengerId'])
predictions = model.predict(X_test)


In [25]:
# Build the submission table: the PassengerId column of the test frame plus a
# Survived column holding the model's predictions, then write it to CSV
# without the index.
submission = test_df[['PassengerId']].assign(Survived=predictions)
submission.to_csv('submission.csv', index=False)
