In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [51]:
#Loading Dataset
Data=pd.read_csv('tested.csv')

In [52]:
# Shuffle the dataset to prevent any order-related bias
Data = Data.sample(frac=1, random_state=42).reset_index(drop=True)

In [53]:
#Basic Info
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [54]:
#Handeling missing values
Data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [55]:
Data['Age'].fillna(Data['Age'].median(), inplace= True)
Data['Fare'].fillna(Data['Fare'].median(), inplace= True) #filling fare with median
Data.drop(columns=['Name','Ticket','Cabin'], inplace=True) ##dropping cabin due to high missing values, name and ticket format we cant use like that so dropping that also

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Data['Age'].fillna(Data['Age'].median(), inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Data['Fare'].fillna(Data['Fare'].median(), inplace= True) #filling fare with median


In [56]:
#Encoding categorical variables
label_enc= LabelEncoder()
Data['Sex']=label_enc.fit_transform(Data['Sex'])
Data=pd.get_dummies(Data, columns=['Embarked'], drop_first=True)

In [57]:
#Define feature and traget
X=Data.drop(columns=['Survived','PassengerId'])
y=Data['Survived']

In [58]:
#Normalize Numerical features
scaler= StandardScaler()
numeric_features=['Age','Fare','Pclass','SibSp','Parch']
X[numeric_features]= scaler.fit_transform(X[numeric_features])

In [59]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [60]:
# Train model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

In [61]:
# Perform K-Fold Cross-Validation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(rf_model, X, y, cv=kf, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cross_val_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cross_val_scores))

Cross-Validation Accuracy Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Accuracy: 1.0


In [62]:
# Train a Logistic Regression model as a baseline
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, y_pred_log)
print("Logistic Regression Accuracy:", log_accuracy)

Logistic Regression Accuracy: 1.0


In [38]:
# Predictions
y_pred = rf_model.predict(X_test)

In [39]:
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [40]:
# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[53  0]
 [ 0 31]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       1.00      1.00      1.00        31

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [64]:
# Create DataFrame for PassengerId and Survival Prediction
results = pd.DataFrame({'PassengerId': Data.loc[X_test.index, 'PassengerId'], 'Survived': y_pred})
results.to_csv('titanic_predictions.csv', index=False)

In [65]:
results #the result showing data for 20% of the data we used testing

Unnamed: 0,PassengerId,Survived
406,979,1
352,945,1
342,1195,0
189,984,1
91,1281,0
...,...,...
211,1225,1
190,961,1
360,1231,0
304,1184,0


In [66]:
full_predictions = rf_model.predict(X)  # Predict for the entire dataset
Results = pd.DataFrame({'PassengerId': Data['PassengerId'], 'Survived': full_predictions})
Results.to_csv('titanic_predictions_full.csv', index=False)

In [68]:
Results

Unnamed: 0,PassengerId,Survived
0,1213,0
1,1216,1
2,1280,0
3,948,0
4,1045,1
...,...,...
413,963,0
414,998,0
415,1162,0
416,1240,0
