# In [None]:  (notebook cell marker left over from the export)
# Import all essential libraries.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load the train and test datasets (Kaggle Titanic CSV exports).
train = pd.read_csv("traintitanic.csv")
test = pd.read_csv("testtitanic.csv")

# Quick inspection helpers, kept for interactive use:
#   train.head(), train.info(), train.isnull().sum(), train.nunique()

# Handle missing values with imputers.
# Each imputer learns its fill value from the training split only and then
# applies that same value to the test split. Re-fitting on the test split
# would fill it with test statistics and make the two splits inconsistent.

# Embarked: fill gaps with the most frequent training value.
imputer1 = SimpleImputer(strategy='most_frequent')
train[['Embarked']] = imputer1.fit_transform(train[['Embarked']])
test[['Embarked']] = imputer1.transform(test[['Embarked']])

# Age: fill gaps with the training mean.
imputer2 = SimpleImputer(strategy='mean')
train[['Age']] = imputer2.fit_transform(train[['Age']])
test[['Age']] = imputer2.transform(test[['Age']])

# NOTE(review): Fare was not imputed before. It is filled with the training
# median here in case either split has gaps in that column, so no NaN reaches
# the scaler. This is a no-op when Fare is complete. Confirm against the data.
imputer3 = SimpleImputer(strategy='median')
train[['Fare']] = imputer3.fit_transform(train[['Fare']])
test[['Fare']] = imputer3.transform(test[['Fare']])


# Create a new Title column from Name.
# The pattern captures the word directly followed by a period (for example
# "Mr" in "Smith, Mr. John"). It is a raw string so that "\." is a regex
# escape and not an invalid Python string escape.
title_pattern = r' ([A-Za-z]+)\.'
train['Title'] = train['Name'].str.extract(title_pattern, expand=False)
test['Title'] = test['Name'].str.extract(title_pattern, expand=False)

# Collapse the raw titles into five groups: Mr, Miss, Mrs, Master and Rare.
title_map = {'Mr':'Mr','Miss':'Miss','Mrs':'Mrs','Master':'Master','Dr':'Rare',
             'Rev':'Rare','Col':'Rare','Major':'Rare','Mlle':'Miss','Countess':'Rare',
             'Ms':'Miss','Lady':'Rare','Jonkheer':'Rare','Don':'Rare','Dona':'Rare',
             'Capt':'Rare','Sir':'Rare'}

# Series.map returns NaN for any title that is not a key of title_map (and for
# names where no title was found). Put those rows in the 'Rare' group so the
# Title column never carries NaN into the encoding step.
train['Title'] = train['Title'].map(title_map).fillna('Rare')
test['Title'] = test['Title'].map(title_map).fillna('Rare')

# Derive FamilySize for both splits as SibSp + Parch + 1
# (the +1 presumably counts the passenger themselves).
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

# Drop the columns that are not used as model features.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.drop(drop_cols, axis=1)
test_data = test.drop(drop_cols, axis=1)

# Label-encode the categorical columns. The encoder is re-fitted on the
# training values of each column, and that fit is then applied to the same
# column of the test data.
cat_cols = ['Sex', 'Embarked', 'Title']
encoder = LabelEncoder()
for col in cat_cols:
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])
    test_data[col] = encoder.transform(test_data[col])

# Separate the target column from the feature matrix.
y = train['Survived']
X = train.drop('Survived', axis=1)

# Hold out 20% of the labelled rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features. The scaler is fitted on the training portion only
# and then applied unchanged to the hold-out rows and to the test data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
test_scaled = scaler.transform(test_data)

# Random forest: baseline fit with default hyperparameters.
print("Model Training using RandomForestClassifier:")
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)

# Hold-out metrics for the untuned forest.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy before tuning rf:", baseline_accuracy)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print()

# Hyperparameter tuning: 5-fold grid search over forest size and tree depth,
# scored by accuracy.
params = dict(n_estimators=[50, 100], max_depth=[3, 5])
grid1 = GridSearchCV(estimator=rf, param_grid=params, scoring='accuracy', cv=5)
grid1.fit(X_train_scaled, y_train)

best_model = grid1.best_estimator_
print("Best Params:", grid1.best_params_)

# Hold-out metrics for the tuned forest.
y_tune_pred = grid1.predict(X_test_scaled)
tuned_accuracy = accuracy_score(y_test, y_tune_pred)
print("Accuracy after tuning rf:", tuned_accuracy)
print(confusion_matrix(y_test, y_tune_pred))
print(classification_report(y_test, y_tune_pred))
print()
print()

# Logistic regression: baseline fit with default hyperparameters.
print("Model Training using LogisticRegression:")
LR = LogisticRegression(random_state=42)
LR.fit(X_train_scaled, y_train)
y_pred = LR.predict(X_test_scaled)

# Hold-out metrics before tuning: accuracy, then confusion matrix and report.
print("Accuracy before tuning LR:", accuracy_score(y_test, y_pred))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))
print()

# Hyperparameter tuning: 5-fold grid search over penalty type, regularization
# strength C and solver, scored by accuracy.
params = {
    "penalty": ["l1", "l2"],
    "C": [0.1, 1],
    "solver": ["liblinear", "saga"],
}
grid2 = GridSearchCV(LR, param_grid=params, scoring='accuracy', cv=5)
grid2.fit(X_train_scaled, y_train)

print("Best Params:", grid2.best_params_)
best_model = grid2.best_estimator_

# Hold-out metrics after tuning.
y_tune_pred = grid2.predict(X_test_scaled)
print("Accuracy after tuning LR:", accuracy_score(y_test, y_tune_pred))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_tune_pred))
print("\n")

# Model training using XGBoost: baseline fit with default hyperparameters.
print("Model Training using XGboost:")
XGB = XGBClassifier(random_state=42)
XGB.fit(X_train_scaled, y_train)
y_pred = XGB.predict(X_test_scaled)

# Evaluating model performance before tuning
print("Accuracy before tuning XGB:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print()

# Hyperparameter tuning: 5-fold grid search over the number of boosting
# rounds, learning rate and tree depth, scored by accuracy.
params = {"n_estimators":[50,100],
          "learning_rate":[0.1,1],
          "max_depth":[3,5]}

grid3 = GridSearchCV(XGB, params, cv=5, scoring='accuracy')
grid3.fit(X_train_scaled, y_train)

print("Best Params:", grid3.best_params_)
best_model = grid3.best_estimator_

y_tune_pred=grid3.predict(X_test_scaled)
# Evaluating model performance after tuning.
# The label now says XGB; it previously said "LR", which misattributed this
# score to the logistic regression model.
print("Accuracy after tuning XGB:", accuracy_score(y_test, y_tune_pred))
print(confusion_matrix(y_test,y_tune_pred))
print(classification_report(y_test, y_tune_pred))
print()
print()
# Predict survival for a single hand-written passenger record, given in the
# same raw column layout as the CSV files (minus PassengerId).
new_data = pd.DataFrame({
    'Pclass':[3],
    'Name':["John Smith"],
    'Sex':["male"],
    'Age':[28],
    'SibSp':[0],
    'Parch':[0],
    'Ticket':["A/5 21171"],
    'Fare':[7.25],
    'Cabin':[None],
    'Embarked':["S"]
})

# Extract Title with the same pattern used on the training data. The pattern
# is a raw string, and expand=False returns a Series rather than a DataFrame.
new_data['Title'] = new_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Group the title with the training title_map so that, for example, 'Dr' lands
# in 'Rare' exactly as it did during training. Fall back to 'Mr' when no known
# title is found (as for "John Smith").
new_data['Title'] = new_data['Title'].map(title_map).fillna('Mr')

# Create FamilySize
new_data['FamilySize'] = new_data['SibSp'] + new_data['Parch'] + 1

# Encode the categorical columns with the SAME integer codes the model was
# trained on. Fitting a fresh LabelEncoder on this single row would map every
# value to 0 regardless of its meaning. The encoders fitted during training
# were not kept per column, so the label -> code mapping is recovered by
# pairing the raw labels still present in `test` with their encoded
# counterparts in `test_data` (same rows, same order).
for col in ['Sex', 'Embarked', 'Title']:
    label_to_code = dict(zip(test[col], test_data[col]))
    new_data[col] = new_data[col].map(label_to_code)
    if new_data[col].isna().any():
        raise ValueError(
            f"Cannot encode column {col!r}: value has no known training code"
        )

# Keep only the model features, in the exact column order the scaler was
# fitted on. This also drops Name, Ticket and Cabin.
new_data = new_data[X.columns]

X_new_scaled = scaler.transform(new_data)
prediction = rf.predict(X_new_scaled)

print("Predicted Survival:", prediction[0])  # 1 = survived, 0 = did not survive

# Survival vs Age: histogram of Age (20 bins), coloured by Survived.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=train, x="Age", hue="Survived", bins=20, ax=ax)
ax.set_title("Survival across age")
ax.set_xlabel("Age")
ax.set_ylabel("count")
fig.savefig("survival_vs_age.png")
plt.show()

# Survival vs gender: passenger counts per Sex value, coloured by Survived.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train, x="Sex", hue="Survived", ax=ax)
ax.set_title("Survival count by gender")
ax.set_xlabel("sex")
ax.set_ylabel("count")
fig.savefig("survival_vs_gender.png")
plt.show()

# Survival vs Pclass: passenger counts per Pclass value, coloured by Survived.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train, x="Pclass", hue="Survived", ax=ax)
ax.set_title("Survival count by Pclass")
ax.set_xlabel("Pclass")
ax.set_ylabel("count")
fig.savefig("survival_vs_Pclass.png")
plt.show()

# Age vs Survival: split violin comparing the Age distribution of the two
# Survived groups.
plt.figure(figsize=(6,4))
sns.violinplot(data=train,x="Age",hue="Survived",split=True)
plt.title("Survival across age")
plt.xlabel("Age")
# A violin's width is a kernel density estimate, not a count of passengers,
# so the axis is labelled "density" instead of "count".
plt.ylabel("density")
plt.savefig("age_vs_survival.png")
plt.show()

# Offer each saved figure as a browser download (Google Colab only).
from google.colab import files

for png_name in (
    'survival_vs_age.png',
    'survival_vs_gender.png',
    'survival_vs_Pclass.png',
    'age_vs_survival.png',
):
    files.download(png_name)