In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

**LOADING DATA**

In [None]:
# Read the train and test CSV files into DataFrames (same reader, two paths).
train_data, test_data = map(
    pd.read_csv,
    ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv'),
)

**DATA PREPROCESSING**

In [None]:
# Remove the Cabin column from the training frame (mutates train_data in place).
train_data.drop(columns='Cabin', inplace=True)

In [None]:
# Remove the Cabin column from the test frame (mutates test_data in place).
test_data.drop(columns='Cabin', inplace=True)

In [None]:
# Imputer that replaces missing values with the median of the column it is fitted on.
imputer = SimpleImputer(
    strategy='median',
)

In [None]:
# Learn the median age from the training set only, then reuse that same
# statistic for the test set. Re-fitting on the test set would fill test ages
# with a different median than the one the model was trained with.
# fit_transform/transform return an (n, 1) array by default, so flatten it
# before assigning to a single column.
train_data['Age'] = imputer.fit_transform(train_data[['Age']]).ravel()
test_data['Age'] = imputer.transform(test_data[['Age']]).ravel()

In [None]:
# Encode Sex as an integer flag; values not in the mapping become NaN.
sex_codes = {'male': 0, 'female': 1}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [None]:
# One-hot encode the Embarked column of each frame, replacing it with
# one indicator column per observed value.
train_data, test_data = (
    pd.get_dummies(frame, columns=["Embarked"])
    for frame in (train_data, test_data)
)

**FEATURE EXTRACTION**

In [None]:
def extract_title(name):
    """Return the honorific from a name formatted like 'Surname, Title. Given names'.

    The title is the text between the first comma and the next period inside
    the second comma-separated segment, with surrounding whitespace removed.
    Raises IndexError when ``name`` contains no comma.
    """
    after_surname = name.split(",", 2)[1]
    title_part = after_surname.split(".", 1)[0]
    return title_part.strip()

In [None]:
# Derive a Title column from each passenger's Name in both frames.
for frame in (train_data, test_data):
    frame['Title'] = frame['Name'].apply(extract_title)

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 counts the passenger themselves).
for frame in (train_data, test_data):
    relatives_aboard = frame['SibSp'] + frame['Parch']
    frame['FamilySize'] = relatives_aboard + 1

In [None]:
# Bucket Age into four labelled bands using the same fixed edges for both
# frames. pd.cut intervals are right-closed by default: (0, 18], (18, 30],
# (30, 50], (50, inf).
age_bin_edges = [0, 18, 30, 50, float("inf")]
age_bin_labels = ["Child", "Young", "Adult", "Elderly"]
train_data["AgeGroup"] = pd.cut(train_data["Age"], bins=age_bin_edges, labels=age_bin_labels)
test_data["AgeGroup"] = pd.cut(test_data["Age"], bins=age_bin_edges, labels=age_bin_labels)

In [None]:
# Bucket Fare into quartile bands. The quartile edges are learned from the
# training set and reused for the test set, so a given label ("Low", ...)
# covers the same fare range in both frames. Computing separate quartiles on
# the test set would make the labels mean different things in each frame.
fare_labels = ["Low", "Medium", "High", "Very High"]
train_data["FareGroup"], fare_edges = pd.qcut(
    train_data["Fare"], q=4, labels=fare_labels, retbins=True
)
# Open the outer edges so test fares below the training minimum or above the
# training maximum still land in the first/last band instead of becoming NaN.
fare_edges[0], fare_edges[-1] = -float("inf"), float("inf")
test_data["FareGroup"] = pd.cut(test_data["Fare"], bins=fare_edges, labels=fare_labels)

**TRAIN-TEST SPLIT**

In [None]:
# Build the feature matrix by discarding the identifier, the target, the raw
# text columns and the derived categorical columns; the target is Survived.
non_feature_cols = ["PassengerId", "Survived", "Name", "Ticket", "Title", "AgeGroup", "FareGroup"]
X = train_data.drop(columns=non_feature_cols)
Y = train_data['Survived']

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

**MODEL SELECTION**

In [None]:
# Baseline random forest with library defaults and a fixed seed, fitted on
# the training portion of the split.
model = RandomForestClassifier(
    random_state=42,
)
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict labels for the held-out validation rows with the baseline model.
Y_pred = model.predict(X=X_test)

In [None]:
# Compute the validation metrics first, then print them in a fixed order.
val_accuracy = accuracy_score(Y_test, Y_pred)
val_report = classification_report(Y_test, Y_pred)
val_confusion = confusion_matrix(Y_test, Y_pred)

print("Validation Accuracy:", val_accuracy)
print(val_report)
print("Confusion Matrix:")
print(val_confusion)

**FINAL MODEL AFTER HYPERPARAMETER TUNING**

In [None]:
# Random forest with explicit hyperparameters (100 trees, depth capped at 8,
# fixed seed), fitted on the training portion of the split.
final_params = dict(n_estimators=100, max_depth=8, random_state=42)
final_model = RandomForestClassifier(**final_params)
final_model.fit(X_train, Y_train)

In [None]:
# Fill missing fares instead of dropping rows: every row of the test set must
# receive a prediction, and dropping rows would leave those passengers out of
# the submission file. The training median is used so the fill value comes
# from the same data the model was fitted on.
# NOTE(review): assumes Fare is the only feature column that still has
# missing values at this point -- confirm with test_data.isna().sum().
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].median())

In [None]:
# Build the feature matrix for the competition test set under its own name.
# Reusing X_test would overwrite the validation split, so re-running the
# validation cells afterwards would compare Y_test against predictions for a
# different set of rows.
X_submission = test_data.drop(
    columns=["PassengerId", "Name", "Ticket", "Title", "AgeGroup", "FareGroup"]
)
# Align to the exact columns (and order) the model was fitted on. The one-hot
# encoding was done separately per frame, so an indicator column can be absent
# from the test set; such a column is filled with 0 ("category not present").
X_submission = X_submission.reindex(columns=X_train.columns, fill_value=0)
y_pred_test = final_model.predict(X_submission)

# One row per test passenger: identifier plus predicted Survived label.
submission = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Survived": y_pred_test})
submission.to_csv("submission.csv", index=False)