In [None]:
import pandas as pd
import numpy as np 
import re 
import sklearn
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:

# Load the Titanic passenger table from the local archive folder and preview its first rows.
df = pd.read_csv(filepath_or_buffer='archive/Titanic-Dataset.csv')
print(df.head(n=5))

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Show the column labels of the loaded frame (the array behind df.columns).
df.columns.values


In [None]:
# How many rows fall under each value of the target column.
df.loc[:, 'Survived'].value_counts()

In [None]:
survived = 'survived'
not_survived = 'not survived'

women = df[df['Sex'] == 'female']
men = df[df['Sex'] == 'male']

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# One panel per sex. Within a panel the survivors' ages are drawn first
# (18 bins) and the non-survivors' ages second (40 bins) on the same axes.
for panel, (panel_title, group) in zip(axes, (('Female', women), ('Male', men))):
    for outcome, n_bins, outcome_label in ((1, 18, survived), (0, 40, not_survived)):
        ages = group.loc[group['Survived'] == outcome, 'Age'].dropna()
        ax = sns.histplot(ages, bins=n_bins, label=outcome_label, ax=panel, kde=False)

    # Legend entries are listed explicitly, in drawing order.
    ax.legend(title="Survival Status", labels=[survived, not_survived])
    ax.set_title(panel_title)


In [None]:
# Per-column missing-value report: absolute count and share of rows in percent.
null_mask = df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
miss_val = null_counts / null_mask.count() * 100
miss_val2 = miss_val.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, miss_val2], axis=1)
missing_data.columns = ['total', '%']
missing_data.head(5)

In [None]:
# Column dtypes / non-null counts, followed by the (rows, columns) tuple.
df.info()
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Correlation matrix of the two continuous columns.
age_fare = df.loc[:, ['Age', 'Fare']]
correlation = age_fare.corr()
print(correlation)

# Row counts for every (passenger class, survival outcome) combination.
cross_tab = pd.crosstab(index=df['Pclass'], columns=df['Survived'])
print(cross_tab)

Data Preprocessing

In [None]:
# Remove the row identifier and the cabin column before modelling.
df = df.drop(columns=['PassengerId', 'Cabin'])

In [None]:
# Only a cell's last expression is auto-displayed, so the summary is printed
# explicitly; as a bare statement its result would be silently discarded.
print(df['Embarked'].describe())
df['Embarked'].head(50)

In [None]:
# Replace missing embarkation ports with the fixed label 'S'.
common_value = 'S'
embarked_filled = df['Embarked'].fillna(value=common_value)
df['Embarked'] = embarked_filled
df['Embarked'].head(50)

In [None]:
# Encode the embarkation port letters as integer codes.
ports = {"S": 0, "C": 1, "Q": 2}

# Series.map turns every value that is not a key of the dict into NaN, so
# mapping a column that already holds the integer codes would blank it out.
# Skip the mapping in that case so the cell can be re-run safely.
already_encoded = df['Embarked'].dropna().isin(list(ports.values())).all()
if not already_encoded:
    df['Embarked'] = df['Embarked'].map(ports)

df['Embarked'].info()
df.head()

In [None]:
# Missing fares become 0, then the column is cast to integers
# (the cast drops the fractional part of each fare).
df['Fare'] = df['Fare'].fillna(0).astype(int)
df.info()

In [None]:
# Convert 'Sex' into a numeric value (male -> 0, female -> 1).
genders = {'male': 0, 'female': 1}

# Series.map turns every value that is not a key of the dict into NaN, so
# mapping a column that already holds 0/1 would silently blank it out.
# Skip the mapping in that case so the cell can be re-run safely.
already_encoded = df['Sex'].dropna().isin(list(genders.values())).all()
if not already_encoded:
    df['Sex'] = df['Sex'].map(genders)

In [None]:
# Fill missing ages with random whole numbers drawn from about one standard
# deviation around the mean age: low = int(mean - std) inclusive,
# high = int(mean + std) exclusive.
#
# The statistics are taken once, before imputation, so the printed values
# describe the data the draw was based on. A fixed seed makes the filled-in
# ages identical on every run.
AGE_IMPUTE_SEED = 42

missing_age = df['Age'].isnull()
is_null = int(missing_age.sum())
mean = df['Age'].mean()
std = df['Age'].std()

if is_null:
    rng = np.random.default_rng(AGE_IMPUTE_SEED)
    low = int(mean - std)
    # Keep the half-open interval non-empty even when the spread is tiny.
    high = max(int(mean + std), low + 1)
    random_age = rng.integers(low, high, size=is_null)
    # Write through .loc so the frame itself is updated, not a temporary copy.
    df.loc[missing_age, 'Age'] = random_age
else:
    # Nothing to fill (e.g. the cell was already run once).
    random_age = np.array([], dtype=int)

print(mean, std, random_age)

In [None]:

# Label passengers younger than 18 as 'Kid' and everyone else as 'Adult'.
df['age_category'] = np.where(df['Age'] < 18, 'Kid', 'Adult')
df.head(10)

In [None]:
# Per-column missing-value report: absolute count and share of rows in percent.
null_mask = df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
miss_val = null_counts / null_mask.count() * 100
miss_val2 = miss_val.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, miss_val2], axis=1)
missing_data.columns = ['total', '%']
missing_data.head(5)

In [None]:
# Print the summary explicitly: it is not the last expression of the cell, so
# as a bare statement its result would be silently discarded.
print(df['Ticket'].describe())

# The ticket column is removed before modelling.
df = df.drop(['Ticket'], axis=1)

In [None]:
# Peek at the first five values of the SibSp column.
df['SibSp'].head(n=5)

In [None]:
# Model inputs (X) and the target column (Y).
feature = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
X = df.loc[:, feature]
Y = df.loc[:, 'Survived']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)
X_train, X_test, Y_train, Y_test = split_parts


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters, trained on the training split.
# fit() returns the estimator itself, so the fitted model is shown as the cell output.
model = RandomForestClassifier(random_state=4)
fitted = model.fit(X_train, Y_train)
fitted


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the baseline model on the held-out rows.
Y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
report = classification_report(y_true=Y_test, y_pred=Y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)


In [None]:
# Hyperparameter tuning: exhaustive grid search with 5-fold cross-validation
# over the number of trees and the maximum tree depth.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=4),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, Y_train)

# Estimator chosen by the search.
best_model = grid_search.best_estimator_
print(best_model)


In [None]:

# Score the tuned model on the same held-out rows as the baseline.
Y_pred = best_model.predict(X_test)
accuracy2 = accuracy_score(y_true=Y_test, y_pred=Y_pred)
tuned_report = classification_report(y_true=Y_test, y_pred=Y_pred)

print(f'Accuracy: {accuracy2:.2f}')
print(tuned_report)

In [None]:
# Example: predict the outcome for one hand-written passenger.
# Values follow the order of `feature`: Pclass=3, Sex=0, Age=25, SibSp=0, Parch=0.
passenger_values = [3, 0, 25, 0, 0]
new_passenger_data = pd.DataFrame([passenger_values], columns=feature)

prediction = best_model.predict(new_passenger_data)
print(f'Predicted Survival: {prediction[0]}')
