 # Titanic project

 The notebook demonstrates a complete supervised learning workflow: data preprocessing → feature selection → model training → prediction → evaluation. 

In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Data loading

In [None]:
# Titanic passenger data, read straight from the datasciencedojo/datasets GitHub repository.
titanic_csv_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv'
df = pd.read_csv(titanic_csv_url)

# Display the loaded frame as the cell output.
df

## EDA on the loaded dataset

In [None]:
# Column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# First five rows of the dataset.
df.head(n=5)

In [None]:
# Last five rows of the dataset.
df.tail(n=5)

In [None]:
# Pairwise relationships between columns, coloured by the Survived column.
sb.pairplot(data=df, hue='Survived')

### Exercise

Replace the values of the `Sex` column with numeric values and plot the correlation map for the dataframe.

In [None]:
# Explicit encoding table for the Sex column. Matching is case-insensitive.
# Any value that is not listed here (or is missing) becomes NaN instead of
# being silently counted as 'female', so unexpected categories stay visible.
sex_codes = {'male': 0, 'female': 1}

# assign() returns a new frame, so the original df is left untouched.
df_sex_mapped = df.assign(Sex=df['Sex'].str.lower().map(sex_codes))

df_sex_mapped.head()

### Exercise

What percentage of passengers survived in each Pclass?

In [None]:
columns_to_keep = ['Pclass', 'Survived']

# Select the two columns directly instead of copying the frame and dropping
# everything else. Assuming Survived is a 0/1 flag, its mean per class is the
# fraction of passengers in that class who survived.
df_pclass_survived = df[columns_to_keep].groupby('Pclass').mean()

# The exercise asks for a percentage, so scale the fraction for display only;
# df_pclass_survived itself keeps the raw fractions.
(df_pclass_survived * 100).round(2).rename(columns={'Survived': 'Survived (%)'})

### Exercise

Predict who survives

In [None]:
# Work on an independent copy so the modelling steps do not touch df_sex_mapped.
df_survival_prediction = df_sex_mapped.copy(deep=True)

In [None]:
# Pairwise correlations, restricted to the numeric columns.
df_survival_corr = df_survival_prediction.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix.
sb.heatmap(data=df_survival_corr, annot=True)

In [None]:
# Input columns used to predict the Survived target.
feature = ['Pclass', 'Sex', 'Fare']
x = df_survival_prediction[feature]
y = df_survival_prediction['Survived']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Seed the forest as well: without random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the reported accuracy would
# not be reproducible even though the split is.
model = RandomForestClassifier(random_state=42)

In [None]:
# Train the classifier on the training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Predicted labels for the held-out test rows.
y_pred = model.predict(X=x_test)

In [None]:
# Fraction of test rows predicted correctly. scikit-learn metrics take
# (y_true, y_pred) in that order; accuracy happens to be symmetric, but keeping
# the documented order avoids wrong results if this call is adapted to an
# asymmetric metric such as precision or recall.
accuracy_score(y_test, y_pred)

### Exercise

Plot the distribution of Age and Fare using histograms

In [None]:
# One histogram per column, side by side.
fig, axes = plt.subplots(1, 2, figsize=(30, 10))

for ax, column in zip(axes, ['Age', 'Fare']):
    # dropna() makes the exclusion of any missing values explicit rather than
    # relying on how the plotting backend treats NaN.
    ax.hist(df[column].dropna(), bins=25)
    ax.set_title(column)
    # Label both axes so the figure can be read on its own.
    ax.set_xlabel(column)
    ax.set_ylabel('Number of passengers')