In [None]:
import numpy as np
import pandas as pd

# Load the raw training data; every later cell works off this frame.
df = pd.read_csv(
    'train.csv'
)

In [None]:
# Overall survival percentage: mean of the 0/1 `Survived` column, scaled to 0-100.
survival_rate = df.Survived.mean()*100
# `.2f` = two decimal places. The previous `.2` spec meant "two significant
# digits", which silently truncated the value (e.g. 38.38 -> 38).
print(f'percentage of the suvivor: {survival_rate:.2f}% ')

In [None]:
# Average passenger age. Series.mean() skips missing values by default, so
# rows without a recorded Age do not affect the result.
average_age= df.Age.mean()
# `.2f` = two decimal places. The previous `.2` spec meant "two significant
# digits" and rendered values >= 10 in scientific notation (e.g. 3e+01).
print(f'average age of the passenger : {average_age:.2f}')

In [None]:
# Flag children: True when the passenger is younger than 18.
# The comparison was previously `> 18`, which marked adults as children and
# inverted every statistic grouped on this column.
# NOTE: a missing Age compares as False, so passengers with unknown age are
# counted as non-children here.
df['ischild'] = df['Age'] < 18
df['ischild'].value_counts()

In [None]:
# Mean of `Survived` within each group is that group's survival rate.
# First split by Sex, then by the `ischild` flag to check whether children
# fared better.
for group_column in ('Sex', 'ischild'):
    if group_column == 'ischild':
        # Blank separator line between the two printed tables.
        print()
    group_rate = df.groupby(group_column)['Survived'].mean()
    print(group_rate)

In [None]:
# Impute missing ages with the median of the recorded ages.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(value=median_age)

# Sanity check: number of nulls left in Age after the fill.
remaining_age_nulls = df['Age'].isna().sum()
print(remaining_age_nulls)

In [None]:
# Survival rate cross-tabulated by ticket class (rows) and sex (columns).
pivot = pd.pivot_table(
    df,
    values='Survived',
    index='Pclass',
    columns='Sex',
    aggfunc='mean',
)
print(pivot)

In [None]:
# FamilySize = the passenger (1) + siblings/spouses (SibSp) + parents/children (Parch)
df['FamilySize'] = 1 + df['SibSp'] + df['Parch']

# Survival rate for each family size
survival_by_family_size = df.groupby('FamilySize')['Survived'].mean()
print(survival_by_family_size)

In [None]:
# Flag passengers whose name contains the title 'Master' or 'Dr.'.
# str.contains treats the pattern as a regular expression by default.
df['IsMaster'] = df['Name'].str.contains('Master')
# Raw string so `\.` reaches the regex engine as an escaped literal dot.
# In a normal string `\.` is an invalid escape sequence, which newer Python
# versions warn about.
df['IsDr'] = df['Name'].str.contains(r'Dr\.')

# Survival rate with/without each title (True = name carries the title)
print("Survival rate for 'Master':")
print(df.groupby('IsMaster')['Survived'].mean())

print("\nSurvival rate for 'Doctor':")
print(df.groupby('IsDr')['Survived'].mean())

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Survival rate per ticket class, drawn as a bar chart.
survival_by_class = df.groupby('Pclass')['Survived'].mean()

# One shade of red per class bar
class_colors = ['darkred', 'indianred', 'lightcoral']
survival_by_class.plot(kind='bar', color=class_colors)

# Labelling; rotation=0 keeps the class numbers upright on the x axis
plt.xticks(rotation=0)
plt.xlabel('Class (1 = Elite, 3 = Economy)')
plt.ylabel('Survival Probability')
plt.title('Survival Rate by Ticket Class')

plt.show()

In [None]:
# Deck = first character of the Cabin string; passengers with no recorded
# cabin get the placeholder 'U'.
cabin_initial = df['Cabin'].str.get(0)
df['Deck'] = cabin_initial.fillna('U')

# Survival rate per deck
deck_survival = df.groupby('Deck')['Survived'].mean()
print(deck_survival)

In [None]:
# Most frequent embarkation port (first entry of the mode)
embarked_modes = df['Embarked'].mode()
common_port = embarked_modes[0]

# Replace missing ports with that most frequent value
df['Embarked'] = df['Embarked'].fillna(value=common_port)

print(f"Filled missing ports with: {common_port}")

In [None]:
# Age bands: pd.cut uses right-closed intervals by default, so the bands are
# (0, 12], (12, 18], (18, 35], (35, 60], (60, 100].
bins = [0, 12, 18, 35, 60, 100]
labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']

# Create the AgeGroup column (a categorical with the labels above)
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

# Survival rate per age band.
# `observed=False` is passed explicitly: grouping on a categorical key without
# it raises a FutureWarning on recent pandas, and the default is changing.
# Pinning it keeps every band in the output, even one with no passengers.
print(df.groupby('AgeGroup', observed=False)['Survived'].mean())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean survival per ticket class, with one bar per sex inside each class.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=df,
    x='Pclass',
    y='Survived',
    hue='Sex',
)
plt.title('Survival Rate by Class and Sex')
plt.show()

In [None]:
# Correlation is only defined for numeric data, so keep just the numeric columns
numerical_df = df.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric columns
corr_matrix = numerical_df.corr()

# Visualize it with a Heatmap
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the correlation matrix, each cell annotated to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.title('Correlation Heatmap of Titanic Features')
plt.show()

In [None]:
# Compare the fare distribution of survivors against those who perished.
plt.figure(figsize=(10, 6))
for survived_flag, curve_label in ((1, 'Survived'), (0, 'Perished')):
    fares = df.loc[df['Survived'] == survived_flag, 'Fare']
    sns.kdeplot(fares, label=curve_label, fill=True)
plt.title('Fare Distribution by Survival')
plt.xlabel('Fare')
# Clip the x axis at 300 so the bulk of the distribution is readable
plt.xlim(0, 300)
plt.legend()
plt.show()

In [None]:
# One-hot encode the categorical columns Sex and Embarked.
# drop_first=True drops one indicator column per feature, leaving the
# remaining level(s) as dummy columns.
categorical_columns = ['Sex', 'Embarked']
df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

print("New columns after encoding:")
print(df_encoded.columns)

In [None]:
# Persist the cleaned, feature-engineered frame (index column omitted).
df_encoded.to_csv(
    'titanic_cleaned_day3.csv',
    index=False,
)
print("Cleaned data saved as titanic_cleaned_day3.csv!")

In [None]:
# Reload the encoded data saved above into `df_encoded`, the name every
# modelling cell below reads. It was previously loaded into `df`, which
# overwrote the feature-engineered frame and left the reload unused; the
# modelling section also could not start from a fresh kernel.
df_encoded = pd.read_csv('titanic_cleaned_day3.csv')

In [None]:
# Model inputs: the numeric / one-hot columns of the encoded dataframe.
features = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
    'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S',
]

# Target column, then the feature matrix
y = df_encoded['Survived']
X = df_encoded[features]

# Replace any remaining NaNs with the per-column median before fitting
column_medians = X.median()
X = X.fillna(column_medians)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_name, split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} samples: {len(split_frame)}")

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model.
# max_depth=3 keeps the tree small and easy to read.
# random_state is pinned (same seed as the split and the forest elsewhere in
# this notebook) so that re-running the notebook reproduces the same tree.
model = DecisionTreeClassifier(max_depth=3, random_state=42)

# Train the model on the training split only
model.fit(X_train, y_train)

print("Model training complete!")

In [None]:
from sklearn.metrics import accuracy_score

# Predict survival for the held-out test rows
predictions = model.predict(X_test)

# accuracy_score returns a fraction between 0 and 1. The `%` format spec
# multiplies by 100 and appends the percent sign, so the printed number is a
# real percentage (previously the raw fraction was shown with a literal '%').
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy:.2%}")

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree; nodes are filled with a colour per class.
plt.figure(figsize=(20, 10))
plot_tree(
    model,
    feature_names=features,
    class_names=['Perished', 'Survived'],
    filled=True,
)
plt.show()

In [None]:
# Next we use a Random Forest: a single decision tree tends to overfit the training data, so it may not perform well on unseen data.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: 100 trees (n_estimators), each limited to depth 5,
# with a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

# Fit on the training split
rf_model.fit(X_train, y_train)

# Predict on the held-out test split
rf_predictions = rf_model.predict(X_test)

print("Random Forest training complete!")

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the Random Forest on the held-out test split (fraction in 0-1)
rf_accuracy = accuracy_score(y_test, rf_predictions)
# Re-print the earlier decision-tree score for comparison. `accuracy` is a
# 0-1 fraction, so use the `%` format spec to show a real percentage.
print(f"Day 4 Decision Tree Accuracy: {accuracy:.2%}")

In [None]:
# `rf_accuracy` is a 0-1 fraction; the `%` format spec scales it by 100 and
# appends the percent sign.
print(f"Random Forest Accuracy: {rf_accuracy:.2%}")

In [None]:
# Importance score per feature from the fitted forest, largest first
importances = (
    pd.Series(rf_model.feature_importances_, index=features)
    .sort_values(ascending=False)
)

# Horizontal bar chart of the scores
importances.plot.barh(color='teal')
plt.title('Which features mattered most to the Random Forest?')
plt.show()

In [None]:
# Hypothetical passenger to score. Values are given in the order of `features`:
# [Pclass, Age, SibSp, Parch, Fare, FamilySize, Sex_male, Embarked_Q, Embarked_S]
# The row is wrapped in a DataFrame with the training column names: the model
# was fitted on a named DataFrame, and a bare list triggers scikit-learn's
# "X does not have valid feature names" warning and gives no protection
# against columns being supplied in the wrong order.
my_data = pd.DataFrame(
    [[1, 25, 0, 0, 100, 1, 1, 0, 1]],
    columns=features,
)

prediction = rf_model.predict(my_data)

# predict returns one label per input row; 1 = survived, 0 = did not survive.
# The emoji were mojibake (UTF-8 bytes decoded with the wrong codec) and are
# restored here.
if prediction[0] == 1:
    print("The model predicts: You Survived! 🚢")
else:
    print("The model predicts: You did not survive. 🌊")

In [None]:
# Write the encoded table (with all engineered features) to disk, without the index.
df_encoded.to_csv(
    'titanic_processed_data.csv',
    index=False,
)

print("Processed data saved locally as CSV!")