In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style (white background with grid lines) to all later plots
sns.set_style(style='whitegrid')

In [None]:
# Clone the repository that contains the Titanic CSV read later in this notebook
!git clone 'https://github.com/GeeksforgeeksDS/21-Days-21-Projects-Dataset'

In [None]:
# Read the Titanic CSV from the locally cloned dataset repository
csv_path = '/content/21-Days-21-Projects-Dataset/Datasets/Titanic-Dataset.csv'
titanic_df = pd.read_csv(csv_path)

# Preview the first 5 rows (rendered as the cell output)
print("First 5 rows of the dataset")
titanic_df.head(n=5)


In [None]:
# Preview the last 5 rows of the dataset
titanic_df.tail(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
titanic_df.shape

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes
print("\n Dataset Information:")
titanic_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numerical columns
print("\nDescriptive Statistics:")
titanic_df.describe()

In [None]:
# Frequency of each distinct 'Cabin' value (missing values are excluded by default)
titanic_df['Cabin'].value_counts()

In [None]:
# Number of missing values in each column
titanic_df.isnull().sum()

In [None]:
# Median of the 'Age' column (pandas skips NaN values by default)
median = titanic_df['Age'].median()
print(median)

In [None]:
# 1. Handle missing 'Age' values
# The median is used as the fill value because the age distribution can be skewed.
age_column = titanic_df['Age']
median_age = age_column.median()
titanic_df['Age'] = age_column.fillna(median_age)

# Show the per-column missing-value counts after filling 'Age'
print("Missing values after Age cleaning:")
print(titanic_df.isnull().sum())

In [None]:
# Most frequent value of 'Embarked' (first entry if several values tie)
mode = titanic_df['Embarked'].mode().iloc[0]
print(mode)

In [None]:
# 2. Handle missing 'Embarked' values
# Since there are only two missing values, fill them with the most common
# port of embarkation (the mode).
embarked_column = titanic_df['Embarked']
mode_embarked = embarked_column.mode().iloc[0]
titanic_df['Embarked'] = embarked_column.fillna(mode_embarked)

# Show the missing-value counts for the columns examined so far
print("Missing values after Embarked cleaning:")
checked_columns = ['Age', 'Embarked', 'Cabin']
print(titanic_df[checked_columns].isnull().sum())

In [None]:
# 3. Handle the 'Cabin' column
# With over 77% of values missing, imputing is not a good idea. Instead, derive a
# binary 'Has_Cabin' feature and drop the original column.
# The guard keeps this cell safe to re-run: once 'Cabin' has been dropped, running
# the cell again is a no-op instead of raising a KeyError.
if 'Cabin' in titanic_df.columns:
    titanic_df['Has_Cabin'] = titanic_df['Cabin'].notna().astype(int)  # 1 if a cabin is recorded, 0 if not
    titanic_df = titanic_df.drop(columns='Cabin')  # reassign instead of mutating in place

In [None]:
# Number of passengers with (1) and without (0) a recorded cabin
titanic_df['Has_Cabin'].value_counts()

In [None]:
# Preview the first 5 rows of the current DataFrame
titanic_df.head()

In [None]:
# Confirm the per-column missing-value counts after the cleaning steps
print("Missing Values after cleaning:")
titanic_df.isnull().sum()

In [None]:
print("Analyzing categorical features: ")

# One (column, panel title) pair per subplot, in row-major order of the 2x3 grid
categorical_panels = [
    ('Survived', 'Survival Distribution'),
    ('Pclass', 'Passenger Class Distribution'),
    ('Sex', 'Gender Distribution'),
    ('Embarked', 'Port of Embarkation'),
    ('SibSp', 'Siblings/Spouses Aboard'),
    ('Parch', 'Parents/Children Aboard'),
]

# Set up the figure for plotting
fig, axes = plt.subplots(2, 3, figsize=(18, 22))
fig.suptitle('Univariate Analysis of Categorical Features', fontsize=16)

# Draw one count plot per categorical feature
for ax, (column, panel_title) in zip(axes.flat, categorical_panels):
    sns.countplot(ax=ax, x=column, data=titanic_df).set_title(panel_title)

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
print("\nAnalyzing numerical features:")

# (column, number of bins, panel title) for each histogram, left to right
numerical_panels = [
    ('Age', 30, 'Age Distribution'),
    ('Fare', 40, 'Fare Distribution'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Univariate Analysis of Numerical Features', fontsize=16)

# Histogram with a KDE overlay for each numerical feature
for ax, (column, n_bins, panel_title) in zip(axes, numerical_panels):
    sns.histplot(ax=ax, data=titanic_df, x=column, kde=True, bins=n_bins).set_title(panel_title)

plt.show()

In [None]:
print("Bivariate Analysis: Feature vs. Survival")

# One (feature, panel title) pair per subplot, in row-major order of the 2x2 grid.
# Each bar shows the mean of 'Survived' (i.e. the survival rate) for that category.
survival_panels = [
    ('Pclass', 'Survival Rate by Pclass'),
    ('Sex', 'Survival Rate by Sex'),
    ('Embarked', 'Survival Rate by Port'),
    ('Has_Cabin', 'Survival Rate by Cabin Availability'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Bivariate Analysis with Survival', fontsize=16)

for ax, (feature, panel_title) in zip(axes.flat, survival_panels):
    sns.barplot(ax=ax, x=feature, y='Survived', data=titanic_df).set_title(panel_title)

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Age vs Survival: one histogram (with KDE overlay) per survival outcome
g = sns.FacetGrid(titanic_df, col='Survived', height=6)
g.map(sns.histplot, 'Age', bins=25, kde=True)

# Reserve space above the facets so the figure-level title does not overlap the
# per-facet "Survived = ..." titles, and set the title on the grid's own figure
# instead of relying on pyplot's implicit "current figure".
g.figure.subplots_adjust(top=0.88)
g.figure.suptitle('Age Distribution by Survival', fontsize=16)
plt.show()

In [None]:
# Box plot of ticket fares to expose the spread and outliers
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(y='Fare', data=titanic_df, ax=ax)
ax.set_title('Box Plot of Ticket Fare')
ax.set_ylabel('Fare')
plt.show()

In [None]:
# 1. Create a 'FamilySize' feature: siblings/spouses + parents/children + the passenger themselves
relatives_aboard = titanic_df['SibSp'] + titanic_df['Parch']
titanic_df['FamilySize'] = relatives_aboard + 1

# 2. Create an 'IsAlone' feature: 1 when the passenger is the only family member aboard, else 0
titanic_df['IsAlone'] = (titanic_df['FamilySize'] == 1).astype(int)

print("Created 'FamilySize' and 'IsAlone' features:")
titanic_df[['FamilySize', 'IsAlone']].head()



In [None]:
# Compare the new family-related features against survival, one panel per feature
fig, (family_size_ax, is_alone_ax) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: survival rate for each family size
family_size_plot = sns.barplot(ax=family_size_ax, x='FamilySize', y='Survived', data=titanic_df)
family_size_plot.set_title('Survival Rate by Family Size')

# Right panel: survival rate for passengers travelling alone vs. with family
is_alone_plot = sns.barplot(ax=is_alone_ax, x='IsAlone', y='Survived', data=titanic_df)
is_alone_plot.set_title('Survival Rate for Those Traveling Alone')

plt.show()

In [None]:
# 3. Extract 'Title' from the 'Name' column
# The pattern captures the run of letters between a space and a period (e.g. "Mr" in " Mr.")
title_pattern = r' ([A-Za-z]+)\.'
titanic_df['Title'] = titanic_df['Name'].str.extract(title_pattern, expand=False)

# Show how often each extracted title occurs
print("Extracted Titles:")
titanic_df['Title'].value_counts()

In [None]:
# Simplify the titles: rare ones collapse into 'Rare', and spelling variants
# are folded into their common equivalents
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_mapping = dict.fromkeys(rare_titles, 'Rare')
title_mapping.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
titanic_df['Title'] = titanic_df['Title'].replace(title_mapping)

# Survival rate for each of the cleaned title groups
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Title', y='Survived', data=titanic_df, ax=ax)
ax.set_title('Survival Rate by Title')
ax.set_ylabel('Survival Probability')
plt.show()

In [None]:
# Survival rate by passenger class, split by sex
pclass_sex_grid = sns.catplot(
    x='Pclass',
    y='Survived',
    hue='Sex',
    data=titanic_df,
    kind='bar',
    height=6,
    aspect=1.5,
)
# The grid has a single facet, so its only Axes carries the title and y-label
pclass_sex_grid.ax.set_title('Survival Rate by Pclass and Sex')
pclass_sex_grid.ax.set_ylabel('Survival Probability')
plt.show()

# Insight: females in all classes had a significantly higher survival rate than males.

In [None]:
# Split violin plot: age distribution for each sex, with one half per survival outcome
survival_palette = {0: 'blue', 1: 'orange'}
fig, ax = plt.subplots(figsize=(14, 8))
sns.violinplot(
    x='Sex',
    y='Age',
    hue='Survived',
    data=titanic_df,
    split=True,
    palette=survival_palette,
    ax=ax,
)
ax.set_title('Age Distribution by Sex and Survival')
plt.show()

In [None]:
# Correlation heatmap over every numeric column of the DataFrame
numeric_cols = titanic_df.select_dtypes(include='number')
correlation_matrix = numeric_cols.corr()

fig, ax = plt.subplots(figsize=(14, 10))
# Annotate each cell with its coefficient, rounded to two decimals
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
import pandas as pd

# Build a small demonstration DataFrame with an integer, a string and a boolean column
sample_data = dict(
    col1=[1, 2, 3, 4],
    col2=['A', 'B', 'C', 'D'],
    col3=[True, False, True, False],
)
sample_df = pd.DataFrame(sample_data)

# Render the sample DataFrame with the notebook's rich display
print("Sample DataFrame:")
display(sample_df)

In [None]:
# Install ydata-profiling
!pip install ydata-profiling -q

In [None]:
# Generate the profiling report
from ydata_profiling import ProfileReport

# Profile the Titanic DataFrame itself so the report content matches its title
# (the 4-row demonstration frame was previously passed here by mistake).
profile = ProfileReport(titanic_df, title="Titanic Dataset Profiling Report")

# Display the report in the notebook
profile.to_notebook_iframe()

In [None]:
# Write the profiling report to a standalone HTML file in the working directory
profile.to_file(output_file="sample.html")

In [None]:
import nbformat

# Notebook file to clean up in place
# NOTE(review): hard-coded absolute path — adjust if the notebook lives elsewhere
file_path = "/content/Gfg21DaysProjects/Titanic.ipynb"  # adjust if needed

# Parse the notebook using the version-4 notebook format
nb = nbformat.read(file_path, as_version=4)

# Drop the "widgets" metadata entry if present; a missing key is simply ignored
nb.metadata.pop("widgets", None)

# Overwrite the original file with the cleaned notebook
nbformat.write(nb, file_path)

print("Widgets metadata removed successfully.")

In [None]:
# List the contents of the current working directory
!ls


In [None]:
# List the contents of the /content/ directory
!ls /content/

In [None]:
# Change the notebook's working directory to /content/Gfg21DaysProjects
%cd /content/Gfg21DaysProjects