In [None]:
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import FileLink, display
from sklearn.impute import SimpleImputer

# Raw Titanic passenger table, read from the datasciencedojo/datasets GitHub repository.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# (rows, columns) of the freshly loaded frame, for a quick size check.
print("Dataset shape:", df.shape)

# Last expression of the cell: the notebook renders the first five rows.
df.head()

In [None]:

# Structural overview: column dtypes and non-null counts (info() prints directly).
df.info()

# describe() is not the last expression of this cell, so it must be displayed
# explicitly -- a bare call here computes the summary and then discards it.
display(df.describe())

# Missing-value report: absolute count and percentage of rows per column,
# restricted to the columns that actually contain gaps, largest gap first.
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing_Count': missing_data.values,
    'Missing_Percentage': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)
display(missing_df)

# Pairwise correlation of the numeric columns only; non-numeric columns are
# excluded before calling corr().
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df[numeric_cols].corr()

# Explicit figure/axes so the heatmap, its title and the layout all target the
# same figure instead of relying on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, square=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Titanic Dataset Features')
fig.tight_layout()
plt.show()


In [None]:

# All cleaning happens on a copy; the raw frame `df` is left as loaded.
df_clean = df.copy()

# Age: fill gaps with the column median and flag the rows that were filled.
age_gaps = df_clean['Age'].isna()
if age_gaps.any():
    age_median = df_clean['Age'].median()
    df_clean['Age_Imputed'] = age_gaps.astype(int)
    df_clean['Age'] = df_clean['Age'].fillna(age_median)

# Embarked: fill gaps with the most frequent value (first mode) and flag them.
embarked_gaps = df_clean['Embarked'].isna()
if embarked_gaps.any():
    embarked_mode = df_clean['Embarked'].mode()[0]
    df_clean['Embarked_Imputed'] = embarked_gaps.astype(int)
    df_clean['Embarked'] = df_clean['Embarked'].fillna(embarked_mode)

# Cabin: no value is estimated; gaps get the placeholder 'Unknown' plus a flag.
cabin_gaps = df_clean['Cabin'].isna()
if cabin_gaps.any():
    df_clean['Cabin_Missing'] = cabin_gaps.astype(int)
    df_clean['Cabin'] = df_clean['Cabin'].fillna('Unknown')

# Fare: same median-plus-flag treatment as Age, applied only if gaps exist.
fare_gaps = df_clean['Fare'].isna()
if fare_gaps.any():
    fare_median = df_clean['Fare'].median()
    df_clean['Fare_Imputed'] = fare_gaps.astype(int)
    df_clean['Fare'] = df_clean['Fare'].fillna(fare_median)

# Per-column count of remaining nulls, printed under a heading line.
print("Missing values after imputation:", df_clean.isna().sum(), sep="\n")


In [None]:

# Thresholds used in this cell.
IQR_MULTIPLIER = 1.5       # fence width (in IQRs) for the outlier count
FARE_CAP_QUANTILE = 0.95   # fares above this quantile are capped

# Always start from the untreated fares. When this cell is re-run, 'Fare' has
# already been capped, so the snapshot in 'Fare_Original' is the only place the
# raw values still exist; copying 'Fare' again at that point would overwrite
# the snapshot with capped data and distort the statistics and "before" plot.
if 'Fare_Original' in df_clean.columns:
    fare_raw = df_clean['Fare_Original']
else:
    fare_raw = df_clean['Fare'].copy()

# IQR fences on the raw fares; used only to report how many values are extreme.
Q1 = fare_raw.quantile(0.25)
Q3 = fare_raw.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - IQR_MULTIPLIER * IQR
upper_bound = Q3 + IQR_MULTIPLIER * IQR

outliers = df_clean[(fare_raw < lower_bound) | (fare_raw > upper_bound)]
print(f"Outliers detected: {len(outliers)} ({len(outliers)/len(df_clean)*100:.1f}%)")

# Treatment: keep the raw values in 'Fare_Original' and cap 'Fare' at the chosen
# upper quantile. Both assignments derive from fare_raw, so running the cell
# again yields the same columns.
fare_95th = fare_raw.quantile(FARE_CAP_QUANTILE)
df_clean['Fare_Original'] = fare_raw
df_clean['Fare'] = fare_raw.clip(upper=fare_95th)

# Side-by-side box plots of the raw and the capped fares. Vertical orientation
# is the boxplot default, so no orientation argument is passed.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

ax_before.boxplot(df_clean['Fare_Original'])
ax_before.set_title('Fare Distribution - Before Treatment')
ax_before.set_ylabel('Fare')

ax_after.boxplot(df_clean['Fare'])
ax_after.set_title('Fare Distribution - After Winsorization')
ax_after.set_ylabel('Fare')

fig.tight_layout()
plt.show()


In [34]:
# Write the cleaned table to the current working directory.
df_clean.to_csv('clean_v1.csv', index=False)

decision_log = """# Decision Log – Titanic Data Processing
…"""
# The text contains non-ASCII characters (e.g. the en dash in the title), so the
# encoding is pinned instead of depending on the platform default.
with open('decision_log.md', 'w', encoding='utf-8') as f:
    f.write(decision_log)

data_card = """# Data Card – Titanic Dataset (Clean Version 1)
…"""
with open('data_card.md', 'w', encoding='utf-8') as f:
    f.write(data_card)

# Bundle the three artifacts into one archive. ZIP_DEFLATED compresses the
# members; the ZipFile default would only store them uncompressed.
files_to_zip = ['clean_v1.csv', 'decision_log.md', 'data_card.md']
with zipfile.ZipFile('titanic_processed.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    for fname in files_to_zip:
        zf.write(fname)

# Clickable link to the archive for Jupyter front ends.
print("Download the archive:")
display(FileLink('titanic_processed.zip', result_html_prefix="📦 "))

# Best effort: google.colab is only importable on Colab, where it can trigger a
# browser download. Only the import is guarded, so a failure inside the
# download itself is not silently swallowed.
try:
    from google.colab import files as gfiles
except ImportError:
    pass
else:
    gfiles.download('titanic_processed.zip')


Download the archive:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>