In [None]:
# TASK 1: DATA CLEANING - TITANIC DATASET
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: download the raw Titanic CSV and take a first look at it.
print("=" * 60, "STEP 1: LOAD AND PREVIEW DATA", "=" * 60, sep="\n")

# NOTE: this reads straight from GitHub, so the cell needs network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

print("\n✓ Dataset loaded!")
print(f"Shape: {df.shape}")
print(df.head())  # first five rows
print("\nData Types:", df.dtypes, sep="\n")
print("\nBasic Stats:", df.describe(), sep="\n")

# Step 2: tabulate how many values are missing in each column.
print("", "=" * 60, "STEP 2: MISSING VALUE ANALYSIS", "=" * 60, sep="\n")

missing_data = df.isnull().sum()                  # NaN count per column
missing_percent = (missing_data / len(df)) * 100  # same counts as a share of all rows
missing_df = pd.DataFrame(
    {
        'Column': df.columns,
        'Missing_Count': missing_data.values,
        'Percentage': missing_percent.values,
    }
)

print("\nMissing Values:")
# Show only the columns that actually have gaps, worst first.
has_missing = missing_df['Missing_Count'] > 0
print(missing_df.loc[has_missing].sort_values('Missing_Count', ascending=False))

# Step 3: impute Age and Embarked, and drop the Cabin column.
print("\n" + "="*60)
print("STEP 3: HANDLE MISSING VALUES")
print("="*60)

# Assign the filled Series back to the column instead of calling
# df[col].fillna(..., inplace=True). The chained in-place form operates on an
# intermediate object: pandas emits a FutureWarning for it, and from pandas 3.0
# it no longer updates df at all.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)
print(f" ✓ Filled Age with median: {age_median}")

# mode() returns a Series of the most frequent value(s); take the first one.
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)
print(f" ✓ Filled Embarked with mode: {embarked_mode}")

# Called on the DataFrame itself (not a column selection), so inplace is safe here.
df.drop(columns='Cabin', inplace=True)
print(" ✓ Dropped Cabin column")
print(f"Total missing values now: {df.isnull().sum().sum()}")

# Step 4: drop rows that are exact copies of an earlier row, if any exist.
print("", "=" * 60, "STEP 4: REMOVE DUPLICATES", "=" * 60, sep="\n")

duplicates_before = df.duplicated().sum()  # number of repeated rows
if duplicates_before:
    df.drop_duplicates(inplace=True)
print(f" ✓ Removed {duplicates_before} duplicates")

# Step 5: list the distinct labels of the categorical columns and verify them.
print("\n" + "="*60)
print("STEP 5: CHECK INCONSISTENCIES")
print("="*60)
print(f"Passenger Class: {sorted(df['Pclass'].unique())}")
print(f"Sex: {df['Sex'].unique()}")
print(f"Embarked: {df['Embarked'].unique()}")

# Only report success after actually checking. A column is flagged when two of
# its labels collapse to the same value once letter case and surrounding
# whitespace are ignored (for example 'S' versus 's ').
inconsistent_cols = []
for col in ('Pclass', 'Sex', 'Embarked'):
    labels = df[col].dropna()
    normalized = labels.astype(str).str.strip().str.lower()
    if normalized.nunique() < labels.nunique():
        inconsistent_cols.append(col)

if inconsistent_cols:
    print(f" ✗ Inconsistent labels found in: {inconsistent_cols}")
else:
    print(" ✓ Categorical columns are consistent")

# Step 6: report Age and Fare values outside the 1.5 * IQR fences.
# Rows are only counted here; nothing is removed from df.
print("", "=" * 60, "STEP 6: DETECT OUTLIERS", "=" * 60, sep="\n")


def _iqr_outliers(column):
    """Return (Q1, Q3, IQR, lower fence, upper fence, outlier rows) for df[column]."""
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    return q1, q3, iqr, low, high, df[(values < low) | (values > high)]


print("\nAge Outliers (IQR method):")
Q1_age, Q3_age, IQR_age, lower, upper, outliers_age = _iqr_outliers('Age')
print(f" Q1: {Q1_age}, Q3: {Q3_age}, IQR: {IQR_age}")
print(f" Bounds: [{lower}, {upper}]")
print(f" Found {len(outliers_age)} outliers (realistic values)")

print("\nFare Outliers (IQR method):")
Q1_fare, Q3_fare, IQR_fare, lower_fare, upper_fare, outliers_fare = _iqr_outliers('Fare')
print(f" Q1: {Q1_fare}, Q3: {Q3_fare}, IQR: {IQR_fare}")
print(f" Bounds: [{lower_fare}, {upper_fare}]")
print(f" Found {len(outliers_fare)} outliers (realistic values)")

# Step 7: replace the textual Sex column with a numeric flag.
print("", "=" * 60, "STEP 7: ENCODE CATEGORICAL VARIABLES", "=" * 60, sep="\n")

print("\nOriginal Sex:", df['Sex'].head(5), sep="\n")

# Any label that is not a key of this mapping becomes NaN after .map().
sex_codes = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(sex_codes)

print("\nEncoded Sex (1=male, 0=female):", df['Sex'].head(5), sep="\n")

# Step 8: describe the cleaned frame and write it to disk.
print("", "=" * 60, "STEP 8: FINAL SUMMARY", "=" * 60, sep="\n")

n_rows, n_cols = df.shape
print(f"\n✓ Final Shape: {df.shape}")
print(f"✓ Rows: {n_rows}, Columns: {n_cols}")
print(f"✓ Missing Values: {df.isnull().sum().sum()}")
print(f"\nFinal Columns: {df.columns.tolist()}")
print("\nFinal Preview:", df.head(), sep="\n")
print("\nFinal Statistics:", df.describe(), sep="\n")

# Persist the result; index=False keeps the row index out of the CSV.
output_path = "titanic_cleaned.csv"
df.to_csv(output_path, index=False)
print(f"\n✓ Cleaned dataset saved as: {output_path}")

# Closing recap of everything the script did.
print("\n" + "="*60)
print("SUMMARY OF CHANGES")
print("="*60)

# Build the recap from values computed earlier in the script instead of
# hard-coding them, so it stays correct if the source data changes.
# - missing_data / missing_percent were computed before any cleaning, so they
#   still hold the original per-column figures, including the dropped Cabin.
# - drop_duplicates removed exactly duplicates_before rows, so adding that back
#   to the current row count gives the number of rows originally loaded.
rows_loaded = df.shape[0] + duplicates_before
cols_loaded = len(missing_data)
print(f"""
1. ✓ Loaded Titanic dataset ({rows_loaded} rows, {cols_loaded} columns)
2. ✓ Handled Missing Values:
   - Filled Age with median ({missing_data['Age']} values)
   - Filled Embarked with mode ({missing_data['Embarked']} values)
   - Dropped Cabin column ({missing_percent['Cabin']:.0f}% missing)
3. ✓ Removed Duplicates: {duplicates_before} duplicates
4. ✓ Fixed Inconsistencies: Checked all categorical columns
5. ✓ Detected Outliers: Found realistic outliers in Age and Fare
6. ✓ Encoded: Sex converted to 1/0
7. ✓ Final: {df.shape[0]} rows, {df.shape[1]} columns
8. ✓ Saved: {output_path}
""")

STEP 1: LOAD AND PREVIEW DATA

✓ Dataset loaded!
Shape: (891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(embarked_mode, inplace=True)
