In [1]:
import sys
import numpy as np
import pandas as pd
from textwrap import dedent

print('Python version:', sys.version.splitlines()[0])
print('NumPy version:', np.__version__)
print('Pandas version:', pd.__version__)

# Part 1: Load Titanic dataset
try:
    import seaborn as sns
    df = sns.load_dataset('titanic')
    print("\nLoaded Titanic from seaborn (shape={}):".format(df.shape))
except Exception as e:
    print("\nCould not load seaborn dataset (using fallback). Exception:", e)
    df = pd.DataFrame({
        'survived':[0,1,1,0],
        'pclass':[3,1,3,2],
        'sex':['male','female','female','male'],
        'age':[22,38,26,35],
        'sibsp':[1,1,0,0],
        'parch':[0,0,0,0],
        'fare':[7.25,71.2833,7.925,8.05],
        'embarked':['S','C','S','S'],
        'class':['Third','First','Third','Second'],
        'who':['man','woman','woman','man'],
        'alone':[False,False,True,True]
    })
print(df.head())

Python version: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]
NumPy version: 2.1.3
Pandas version: 2.2.3

Loaded Titanic from seaborn (shape=(891, 15)):
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [2]:
df_original = df.copy()  

In [3]:
# Assignment 1: Clean Dataset
print("\n\nAssignment 1: Cleaning Titanic Dataset")

df_clean = df.copy()



Assignment 1: Cleaning Titanic Dataset


In [4]:
df_clean.rename(columns=lambda c: str(c).strip().lower().replace(" ", "_"), inplace=True)

In [7]:
before = len(df_clean)
df_clean = df_clean.drop_duplicates()
after = len(df_clean)
print(f"\nDropped duplicates: {before - after}")


Dropped duplicates: 0


In [6]:
print("\nMissing values (before cleaning):")
print(df_clean.isnull().sum())


Missing values (before cleaning):
survived         0
pclass           0
sex              0
age            106
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           582
embark_town      2
alive            0
alone            0
dtype: int64


In [8]:
# 4) Fill missing values
# Age → median
if "age" in df_clean.columns and df_clean["age"].isnull().any():
    df_clean["age"] = df_clean["age"].fillna(df_clean["age"].median())

# Embarked or embark_town → mode
for col in ["embarked", "embark_town"]:
    if col in df_clean.columns and df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].mode().iloc[0])

# Fare → median
if "fare" in df_clean.columns and df_clean["fare"].isnull().any():
    df_clean["fare"] = df_clean["fare"].fillna(df_clean["fare"].median())

In [9]:
# 5) Clean categorical strings
for col in df_clean.select_dtypes(include="object").columns:
    df_clean[col] = df_clean[col].astype(str).str.strip()

In [10]:
# 6) Convert data types
if "pclass" in df_clean.columns:
    df_clean["pclass"] = df_clean["pclass"].astype("Int64")

if "survived" in df_clean.columns:
    df_clean["survived"] = df_clean["survived"].astype("Int64")

In [11]:
# 7) Derived features
if {"sibsp", "parch"}.issubset(df_clean.columns):
    df_clean["family_size"] = df_clean["sibsp"].fillna(0).astype(int) + df_clean["parch"].fillna(0).astype(int)
else:
    df_clean["family_size"] = 0

In [12]:
# 8) Missing values report (after)
print("\nMissing values (after cleaning):")
print(df_clean.isnull().sum())


Missing values (after cleaning):
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           582
embark_town      0
alive            0
alone            0
family_size      0
dtype: int64


In [13]:
# 9) Save cleaned CSV
try:
    df_clean.to_csv("titanic_cleaned.csv", index=False)
    print("\nSaved cleaned dataset to 'titanic_cleaned.csv'")
except Exception as e:
    print("Could not save CSV:", e)



Saved cleaned dataset to 'titanic_cleaned.csv'


In [14]:
print("\n\nAssignment 2: 10 Pandas Tasks\n")

def show(title, obj):
    print(f"\n--- {title} ---")
    try:
        display(obj.head(10))
    except:
        print(obj)



Assignment 2: 10 Pandas Tasks



In [15]:
# 1) Top 5 oldest
if "age" in df_clean.columns:
    show("Top 5 Oldest Passengers",
         df_clean.sort_values("age", ascending=False)[["age","sex","survived","pclass"]].head(5))


--- Top 5 Oldest Passengers ---


Unnamed: 0,age,sex,survived,pclass
630,80.0,male,1,1
851,74.0,male,0,3
96,71.0,male,0,1
493,71.0,male,0,1
116,70.5,male,0,3


In [16]:
# 2) Survival rate by gender
if {"sex","survived"}.issubset(df_clean.columns):
    show("Survival Rate by Gender", df_clean.groupby("sex")["survived"].mean())


--- Survival Rate by Gender ---


sex
female    0.740614
male      0.215886
Name: survived, dtype: Float64

In [17]:
# 3) Avg fare by class
if {"pclass","fare"}.issubset(df_clean.columns):
    show("Average Fare by Pclass", df_clean.groupby("pclass")["fare"].mean())


--- Average Fare by Pclass ---


pclass
1    84.487812
2    21.835404
3    13.656223
Name: fare, dtype: float64

In [18]:
# 4) Count children
if "age" in df_clean.columns:
    show("Number of children (<18)", df_clean[df_clean["age"] < 18].shape[0])



--- Number of children (<18) ---
110


In [19]:
# 5) Fare per person
df_clean["fare_per_person"] = df_clean["fare"] / (df_clean["family_size"] + 1)
show("Fare per person sample", df_clean[["fare","family_size","fare_per_person"]].head(8))


--- Fare per person sample ---


Unnamed: 0,fare,family_size,fare_per_person
0,7.25,1,3.625
1,71.2833,1,35.64165
2,7.925,0,7.925
3,53.1,1,26.55
4,8.05,0,8.05
5,8.4583,0,8.4583
6,51.8625,0,51.8625
7,21.075,4,4.215


In [20]:
# 6) High-fare passengers
show("Passengers with fare > 50",
     df_clean[df_clean["fare"] > 50][["fare","pclass","sex","survived"]].sort_values("fare", ascending=False).head(10))



--- Passengers with fare > 50 ---


Unnamed: 0,fare,pclass,sex,survived
679,512.3292,1,male,1
737,512.3292,1,male,1
258,512.3292,1,female,1
27,263.0,1,male,0
341,263.0,1,female,1
88,263.0,1,female,1
438,263.0,1,male,0
742,262.375,1,female,1
311,262.375,1,female,1
299,247.5208,1,female,1


In [21]:
# 7) Null summary
show("Null values summary", df_clean.isnull().sum())


--- Null values summary ---


survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
class       0
who         0
dtype: int64

In [22]:
# 8) Pivot table
if {"sex","survived","class"}.issubset(df_clean.columns):
    show("Survival by Sex × Class", df_clean.pivot_table(values="survived", index="sex", columns="class", aggfunc="mean"))
elif {"sex","survived","pclass"}.issubset(df_clean.columns):
    show("Survival by Sex × Pclass", df_clean.pivot_table(values="survived", index="sex", columns="pclass", aggfunc="mean"))



--- Survival by Sex × Class ---


  show("Survival by Sex × Class", df_clean.pivot_table(values="survived", index="sex", columns="class", aggfunc="mean"))


class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.967742,0.917808,0.472441
male,0.371901,0.184783,0.158273


In [23]:
# 9) Youngest and oldest
if "age" in df_clean.columns:
    show("10 Youngest", df_clean.sort_values("age").head(10))
    show("10 Oldest", df_clean.sort_values("age", ascending=False).head(10))


--- 10 Youngest ---


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,fare_per_person
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False,1,4.25835
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False,2,4.833333
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False,3,4.814575
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False,2,9.666667
831,1,2,male,0.83,1,1,18.75,S,Second,child,False,,Southampton,yes,False,2,6.25
305,1,1,male,0.92,1,2,151.55,S,First,child,False,C,Southampton,yes,False,3,37.8875
827,1,2,male,1.0,0,2,37.0042,C,Second,child,False,,Cherbourg,yes,False,2,12.334733
381,1,3,female,1.0,0,2,15.7417,C,Third,child,False,,Cherbourg,yes,False,2,5.247233
172,1,3,female,1.0,1,1,11.1333,S,Third,child,False,,Southampton,yes,False,2,3.7111
788,1,3,male,1.0,1,2,20.575,S,Third,child,False,,Southampton,yes,False,3,5.14375



--- 10 Oldest ---


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,fare_per_person
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True,0,30.0
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True,0,7.775
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True,0,34.6542
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True,0,49.5042
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,0,7.75
745,0,1,male,70.0,1,1,71.0,S,First,man,True,B,Southampton,no,False,2,23.666667
672,0,2,male,70.0,0,0,10.5,S,Second,man,True,,Southampton,no,True,0,10.5
33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True,0,10.5
280,0,3,male,65.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,0,7.75
54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False,1,30.9896


In [24]:
# 10) Unique embarked values
for col in ["embarked", "embark_town"]:
    if col in df_clean.columns:
        show(f"Unique values in {col}", df_clean[col].dropna().unique().tolist())


--- Unique values in embarked ---
['S', 'C', 'Q']

--- Unique values in embark_town ---
['Southampton', 'Cherbourg', 'Queenstown']


In [25]:
print("\n\n=== Assignment 3: Summary Report ===\n")

report = {
    "shape": df_clean.shape,
    "num_rows": df_clean.shape[0],
    "num_columns": df_clean.shape[1],
    "missing_percent": (df_clean.isnull().sum() / len(df_clean) * 100).round(2)
}




=== Assignment 3: Summary Report ===



In [26]:
# Basic Stats
if "survived" in df_clean.columns:
    report["survival_rate"] = df_clean["survived"].mean()

if "age" in df_clean.columns:
    report["avg_age"] = df_clean["age"].mean().round(2)
    report["median_age"] = df_clean["age"].median()

if "fare" in df_clean.columns:
    report["avg_fare"] = df_clean["fare"].mean().round(2)
    report["median_fare"] = df_clean["fare"].median()

In [27]:
# Insights
insights = []

if "survived" in df_clean.columns:
    insights.append(f"Overall survival rate: {report['survival_rate']:.2%}")

if {"sex","survived"}.issubset(df_clean.columns):
    male_rate = df_clean[df_clean["sex"]=="male"]["survived"].mean()
    female_rate = df_clean[df_clean["sex"]=="female"]["survived"].mean()
    insights.append(f"Female survival: {female_rate:.2%}, Male survival: {male_rate:.2%}")

if {"pclass","fare"}.issubset(df_clean.columns):
    top_class = df_clean.groupby("pclass")["fare"].mean().idxmax()
    insights.append(f"Highest average fare: Pclass {top_class}")

for col in ["embark_town","embarked"]:
    if col in df_clean.columns:
        insights.append(f"Most common embark value in {col}: {df_clean[col].mode().iloc[0]}")
        break

if {"fare","survived"}.issubset(df_clean.columns):
    corr = df_clean[["fare","survived"]].corr().iloc[0,1]
    insights.append(f"Fare–survival correlation: {corr:.3f}")

report["insights"] = insights