# Task 1: Binary Classification with PCA & Model Deployment

In [9]:
import pandas as pd
import seaborn as sns

# Load dataset
df = sns.load_dataset('titanic')

# Target and features
X = df.drop('survived', axis=1)
y = df['survived']

# Display shapes
print("Features Shape:", X.shape)
print("Target Shape:", y.shape)

# First 5 rows
print(df.head())

# Handle missing values
df['age'].fillna(df['age'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

# Class distribution
print(df['survived'].value_counts())


Features Shape: (891, 14)
Target Shape: (891,)
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
survived
0    549
1    342
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print(df)

# Select relevant columns
df = df[['survived','pclass','sex','age','sibsp','parch','fare','embarked_q','embarked_s']]

# One-hot encoding
df = pd.get_dummies(df, columns=['sex','embarked','pclass'], drop_first=True)

X = df.drop('survived', axis=1)
y = df['survived']

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)


     survived   age  sibsp  parch     fare  sex_male  embarked_Q  embarked_S  \
0           0  22.0      1      0   7.2500      True       False        True   
1           1  38.0      1      0  71.2833     False       False       False   
2           1  26.0      0      0   7.9250     False       False        True   
3           1  35.0      1      0  53.1000     False       False        True   
4           0  35.0      0      0   8.0500      True       False        True   
..        ...   ...    ...    ...      ...       ...         ...         ...   
886         0  27.0      0      0  13.0000      True       False        True   
887         1  19.0      0      0  30.0000     False       False        True   
888         0  28.0      1      2  23.4500     False       False        True   
889         1  26.0      0      0  30.0000      True       False       False   
890         0  32.0      0      0   7.7500      True        True       False   

     pclass_2  pclass_3  
0       False

KeyError: "['pclass', 'sex', 'embarked_q', 'embarked_s'] not in index"