In [1]:
# Step 1: Importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [3]:
# Step 2: Loading the Titanic dataset using seaborn
df = sns.load_dataset('titanic')

# Let's take a look at the first few rows
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Step 3: Exploring the data

# Checking basic information about the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
# Let's check how many missing values are present in each column
df.isnull().sum()


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [27]:
# Step 4: Cleaning the data

# Dropping columns that are not useful or have too many missing values
# Also dropping 'alone' which is a derived column we don't need
df_cleaned = df.drop(columns=['deck', 'embark_town', 'alive', 'who', 'adult_male', 'class', 'alone'])

# Filling missing age values with the average age
df_cleaned['age'].fillna(df_cleaned['age'].mean(), inplace=True)

# Filling missing embarked values with the most common value
df_cleaned['embarked'].fillna(df_cleaned['embarked'].mode()[0], inplace=True)

# Dropping any remaining rows with missing values (if any)
df_cleaned.dropna(inplace=True)

# Double-checking for missing values again
df_cleaned.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['age'].fillna(df_cleaned['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['embarked'].fillna(df_cleaned['embarked'].mode()[0], inplace=True)


survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

In [29]:
# Step 5: Transforming the data

# Separate input features and target
X = df_cleaned.drop('survived', axis=1)
y = df_cleaned['survived']

# Define categorical and numerical columns
categorical_cols = ['sex', 'embarked']
numerical_cols = ['age', 'fare']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Create preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
], remainder='passthrough')

# Fit and transform the data
X_transformed = preprocessor.fit_transform(X)

# Get OneHot column names
ohe = preprocessor.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(categorical_cols)

# Get passthrough columns dynamically
# This will give you ['pclass', 'sibsp', 'parch'] correctly
passthrough_cols = [col for col in X.columns if col not in numerical_cols + categorical_cols]

# Build final column list
final_columns = numerical_cols + list(cat_feature_names) + passthrough_cols

# Create DataFrame safely
X_transformed_df = pd.DataFrame(X_transformed, columns=final_columns)

# Preview the DataFrame
X_transformed_df.head()


Unnamed: 0,age,fare,sex_male,embarked_Q,embarked_S,pclass,sibsp,parch
0,-0.592481,-0.502445,1.0,0.0,1.0,3.0,1.0,0.0
1,0.638789,0.786845,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.284663,-0.488854,0.0,0.0,1.0,3.0,0.0,0.0
3,0.407926,0.42073,0.0,0.0,1.0,1.0,1.0,0.0
4,0.407926,-0.486337,1.0,0.0,1.0,3.0,0.0,0.0


In [33]:
# Step 6: Saving the transformed dataset

# Add the target column 'survived' to the transformed DataFrame
final_df = X_transformed_df.copy()
final_df['survived'] = y.reset_index(drop=True)

# Save to a CSV file
final_df.to_csv("titanic_transformed.csv", index=False)

print("✅ Done! The cleaned and transformed dataset has been saved as 'titanic_transformed.csv'")


✅ Done! The cleaned and transformed dataset has been saved as 'titanic_transformed.csv'
