Data Preprocessing and Cleaning

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/titanic_data.csv')

# Basic info
print(df.info())
print(df.describe())
print(df.head())

# Check for missing values
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     889 non-null    int64  
 1   pclass       889 non-null    int64  
 2   sex          889 non-null    object 
 3   age          713 non-null    float64
 4   sibsp        889 non-null    int64  
 5   parch        889 non-null    int64  
 6   fare         889 non-null    float64
 7   embarked     887 non-null    object 
 8   class        889 non-null    object 
 9   who          889 non-null    object 
 10  adult_male   889 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  887 non-null    object 
 13  alive        889 non-null    object 
 14  alone        889 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.2+ KB
None
         survived      pclass         age       sibsp       parch        fare
count  889.000000  889.000

In [6]:
# Fill missing numerical values with mean/median
# Imputing 'age' with the mean
df['age'] = df['age'].fillna(df['age'].mean())

# For categorical columns, fill with mode
# Imputing 'embarked' with the mode
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
# Imputing 'embark_town' with the mode
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])

# Note: 'deck' has a large number of missing values.
# You might choose to drop this column or handle it differently,
# but if you wanted to fill it with the mode:
# df['deck'] = df['deck'].fillna(df['deck'].mode()[0])

# Check missing values again to confirm imputation
print("\nMissing values after imputation:")
print(df.isnull().sum())



Missing values after imputation:
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           686
embark_town      0
alive            0
alone            0
dtype: int64


In [12]:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['sex'])  # male:1, female:0

# Check if 'embarked' column exists before one-hot encoding
if 'embarked' in df.columns:
    # One-hot encode 'embarked'
    df = pd.get_dummies(df, columns=['embarked'], drop_first=True)
else:
    print("Error: 'embarked' column not found in the DataFrame. Cannot perform one-hot encoding.")

Error: 'embarked' column not found in the DataFrame. Cannot perform one-hot encoding.


In [19]:
# Add this line to inspect columns
print("\nColumns before dropping:")
print(df.columns)

# Not useful for prediction
# Check if columns exist before dropping
columns_to_drop = ['sex', 'fare', 'class']
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]

if existing_columns_to_drop:
    df = df.drop(columns=existing_columns_to_drop)
    print(f"\nDropped columns: {existing_columns_to_drop}")
else:
    print("\nNone of the specified columns ('sex', 'fare', 'class') found in the DataFrame.")

# Optional: Print columns after dropping to confirm
print("\nColumns after dropping:")
print(df.columns)


Columns before dropping:
Index(['survived', 'pclass', 'sex', 'sibsp', 'parch', 'class', 'adult_male',
       'deck', 'embark_town', 'alive', 'alone', 'Sex', 'embarked_Q',
       'embarked_S'],
      dtype='object')

Dropped columns: ['sex', 'class']

Columns after dropping:
Index(['survived', 'pclass', 'sibsp', 'parch', 'adult_male', 'deck',
       'embark_town', 'alive', 'alone', 'Sex', 'embarked_Q', 'embarked_S'],
      dtype='object')


In [27]:
# Filter the DataFrame to remove rows identified as outliers in any of the columns
import numpy as np
import pandas as pd # Ensure pandas is imported in this cell if it's the first one using it

# Add this check to see the columns right before the error
print("\nColumns in df before outlier removal:")
print(df.columns)

# --- Define outlier_conditions ---
# This is a placeholder. You need to replace 'some_column' and 'some_condition'
# with your actual outlier detection logic. For example, using IQR:

# Check if 'age' column exists before proceeding
if 'survived' in df.columns:
    Q1 = df['survived'].quantile(0.25)
    Q3 = df['survived'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Example outlier condition for 'age'
    outlier_conditions = (df['survived'] >= lower_bound) & (df['survived'] <= upper_bound)

    # You would typically create outlier conditions for multiple numerical columns
    # and combine them using the '&' operator if you want to keep rows that are
    # NOT outliers in ANY of the specified columns.
    # For example:
    # numerical_cols = ['age', 'sibsp', 'parch'] # Add other numerical columns
    # outlier_conditions = pd.Series([True] * len(df)) # Start with all True
    # for col in numerical_cols:
    #     if col in df.columns: # Add check for each column
    #         Q1 = df[col].quantile(0.25)
    #         Q3 = df[col].quantile(0.75)
    #         IQR = Q3 - Q1
    #         lower_bound = Q1 - 1.5 * IQR
    #         upper_bound = Q3 + 1.5 * IQR
    #         col_condition = (df[col] >= lower_bound) & (df[col] <= upper_bound)
    #         outlier_conditions = outlier_conditions & col_condition
    #     else:
    #         print(f"Warning: Column '{col}' not found for outlier detection.")


    # --- Apply the condition to create df_cleaned ---
    df_cleaned = df[outlier_conditions]

    # --- The original code below this line ---
    # df = df_cleaned # This line is redundant if you continue using df_cleaned

    print(f"\nDataFrame shape after outlier removal: {df_cleaned.shape}")
    print(f"Number of rows removed: {df.shape[0] - df_cleaned.shape[0]}")

    # Now df_cleaned contains the data with outliers removed from the specified numerical columns
    # You can continue using df_cleaned or assign it back to df if preferred:
    #df = df_cleaned

else:
    print("Error: 'survived' column not found in the DataFrame. Cannot perform outlier removal based on 'age'.")


Columns in df before outlier removal:
Index(['survived', 'pclass', 'sibsp', 'parch', 'adult_male', 'deck',
       'embark_town', 'alive', 'alone', 'Sex', 'embarked_Q', 'embarked_S'],
      dtype='object')

DataFrame shape after outlier removal: (889, 12)
Number of rows removed: 0


In [33]:
from sklearn.preprocessing import StandardScaler # Import StandardScaler
scaler = StandardScaler()

df[['survived', 'parch']] = scaler.fit_transform(df[['survived', 'parch']])


In [37]:
from sklearn.model_selection import train_test_split
X = df.drop('survived', axis=1)
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
