In [2]:
# Load the Titanic dataset and inspect the first few rows
import pandas as pd

# Load the dataset
file_path = '/mnt/data/Titanic.csv'
titanic_data = pd.read_csv('TitanicDataset.csv')

# Display the first few rows and the structure of the dataset
titanic_data.head(), titanic_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [3]:
# Task 1: Data Preprocessing and Handling Missing Values

# Step 3: Identify missing values
missing_values = titanic_data.isnull().sum()

# Fill missing values in numerical columns with the mean
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)

# Fill missing values in categorical columns with the mode
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)

# Drop columns with excessive missing values (e.g., 'Cabin')
titanic_data.drop(columns=['Cabin'], inplace=True)

# Verify the preprocessing steps
missing_values_after = titanic_data.isnull().sum()

# Display changes in missing values
missing_values, missing_values_after


(PassengerId      0
 Survived         0
 Pclass           0
 Name             0
 Sex              0
 Age            177
 SibSp            0
 Parch            0
 Ticket           0
 Fare             0
 Cabin          687
 Embarked         2
 dtype: int64,
 PassengerId    0
 Survived       0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Embarked       0
 dtype: int64)

In [4]:
# Task 2: Encoding Categorical Variables

# Identify categorical columns
categorical_columns = ['Sex', 'Embarked']

# Apply One-Hot Encoding for nominal data (Embarked)
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'], drop_first=True)

# Apply Label Encoding for ordinal data (Sex)
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})

# Verify the encoded columns
titanic_data[categorical_columns + ['Embarked_C', 'Embarked_Q', 'Embarked_S']].head()


KeyError: "['Embarked', 'Embarked_C'] not in index"

In [5]:
# Verify the encoded columns
encoded_columns = ['Sex', 'Embarked_C', 'Embarked_Q']
titanic_data[encoded_columns].head()


KeyError: "['Embarked_C'] not in index"

In [6]:
# Verify the transformed columns for encoding
titanic_data[['Sex', 'Embarked_Q', 'Embarked_S']].head()


Unnamed: 0,Sex,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,1,0,1
3,1,0,1
4,0,0,1


In [7]:
# Task 3: Creating New Features

# Step 1: Create FamilySize by combining SibSp and Parch
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1

# Step 2: Create IsAlone feature (1 if traveling alone, 0 otherwise)
titanic_data['IsAlone'] = (titanic_data['FamilySize'] == 1).astype(int)

# Verify the newly created features
titanic_data[['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head()


Unnamed: 0,SibSp,Parch,FamilySize,IsAlone
0,1,0,2,0
1,1,0,2,0
2,0,0,1,1
3,1,0,2,0
4,0,0,1,1


In [8]:
# Task 4: Feature Scaling

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Identify numerical features that need scaling
numerical_features = ['Age', 'Fare', 'FamilySize']

# Standardization: Zero mean and unit variance
scaler_standard = StandardScaler()
titanic_data[numerical_features] = scaler_standard.fit_transform(titanic_data[numerical_features])

# Verify the scaling results for Standardization
titanic_data[numerical_features].head()


Unnamed: 0,Age,Fare,FamilySize
0,-0.592481,-0.502445,0.05916
1,0.638789,0.786845,0.05916
2,-0.284663,-0.488854,-0.560975
3,0.407926,0.42073,0.05916
4,0.407926,-0.486337,-0.560975


In [13]:
# Use only non-scaled and non-negative features for Chi-Square Test
X_non_negative = titanic_data[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']].copy()

# Ensure Fare is non-negative
X_non_negative['Fare'] = X_non_negative['Fare'] - X_non_negative['Fare'].min()  # Shift to make all values non-negative

# Chi-Square Test
chi2_selector = SelectKBest(score_func=chi2, k=5)
X_chi2_selected = chi2_selector.fit_transform(X_non_negative, y)

# Selected features based on Chi-Square Test
chi2_scores = pd.DataFrame({
    'Feature': X_non_negative.columns,
    'Score': chi2_selector.scores_
}).sort_values(by='Score', ascending=False)

# Display results
chi2_scores


Unnamed: 0,Feature,Score
1,Sex,170.348127
4,Fare,90.974941
0,Pclass,30.873699
3,Parch,10.097499
2,SibSp,2.581865


In [14]:
# Task 6: Dimensionality Reduction with PCA
from sklearn.decomposition import PCA

# Step 1: Standardize the data (Numerical features only)
numerical_features = ['Age', 'Fare', 'FamilySize', 'Pclass', 'SibSp', 'Parch']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(titanic_data[numerical_features])

# Step 2: Apply PCA to reduce to 2 components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

# Add principal components to the DataFrame for visualization
titanic_data['PCA1'] = principal_components[:, 0]
titanic_data['PCA2'] = principal_components[:, 1]

# Step 3: Analyze variance explained by each component
explained_variance = pca.explained_variance_ratio_

# Display explained variance and transformed data
explained_variance, titanic_data[['PCA1', 'PCA2']].head()


(array([0.42871313, 0.28148869]),
        PCA1      PCA2
 0  0.118644 -1.147793
 1 -0.050276  1.760875
 2 -0.816695 -0.991990
 3 -0.061701  1.445131
 4 -0.977311 -0.712614)