In [24]:

# ==============================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

In [15]:
# --- Load dataset ---
df = pd.read_csv("C:\\Users\\ranji\\Documents\\DELL\\Data Analytics & Data Science\\28. 13th july\\titanic.csv")


In [16]:
# --- Basic info ---
print("First few rows:\n", df.head(), "\n")
print("Missing values per column:\n", df.isnull().sum(), "\n")

First few rows:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   

In [18]:
# --- Fill missing values ---
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Cabin'] = df['Cabin'].fillna('Unknown')


In [19]:
# --- Encode categorical variables ---
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

In [20]:
# --- Drop irrelevant columns ---
df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

print("\nCleaned dataset preview:\n", df.head())


Cleaned dataset preview:
    PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0            1         0       3    1  22.0      1      0   7.2500         2
1            2         1       1    0  38.0      1      0  71.2833         0
2            3         1       3    0  26.0      0      0   7.9250         2
3            4         1       1    0  35.0      1      0  53.1000         2
4            5         0       3    1  35.0      0      0   8.0500         2


In [22]:
# --- Chi-Square Tests (for categorical variables) ---
categorical_cols = ['Pclass', 'Sex', 'Embarked']
target = 'Survived'

print("\n============================")
print("Chi-Square Test Results")
print("============================")

for col in categorical_cols:
    contingency_table = pd.crosstab(df[col], df[target])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"\nFeature: {col}")
    print(f"Chi2 Statistic = {chi2:.4f},  p-value = {p:.4f}")
    if p < 0.05:
        print(" This feature is significantly related to Survival ")
    else:
        print(" No significant relationship with Survival")


Chi-Square Test Results

Feature: Pclass
Chi2 Statistic = 102.8890,  p-value = 0.0000
 This feature is significantly related to Survival 

Feature: Sex
Chi2 Statistic = 260.7170,  p-value = 0.0000
 This feature is significantly related to Survival 

Feature: Embarked
Chi2 Statistic = 25.9645,  p-value = 0.0000
 This feature is significantly related to Survival 


In [23]:
# --- Correlation matrix (for numeric insight) ---
print("\nCorrelation Matrix:\n", df.corr())


Correlation Matrix:
              PassengerId  Survived    Pclass       Sex       Age     SibSp  \
PassengerId     1.000000 -0.005007 -0.035144  0.042939  0.034212 -0.057527   
Survived       -0.005007  1.000000 -0.338481 -0.543351 -0.064910 -0.035322   
Pclass         -0.035144 -0.338481  1.000000  0.131900 -0.339898  0.083081   
Sex             0.042939 -0.543351  0.131900  1.000000  0.081163 -0.114631   
Age             0.034212 -0.064910 -0.339898  0.081163  1.000000 -0.233296   
SibSp          -0.057527 -0.035322  0.083081 -0.114631 -0.233296  1.000000   
Parch          -0.001652  0.081629  0.018443 -0.245489 -0.172482  0.414838   
Fare            0.012658  0.257307 -0.549500 -0.182333  0.096688  0.159651   
Embarked        0.013128 -0.167675  0.162098  0.108262 -0.018754  0.068230   

                Parch      Fare  Embarked  
PassengerId -0.001652  0.012658  0.013128  
Survived     0.081629  0.257307 -0.167675  
Pclass       0.018443 -0.549500  0.162098  
Sex         -0.245489