In [11]:
### 1. Setup and Data Loading

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [12]:
# Load the Titanic dataset.
# The relative path is resolved against the notebook's working directory:
# ensure 'Module.csv' is in the same directory as this notebook, or point
# DATA_PATH at the file's full path.
DATA_PATH = 'Module.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows, then list the columns with dtypes and non-null counts.
print("Original DataFrame Head:")
print(df.head())
print("\nOriginal DataFrame Info:")
df.info()

Original DataFrame Head:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Original DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       ---

In [13]:
### 2. Train/Test Split

# Separate the label column from the rest of the frame. X keeps every other
# column here; the later preprocessing step picks the ones actually used.
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (712, 14)
Testing set size: (179, 14)


In [14]:
### 3. Preprocessing for Columns (Handling Missing Values, Scaling, Encoding)

#### Separate Column Types

# Columns handled by the numeric pipeline (impute, then scale).
numeric_features = [
    'age',
    'fare',
    'pclass',
]

# Columns handled by the categorical pipeline (impute, then one-hot encode).
categorical_features = [
    'sex',
    'embarked',
]

In [15]:
#### Numeric Pipeline: Handle Missing Values + Scale

numeric_steps = [
    # Replace each NaN with the mean of its own column (feature), not of its row.
    ('imputer', SimpleImputer(strategy='mean')),
    # Standardize each feature to a mean of 0 and a standard deviation of 1.
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

#### Categorical Pipeline: Handle Missing Values + Encode

categorical_steps = [
    # Replace each NaN/null with the most frequent value of its column.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Turn every distinct category into its own binary (0 or 1) column.
    # handle_unknown='ignore' avoids an error at transform time for categories
    # that were not present during fit.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [16]:
#### Combine into Column Transformer

# Send each group of columns through its matching pipeline. No `remainder`
# is passed, so ColumnTransformer's default applies to the columns that are
# not listed in either group.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [17]:
### 4. Model Definition and Training (Fit/Predict)

# Chain preprocessing and the classifier into one estimator, so a single
# fit/predict call runs the imputers, scaler and encoder before the model.
model_steps = [
    ('preprocessor', preprocessor),
    # Allow the solver up to 1000 iterations.
    ('classifier', LogisticRegression(max_iter=1000)),
]
model = Pipeline(steps=model_steps)

# Train on the training split only.
model.fit(X_train, y_train)

In [18]:
### 5. Evaluation
#### Accuracy

# Predict labels for the held-out rows; the confusion-matrix and
# classification-report cells reuse y_pred.
y_pred = model.predict(X_test)

# Share of test rows whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.2f}")

Accuracy: 0.80


In [19]:
#### Confusion Matrix

# Count the test rows for every (true class, predicted class) pair.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# One print call; sep="\n" puts the matrix on the line after the heading.
print("\nConfusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[88 17]
 [19 55]]


In [20]:
#### Classification Report

# Text summary of precision, recall, f1-score and support on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("\nClassification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       105
           1       0.76      0.74      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

