In [37]:
# Import the modules
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [38]:
# Load the Titanic passenger data into a pandas DataFrame.
# The path is relative, so the CSV is looked up in the current working directory.
file_path = Path("Titanic-Dataset.csv")
Titanic_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
Titanic_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
# Drop incomplete rows and work on an independent copy of the result.
# NOTE(review): dropna() removes every row that has ANY missing value, so rows
# that are missing only `Cabin` are discarded here as well.
Titanic_data = Titanic_data.dropna().copy()

# Categorical columns to encode; keep only those present in the DataFrame
categorical_column = ['Name', 'Sex', 'Ticket']
categorical_existing_column = [name for name in categorical_column
                               if name in Titanic_data.columns]

# One LabelEncoder per column, kept in a dict so the same mapping can be
# reused at prediction time
encoders = {name: LabelEncoder() for name in categorical_existing_column}

# LabelEncoder maps each distinct string to an integer code
for col, le in encoders.items():
    Titanic_data[col] = le.fit_transform(Titanic_data[col].astype(str))

In [40]:
# Numerical columns that will be scaled in the next step.
# Names must match the DataFrame header exactly ('Pclass' without a leading
# space, 'SibSp' with a capital S), otherwise they are silently filtered out.
numerical_column = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch']

# Display the categorical columns that were found in the DataFrame
categorical_existing_column

['Name', 'Sex', 'Ticket']

In [41]:
# Scale the numerical columns to the [0, 1] range and persist the fitted
# preprocessing objects for reuse at prediction time.

numerical_column = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch']

# Keep only the columns that actually exist in the DataFrame.
# (Iterates over `numerical_column`, the list defined just above.)
existing_numerical_column = [col
                             for col in numerical_column
                             if col in Titanic_data.columns]

# NOTE(review): 'Survived' is used as the prediction target later in the
# notebook and 'PassengerId' is an identifier; both are scaled here. The scaler
# is also fitted on the full dataset before any train/test split.
if existing_numerical_column:
    # MinMaxScaler rescales each column to the range 0..1
    scaler = MinMaxScaler()
    Titanic_data[existing_numerical_column] = scaler.fit_transform(
        Titanic_data[existing_numerical_column]
    )
else:
    # Nothing to scale: leave the scaler unset
    scaler = None

# Save the categorical encoders (label_encoders.pkl) so the same mapping can be
# applied when making predictions.
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

# Save the numerical scaler (scaler.pkl) so new data can be transformed the
# same way; skipped when no scaler was fitted.
if scaler is not None:
    with open("scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)

print(" Preprocessing completed successfully.")

 Preprocessing completed successfully.


In [42]:
# Separate the data into label and features.

# y: the label column
y = Titanic_data.loc[:, "Embarked"]

# X: every remaining column is a feature
X = Titanic_data.drop("Embarked", axis=1)

In [43]:
# Preview the first five labels
y.head(5)

1     C
3     S
6     S
10    S
11    S
Name: Embarked, dtype: object

In [44]:
# Preview the first five feature rows
X.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
1,0.0,1.0,0.0,49,0,0.468892,0.333333,0.0,109,71.2833,C85
3,0.002252,1.0,0.0,70,0,0.430956,0.333333,0.0,31,53.1,C123
6,0.005631,0.0,0.0,112,1,0.671219,0.0,0.0,55,51.8625,E46
10,0.010135,1.0,1.0,148,0,0.038948,0.333333,0.25,120,16.7,G6
11,0.011261,1.0,0.0,27,0,0.721801,0.0,0.0,26,26.55,C103


In [45]:
# Split the data into training and test sets.
Size_Test = 0.25   # 25% of the rows are held out for testing
State_Random = 1   # fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=Size_Test, random_state=State_Random
)

# Print the shapes to verify the split.
# NOTE(review): the label text is inaccurate -- y_train / y_test hold labels,
# not features, and X_test is the testing split. The strings are left unchanged
# so they keep matching the recorded cell output.
print(f"Training features shape (y_train): {y_train.shape}")
print(f"Testing features shape (y_test): {y_test.shape}")
print(f"Training features shape(X_train): {X_train.shape}")
print(f"Training features shape (X_test): {X_test.shape}")

Training features shape (y_train): (137,)
Testing features shape (y_test): (46,)
Training features shape(X_train): (137, 11)
Training features shape (X_test): (46, 11)


In [46]:
# Drop columns that are not used as model inputs; errors='ignore' skips any
# column that is already absent, so the cell can be re-run safely.
Titanic_data = Titanic_data.drop(
    ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, errors='ignore'
)

In [47]:
# Target: the Survived column
y = Titanic_data.loc[:, 'Survived']
# Features: everything except the target
X = Titanic_data.drop('Survived', axis=1)

In [48]:
# One-hot encode any remaining text columns first. `Embarked` is still a
# string column at this point; coercing it straight to numeric would turn every
# value into NaN and the fill below would make it an all-zero feature.
X = pd.get_dummies(X, dtype=float)

# Coerce everything to numeric and fill any remaining NaN with 0.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

In [49]:
from sklearn.linear_model import LogisticRegression

#Split traing and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

#Logistic Regression Model
logistic_model= LogisticRegression(random_state=1, max_iter=500)
logistic_model.fit(X_train, y_train)