Import necessary libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Load the Dataset

In [None]:
# Name of the CSV file; used in the status messages below.
file_name = 'Titanic-Dataset.csv'

# Location of the file in the Colab runtime. It is built from `file_name`
# so the path that is read and the name that is reported cannot drift apart.
file_path = f"/content/{file_name}"

# Load the dataset. The `try` covers only the read itself, so the handler
# can only ever be triggered by the file being missing.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_name}' was not found.")
    print("Please make sure you have uploaded the file to Colab.")
else:
    print(f"Successfully loaded '{file_name}'.")

    # Display the first 5 rows to check the data
    print("\n--- Data Head ---")
    print(df.head())

    # Display info about columns and missing values
    print("\n--- Data Info ---")
    df.info()

Successfully loaded 'Titanic-Dataset.csv'.

--- Data Head ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123     

Preprocessing - Handle Missing Data

In [None]:
# Work out one replacement value per incomplete column:
#   'Age'      -> median of the column
#   'Embarked' -> most frequent value of the column
median_age = df['Age'].median()
mode_embarked = df['Embarked'].mode().iloc[0]

# Fill both columns in a single step; all other columns are left untouched.
df = df.assign(
    Age=df['Age'].fillna(median_age),
    Embarked=df['Embarked'].fillna(mode_embarked),
)

print(f"Filled missing 'Age' values with median: {median_age}")
print(f"Filled missing 'Embarked' values with mode: '{mode_embarked}'")

Filled missing 'Age' values with median: 28.0
Filled missing 'Embarked' values with mode: 'S'


Preprocessing - Feature Engineering & Cleanup

In [None]:
# 3. Remove the columns that are not used as model features
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns_to_drop, axis=1)
print(f"Dropped columns: {columns_to_drop}")

# 4. One-hot encode the categorical columns.
#    drop_first=True omits the first level of each column, so 'Sex' yields
#    a single indicator and 'Embarked' yields one fewer than its levels.
categorical_columns = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
print("Converted 'Sex' and 'Embarked' to numerical columns.")

# Show the first rows of the fully preprocessed frame
print("\n--- Preprocessed Data Head ---")
print(df.head())

Dropped columns: ['PassengerId', 'Name', 'Ticket', 'Cabin']
Converted 'Sex' and 'Embarked' to numerical columns.

--- Preprocessed Data Head ---
   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  \
0         0       3  22.0      1      0   7.2500      True       False   
1         1       1  38.0      1      0  71.2833     False       False   
2         1       3  26.0      0      0   7.9250     False       False   
3         1       1  35.0      1      0  53.1000     False       False   
4         0       3  35.0      0      0   8.0500      True       False   

   Embarked_S  
0        True  
1       False  
2        True  
3        True  
4        True  


Define Features (X) and Target (y)

In [None]:
# 'X' contains all our feature columns
X = df.drop('Survived', axis=1)

# 'y' is our target variable
y = df['Survived']

print("Features (X):", X.columns.tolist())
print("Target (y): 'Survived'")

Features (X): ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']
Target (y): 'Survived'


Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Data split into {n_train} training samples and {n_test} testing samples.")

Data split into 712 training samples and 179 testing samples.


Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaling to the training and the test split
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features have been scaled.")

Features have been scaled.


Train the Model

In [None]:
# Initialize the model
model = LogisticRegression(random_state=42)

# Train the model
model.fit(X_train_scaled, y_train)

print("Model training complete.")

Model training complete.


Evaluate the Model

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# 1. Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n--- Model Accuracy ---")
print(f"{accuracy:.4f} (or {accuracy*100:.2f}%)")

# 2. Print Confusion Matrix
print("\n--- Confusion Matrix ---")
# [[True Negatives,  False Positives],
#  [False Negatives, True Positives]]
print(confusion_matrix(y_test, y_pred))

# 3. Print Classification Report
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))


--- Model Accuracy ---
0.8101 (or 81.01%)

--- Confusion Matrix ---
[[90 15]
 [19 55]]

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

