In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score



In [3]:
df = pd.read_excel('Testdataset3.xlsx')
df.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Data Cleaning (with debugging and handling for 'Embarked')
df['Age'] = df['Age'].fillna(df['Age'].median())

# Debugging: Inspect 'Embarked'
print("Unique values in 'Embarked':", df['Embarked'].unique())
print("Number of missing values in 'Embarked':", df['Embarked'].isnull().sum())

if df['Embarked'].isnull().all():
    print("All values in 'Embarked' are NaN. Filling with 'S'.")
    df['Embarked'] = 'S'  # Fill with 'S'
else:
    print("Filling missing 'Embarked' values with mode.")
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})  # Map after handling

# Select features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

X = df[features]
y = df[target]

# Double-check for remaining NaNs
print("\nRemaining NaNs in X:")
print(X.isnull().sum())

# Handle any remaining NaNs (example: fill with median for numerical columns)
for col in X.columns:
    if X[col].isnull().any():
        if X[col].dtype in ['int64', 'float64']:  # Check if numerical
            X[col] = X[col].fillna(X[col].median())

# Verify again
print("\nRemaining NaNs in X after handling:")
print(X.isnull().sum())

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

Unique values in 'Embarked': [nan]
Number of missing values in 'Embarked': 891
All values in 'Embarked' are NaN. Filling with 'S'.

Remaining NaNs in X:
Pclass        0
Sex         891
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

Remaining NaNs in X after handling:
Pclass        0
Sex         891
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].median())


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [13]:


# Evaluate the model using a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Calculate metrics from the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nMetrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Explanation of the confusion matrix and model performance
print("\nExplanation:")
print("The confusion matrix shows the following:")
print(" - True Positives (TP):", conf_matrix[1, 1], "passengers who survived and were predicted to survive.")
print(" - True Negatives (TN):", conf_matrix[0, 0], "passengers who did not survive and were predicted not to survive.")
print(" - False Positives (FP):", conf_matrix[0, 1], "passengers who did not survive but were predicted to survive.")
print(" - False Negatives (FN):", conf_matrix[1, 0], "passengers who survived but were predicted not to survive.")

print("\nModel Performance:")
print("The accuracy of the model is", round(accuracy, 3), ", indicating the overall correctness of the predictions.")
print("Precision is", round(precision, 3), ", showing the model's ability to correctly identify survivors among those predicted as survivors.")
print("Recall is", round(recall, 3), ", showing the model's ability to identify all actual survivors.")
print("The F1-score, which balances precision and recall, is", round(f1, 3), ".")

print("\nImprovements:")
print("To enhance survival prediction accuracy, we could consider the following:")
print("1. Feature Engineering: Create new features from existing ones (e.g., family size, title from name).")
print("2. Model Selection: Try other models like Random Forest, Gradient Boosting, or Support Vector Machines.")
print("3. Hyperparameter Tuning: Optimize the model's parameters using techniques like GridSearchCV.")
print("4. Handling Outliers: Address outliers in numerical features like 'Age' and 'Fare'.")
print("5. More Data: Increase the size of the training dataset if possible.")
print("6. Cross-Validation: Use cross-validation to get a more robust estimate of model performance.")

Confusion Matrix:
 [[89 16]
 [20 54]]

Metrics:
Accuracy: 0.7988826815642458
Precision: 0.7714285714285715
Recall: 0.7297297297297297
F1-Score: 0.75

Explanation:
The confusion matrix shows the following:
 - True Positives (TP): 54 passengers who survived and were predicted to survive.
 - True Negatives (TN): 89 passengers who did not survive and were predicted not to survive.
 - False Positives (FP): 16 passengers who did not survive but were predicted to survive.
 - False Negatives (FN): 20 passengers who survived but were predicted not to survive.

Model Performance:
The accuracy of the model is 0.799 , indicating the overall correctness of the predictions.
Precision is 0.771 , showing the model's ability to correctly identify survivors among those predicted as survivors.
Recall is 0.73 , showing the model's ability to identify all actual survivors.
The F1-score, which balances precision and recall, is 0.75 .

Improvements:
To enhance survival prediction accuracy, we could consider 