# 1.	Firstly, replace all Missing values with relevant figures.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data= pd.read_csv("Dataset_Day7.csv")

In [2]:
missing_values=data.isna().sum()
print(missing_values)

Pregnancies                 0
Glucose                     0
BloodPressure               0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [3]:
columns_replace=['Glucose','BloodPressure','BMI','DiabetesPedigreeFunction']

for column in columns_replace:
    data[column] = data[column].replace(0,pd.NA)
    
print("Missing values before replacement:")
print(data.isnull().sum())

for column in columns_replace:
    data[column].fillna(data[column].mean(), inplace = True)
    
print("Missing values after replacement:")
print(data.isnull().sum())

Missing values before replacement:
Pregnancies                  0
Glucose                      5
BloodPressure               35
BMI                         11
DiabetesPedigreeFunction     0
Age                          0
Outcome                      0
dtype: int64
Missing values after replacement:
Pregnancies                 0
Glucose                     0
BloodPressure               0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


# 2.	Then remove all existing outliers and get the final data for classification

In [4]:
def remove_outliers(df,columns):
    Q1= df[columns].quantile(0.25)
    Q3= df[columns].quantile(0.75)
    IQR= Q3 - Q1
    df_clean = df[~((df[columns] < (Q1 - 1.5 * IQR)) |(df[columns] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_clean

columns_to_check = ['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction']
data_clean = remove_outliers(data, columns_to_check)

print("Original dataset shape:", data.shape)
print("Cleaned dataset shape:", data_clean.shape)

Original dataset shape: (768, 7)
Cleaned dataset shape: (720, 7)


# 3.	Split the data into 70% training and 30% testing data. Then, create a logistic regression model with target variable as ‘Outcome’.
a.	Print the default model performance metrics: Accuracy, Precision, Recall, F1Score & AIC


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import statsmodels.api as sm

In [6]:
X = data.drop(columns='Outcome')
y = data['Outcome']

#data into 70% and 30% testing data
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=203)

# regression model
# Create a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

#prediction on test data
y_pred= model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

#calculation of performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

#print performance metrics
print("Accuracy:", accuracy*100)
print("Precision:", precision*100)
print("Recall:", recall*100)
print("F1 score:", f1*100)

Accuracy: 73.59307359307358
Precision: 57.971014492753625
Recall: 55.55555555555556
F1 score: 56.73758865248227


# AIC using statsmodels

In [7]:
X_train_sm = sm.add_constant(X_train) # Adding a constant for the intercept
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()

aic = result.aic

# Print AIC
print(f"AIC:",aic)

Optimization terminated successfully.
         Current function value: 0.451908
         Iterations 6
AIC: 499.3486853177712


In [8]:
# Observations:
->Accuracy is about 73.59%, which means that the model correctly classifies approximately 73.593% of the instances.
#->Precision is about 57.0%, indicating that out of all the positive predictions, around 57.0% are actually positive.
#->Recall is approximately 55.55%, meaning the model correctly identifies 55.55% of all actual positive instances.
#->F1-Score: The F1-score of 56.73% reflects the balance between precision and recall. It is a useful metric when you need to balance both false positives and false negatives.
#->AIC is 499.348, which is a measure used in model selection to indicate the relative quality of a statistical model. Lower AIC values generally indicate a better model fit.

SyntaxError: invalid syntax (2932973753.py, line 2)

# 4.	Plot a F1_score vs threshold curve. Find the threshold for which f1-score is the highest.


In [None]:
thresholds = np.arange(0.0, 1.0, 0.01)
f1_scores = [f1_score(y_test, y_pred_prob >= t) for t in thresholds]

# Plot F1 score vs threshold
plt.figure(figsize=(10, 6))
plt.plot(thresholds, f1_scores, label='F1 Score')
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Threshold')
plt.legend()
plt.grid(True)
plt.show()

# Find the threshold that gives the highest F1 score
best_threshold = thresholds[np.argmax(f1_scores)]
best_f1_score = max(f1_scores)

print(f'Best Threshold: {best_threshold}')
print(f'Best F1 Score: {best_f1_score}')