In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the diabetes CSV from the Colab Drive path below and preview the
# first rows as the cell output.
csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Positional split: every column except the last is a feature, and the
# last column is the prediction target.
last_col = df.shape[1] - 1
X = df.iloc[:, :last_col]
y = df.iloc[:, last_col]

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


# Missing Values Ratio

In [None]:
# Zeros are counted as missing readings here. The first feature column is
# skipped by the positional slice, so it is not part of the table.
x = X.iloc[:, 1:]

# Percentage of rows that hold a zero, per column.
zero_pct = x.eq(0).sum() / len(x) * 100

data = pd.DataFrame({
    'Column Name': zero_pct.index,
    'Missing Percentage': zero_pct.to_numpy(),
})
data

Unnamed: 0,Column Name,Missing Percentage
0,Glucose,0.651042
1,BloodPressure,4.557292
2,SkinThickness,29.557292
3,Insulin,48.697917
4,BMI,1.432292
5,DiabetesPedigreeFunction,0.0
6,Age,0.0


In [None]:
# Drop every column whose zero-value share is above 30 percent.
too_sparse = data['Missing Percentage'] > 30
column_drop = data.loc[too_sparse, 'Column Name'].values

X_cleaned = X.drop(columns=column_drop)
X_cleaned.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,33.6,0.627,50
1,1,85,66,29,26.6,0.351,31
2,8,183,64,0,23.3,0.672,32
3,1,89,66,23,28.1,0.167,21
4,0,137,40,35,43.1,2.288,33


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Baseline: hold out 20% of the rows (fixed seed) and fit logistic
# regression on the full feature set in X.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [None]:
# Score the baseline classifier on the held-out rows.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
# Each remaining report is printed as a heading line followed by its body.
for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(body)

Accuracy: 74.68%
Confusion Matrix:
[[78 21]
 [18 37]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
           1       0.64      0.67      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Evaluate logistic regression on the feature set left after the
# missing-value filter (X_cleaned).
#
# The split is rebuilt from X_cleaned so that the dropped columns are really
# absent from training and scoring. Dedicated variable names are used so the
# baseline X_train / X_test split stays untouched for any cell that reads it.
# The same test_size and random_state as the baseline keep the held-out rows
# comparable.
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(
    X_cleaned, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_cleaned, y_train_cleaned)
y_pred = model.predict(X_test_cleaned)

# Check accuracy
accuracy = accuracy_score(y_test_cleaned, y_pred)
print(f'Accuracy after removing features with > 30% missing values: {accuracy}')

Accuracy after removing features with > 30% missing values: 0.7467532467532467


# High Correlation Filter

In [None]:
# Absolute pairwise correlations between all columns of df; the sign is
# discarded because only the strength of the relationship is inspected.
df.corr(method='pearson').abs()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,0.081672,0.073535,0.017683,0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,0.11397,0.074752
Insulin,0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,0.11397,0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [None]:
# High correlation filter: drop one feature out of every pair whose absolute
# correlation exceeds 0.8, then refit and score logistic regression.

# Correlate the features only. Including the target would let 'Outcome'
# itself be flagged and dropped, which would break the selection below.
corr_matrix = df.drop(columns='Outcome').corr().abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (always 1.0) is ignored.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

high_corr_features = [column for column in upper.columns if (upper[column] > 0.8).any()]

df_reduced = df.drop(columns=high_corr_features)

X = df_reduced.drop('Outcome', axis=1)
y = df_reduced['Outcome']

# Re-split from the reduced frame; without this the model would be trained on
# whatever split an earlier cell left behind instead of the filtered features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mdl = LogisticRegression(max_iter=1000)
mdl.fit(X_train, y_train)
y_pred = mdl.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after removing highly correlated features: {accuracy}')

Accuracy after removing highly correlated features: 0.7467532467532467


# Low Variance Filter

In [None]:
from sklearn.preprocessing import normalize

# Low variance filter: drop features whose variance (after normalisation)
# falls below 0.0006, then refit and score logistic regression.
y = df['Outcome']
dF = df.iloc[:, :-1]

# NOTE(review): per the scikit-learn docs, normalize() rescales each row
# (sample) by default rather than each column -- confirm this is the intended
# normalisation before comparing column variances.
data_normalized = pd.DataFrame(normalize(dF), columns=dF.columns)

variances = data_normalized.var()
data_variance = pd.DataFrame({
    'Column Name': variances.index,
    'Variance': variances.to_numpy(),
})

low_variance = data_variance['Variance'] < 0.0006
columns_to_drop = data_variance.loc[low_variance, 'Column Name'].values

data_reduced = dF.drop(columns=columns_to_drop)

X = data_reduced

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after low variance filter: {accuracy}')

Accuracy after low variance filter: 0.7532467532467533


# Forward Feature Selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward sequential selection with logistic regression, scored by 5-fold
# cross-validated accuracy on the training rows only.
X = df.drop(columns='Outcome')
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)

# NOTE(review): with n_features_to_select='auto' and no `tol`, the
# scikit-learn docs say half of the features are kept, so the count printed
# as "Optimal" below may simply be that fixed fraction -- confirm.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    direction='forward',
    scoring='accuracy',
    cv=5,
)
sfs.fit(X_train, y_train)

selected_features = X.columns[sfs.get_support()]
print("Selected Features:", selected_features)

# Reduce both partitions to the selected columns.
X_train_selected, X_test_selected = (
    sfs.transform(part) for part in (X_train, X_test)
)

# Refit on the reduced training set and predict the held-out rows.
y_pred = model.fit(X_train_selected, y_train).predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with selected features: {accuracy}')
print("Optimal number of features:", sfs.n_features_to_select_)

Selected Features: Index(['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction'], dtype='object')
Accuracy with selected features: 0.7727272727272727
Optimal number of features: 4


# Backward Feature Elimination

In [None]:
# Backward sequential elimination with logistic regression, scored by 5-fold
# cross-validated accuracy on the training rows only.
X = df.drop(columns='Outcome')
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)

sfs = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    direction='backward',
    scoring='accuracy',
    cv=5,
)
sfs.fit(X_train, y_train)

# The inverted support mask marks the columns that were eliminated.
removed_features = X.columns[~sfs.get_support()]
print("Removed Features:", removed_features)

# Reduce both partitions to the surviving columns.
X_train_selected, X_test_selected = (
    sfs.transform(part) for part in (X_train, X_test)
)

# Refit on the reduced training set and predict the held-out rows.
y_pred = model.fit(X_train_selected, y_train).predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with selected features: {accuracy}')

Removed Features: Index(['Pregnancies', 'BloodPressure', 'SkinThickness', 'Insulin'], dtype='object')
Accuracy with selected features: 0.7727272727272727


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Rank features with a random forest, keep the five most important ones, and
# score a second forest trained on just those columns.
X = df.drop(columns='Outcome')
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Sort feature positions by importance, largest first.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

top_5_features = X_train.columns[indices[:5]]

# Restrict both partitions to the top 5 features.
X_train_rf_top5, X_test_rf_top5 = X_train[top_5_features], X_test[top_5_features]

rfc = RandomForestClassifier(random_state=42)
y_pred = rfc.fit(X_train_rf_top5, y_train).predict(X_test_rf_top5)

print("Accuracy with top 5 important features:", accuracy_score(y_test, y_pred))

Accuracy with top 5 important features: 0.7792207792207793
