In [1]:
import pandas as pd

df = pd.read_csv("diabetes_cleaned.csv")
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.866045,-0.03199,0.670643,-0.181541,0.166619,0.468492,1.425995,1
1,-0.844885,-1.205066,-0.528319,-0.012301,-0.181541,-0.8522,-0.365061,-0.190672,0
2,1.23388,2.016662,-0.693761,-0.012301,-0.181541,-1.3325,0.604397,-0.105584,1
3,-0.844885,-1.073567,-0.528319,-0.695245,-0.540642,-0.633881,-0.920763,-1.041549,0
4,-1.141852,0.504422,-2.679076,0.670643,0.316566,1.549303,5.484909,-0.020496,1


In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix
corr_matrix = df.corr()

# Check for correlation above 0.8 or below -0.8 (strong)
strong_corr = corr_matrix[(corr_matrix > 0.8) & (corr_matrix < 1.0)]
print("Strong correlations:\n", strong_corr)


Strong correlations:
                           Pregnancies  Glucose  BloodPressure  SkinThickness  \
Pregnancies                       NaN      NaN            NaN            NaN   
Glucose                           NaN      NaN            NaN            NaN   
BloodPressure                     NaN      NaN            NaN            NaN   
SkinThickness                     NaN      NaN            NaN            NaN   
Insulin                           NaN      NaN            NaN            NaN   
BMI                               NaN      NaN            NaN            NaN   
DiabetesPedigreeFunction          NaN      NaN            NaN            NaN   
Age                               NaN      NaN            NaN            NaN   
Outcome                           NaN      NaN            NaN            NaN   

                          Insulin  BMI  DiabetesPedigreeFunction  Age  Outcome  
Pregnancies                   NaN  NaN                       NaN  NaN      NaN  
Glucose        

In [3]:
# High BMI + Glucose = potential risk
df['risk_score'] = df['BMI'] * df['Glucose']

# Age bins
df['age_group'] = pd.cut(df['Age'], bins=[20, 30, 40, 50, 60, 100], labels=False)

df[['risk_score', 'age_group']].head()


Unnamed: 0,risk_score,age_group
0,0.1443,
1,1.026957,
2,-2.687202,
3,0.680514,
4,0.781503,


In [4]:
df = pd.get_dummies(df, columns=['age_group'], drop_first=True)


In [5]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X = df.drop(columns=['Outcome'])
y = df['Outcome']

selector = SelectKBest(score_func=mutual_info_classif, k='all')
selector.fit(X, y)

feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Score': selector.scores_
}).sort_values(by='Score', ascending=False)

print(feature_scores)


                    Feature     Score
1                   Glucose  0.122812
5                       BMI  0.084051
7                       Age  0.069976
4                   Insulin  0.033909
0               Pregnancies  0.028489
2             BloodPressure  0.021526
6  DiabetesPedigreeFunction  0.009180
8                risk_score  0.007903
3             SkinThickness  0.000000


In [6]:
final_features = ['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'risk_score']
X_selected = df[final_features]
y = df['Outcome']


In [7]:
# Save the reduced feature dataset
X_selected['Outcome'] = y
X_selected.to_csv("diabetes_selected_features.csv", index=False)
print("Selected feature dataset saved!")


Selected feature dataset saved!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected['Outcome'] = y
