In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('./Diabetes.csv')

In [3]:
print(df.head())

   pregnancies  glucose  diastolic  triceps  insulin   bmi    dpf  age  \
0            6      148         72       35        0  33.6  0.627   50   
1            1       85         66       29        0  26.6  0.351   31   
2            8      183         64        0        0  23.3  0.672   32   
3            1       89         66       23       94  28.1  0.167   21   
4            0      137         40       35      168  43.1  2.288   33   

   diabetes  
0         1  
1         0  
2         1  
3         0  
4         1  


In [4]:
print(df.isnull().sum())

pregnancies    0
glucose        0
diastolic      0
triceps        0
insulin        0
bmi            0
dpf            0
age            0
diabetes       0
dtype: int64


In [7]:
print(df.columns)

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')


In [9]:
columns_to_replace = ['glucose', 'diastolic', 'triceps', 'insulin', 'bmi']  # Corrected capitalization
df[columns_to_replace] = df[columns_to_replace].replace(0, pd.NA)
df.fillna(df.mean(), inplace=True)

  df.fillna(df.mean(), inplace=True)


In [10]:
print(df.dtypes)

pregnancies      int64
glucose        float64
diastolic      float64
triceps        float64
insulin        float64
bmi            float64
dpf            float64
age              int64
diabetes         int64
dtype: object


In [12]:
X = df.drop('diabetes', axis=1)
y = df['diabetes']


In [13]:
from sklearn.preprocessing import StandardScaler

# Assuming 'X' is your feature matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Now you can use X_scaled
print(X_scaled)

[[ 0.63994726  0.86510807 -0.03351824 ...  0.16629174  0.46849198
   1.4259954 ]
 [-0.84488505 -1.20616153 -0.52985903 ... -0.85253118 -0.36506078
  -0.19067191]
 [ 1.23388019  2.0158134  -0.69530596 ... -1.33283341  0.60439732
  -0.10558415]
 ...
 [ 0.3429808  -0.0225789  -0.03351824 ... -0.91074963 -0.68519336
  -0.27575966]
 [-0.84488505  0.14180757 -1.02619983 ... -0.34311972 -0.37110101
   1.17073215]
 [-0.84488505 -0.94314317 -0.19896517 ... -0.29945588 -0.47378505
  -0.87137393]]


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [15]:
print(f'Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}')

Training set shape: (614, 8), Testing set shape: (154, 8)


In [16]:
df['BMI_Glucose_Interaction'] = df['bmi'] * df['glucose']

In [17]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False) 
poly_features = poly.fit_transform(df[['glucose', 'bmi']]) 
poly_feature_names = poly.get_feature_names_out(['glucose', 'bmi']) 
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)

In [18]:
df = pd.concat([df, poly_df], axis=1)

In [20]:
df['Triceps_Diastolic_Sum'] = df['triceps'] + df['diastolic']

In [21]:
bins = [0, 30, 40, 50, 60, 70, 80, 90] 
labels = ['<30', '30-39', '40-49', '50-59', '60-69', '70-79', '80+'] 
df['Age_Group'] = pd.cut(df['age'], bins=bins, labels=labels)

In [22]:
import numpy as np

df['log_insulin'] = np.log1p(df['insulin']) 
df['log_pregnancies'] = np.log1p(df['pregnancies'])

In [23]:
df['DPF_Age_Interaction'] = df['dpf'] * df['age']

In [24]:
print(df.head())

   pregnancies  glucose  diastolic   triceps     insulin   bmi    dpf  age  \
0            6    148.0       72.0  35.00000  155.548223  33.6  0.627   50   
1            1     85.0       66.0  29.00000  155.548223  26.6  0.351   31   
2            8    183.0       64.0  29.15342  155.548223  23.3  0.672   32   
3            1     89.0       66.0  23.00000   94.000000  28.1  0.167   21   
4            0    137.0       40.0  35.00000  168.000000  43.1  2.288   33   

   diabetes  BMI_Glucose_Interaction  glucose   bmi  glucose^2  glucose bmi  \
0         1                   4972.8    148.0  33.6    21904.0       4972.8   
1         0                   2261.0     85.0  26.6     7225.0       2261.0   
2         1                   4263.9    183.0  23.3    33489.0       4263.9   
3         0                   2500.9     89.0  28.1     7921.0       2500.9   
4         1                   5904.7    137.0  43.1    18769.0       5904.7   

     bmi^2  Triceps_Diastolic_Sum Age_Group  log_insulin