In [35]:
import pandas as pd       # For data manipulation
import numpy as np        # For numerical operations
import matplotlib.pyplot as plt  # For plotting (optional)
import seaborn as sns     # For visualization (optional)


In [36]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv("Data1.csv")
df.head()  # Show the first few rows


Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [37]:
# View structure and basic info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    object 
dtypes: float64(7), int64(2), object(1)
memory usage: 9.2+ KB


In [38]:
# Find number of missing values per column
df.isnull().sum()


Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64

In [39]:
# Summary of all columns (numeric and non-numeric)
df.describe(include='all')


Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
unique,,,,,,,,,,3.0
top,,,,,,,,,,2.0
freq,,,,,,,,,,64.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474283,6.881763,269.97825,
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,


In [40]:
df.dtypes



Age                 int64
BMI               float64
Glucose             int64
Insulin           float64
HOMA              float64
Leptin            float64
Adiponectin       float64
Resistin          float64
MCP.1             float64
Classification     object
dtype: object

In [42]:
# Fill missing numeric values with median
df['Age'].fillna(df['Age'].median(), inplace=True)
df['HOMA'].fillna(df['HOMA'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['HOMA'].fillna(df['HOMA'].median(), inplace=True)


In [45]:
# Convert 'Country' and 'Purchased' to categorical types
df['Age'] = df['Age'].astype('category')
df['HOMA'] = df['HOMA'].astype('category')


In [46]:
df.dtypes


Age               category
BMI                float64
Glucose              int64
Insulin            float64
HOMA              category
Leptin             float64
Adiponectin        float64
Resistin           float64
MCP.1              float64
Classification      object
dtype: object

In [47]:
# Use one-hot encoding and drop first to avoid dummy variable trap
df_encoded = pd.get_dummies(df, columns=['Age', 'HOMA'], drop_first=True)
df_encoded.head()


Unnamed: 0,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin,MCP.1,Classification,Age_25,Age_28,...,HOMA_6.777364,HOMA_7.0029234,HOMA_7.111918,HOMA_7.836205333,HOMA_8.225983067,HOMA_9.736007333,HOMA_13.22733227,HOMA_15.28534133,HOMA_20.6307338,HOMA_25.05034187
0,23.5,70,2.707,8.8071,9.7024,7.99585,417.114,1,False,False,...,False,False,False,False,False,False,False,False,False,False
1,20.690495,92,3.115,8.8438,5.429285,4.06405,468.786,1,False,False,...,False,False,False,False,False,False,False,False,False,False
2,23.12467,91,4.498,17.9393,22.43204,9.27715,554.697,1,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21.367521,77,3.226,9.8827,7.16956,12.766,928.22,1,False,False,...,False,False,False,False,False,False,False,False,False,False
4,21.111111,92,3.549,6.6994,4.81924,10.57635,773.92,1,False,False,...,False,False,False,False,False,False,False,False,False,False


In [48]:
# Save cleaned and encoded data to a new CSV file
df_encoded.to_csv("Cleaned_Data.csv", index=False)
