In [2]:
# Mount Google Drive into the Colab runtime so that files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive/')
# Folder on the mounted Drive that is used as the prefix for data file paths.
# NOTE(review): the folder name is spelled "Diatebes_Prediction"; it has to match the real
# Drive folder name exactly, so rename the folder first if the spelling is ever corrected.
data_path = "/content/drive/MyDrive/Colab Notebooks/Diatebes_Prediction/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
import pandas as pd

# Load the dataset from the Drive folder configured in `data_path`.
df = pd.read_csv(data_path + "diabetes.csv")

# View basic information of data.
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line after the report.
df.info()
print(df.head())
# Class balance of the target column.
print(df["Outcome"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8     

In [5]:
from scipy.stats import zscore

# A row is kept only if every scored column has |z| strictly below this threshold.
Z_THRESHOLD = 3

# NOTE: this cell rebinds `df`, so it is not idempotent. Re-running it without first
# re-running the cell that loads the CSV applies the outlier filter to data that has
# already been filtered once.

# Check for missing values
print(df.isnull().sum())

# Fill in missing numerical values with the column median.
# numeric_only=True keeps median() from failing if a non-numeric column is ever present.
df = df.fillna(df.median(numeric_only=True))

print("Before using Z-score")
print(df.describe())

# Using Z-score to detect outliers.
# The "Outcome" label is left out of the scoring: it is the prediction target, and for a
# strongly imbalanced binary target the minority class itself would exceed the threshold
# and be removed as "outliers".
numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns
scored_columns = numeric_columns.drop("Outcome", errors="ignore")
z_scores = zscore(df[scored_columns].to_numpy(dtype="float64"), axis=0)
abs_z_scores = abs(z_scores)

# Boolean mask with one entry per row; True means no scored column is an outlier.
keep_row = (abs_z_scores < Z_THRESHOLD).all(axis=1)
# .copy() makes the filtered frame independent of the original, so later column
# assignments on frames derived from it do not act on a slice.
df = df[keep_row].copy()

print("After using Z-score")
print(df.describe())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Before using Z-score
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   688.000000  688.000000     688.000000     688.000000  688.000000   
mean      3.845930  120.293605      72.345930      20.787791   72.507267   
std       3.279256   30.005790      12.312859      15.405391   90.106939   
min       0.000000   44.000000      24.000000       0.000000    0.000000   
25%       1.000000   99.000000      64.000000       0.000000    0.000000   
50%       3.000000  115.000000      72.000000      23.000000   43.500000   
75%       6.000000  139.000000      80.000000      32.000000  126.000000   
max      13.000000  199.000000     122.000000      60.000000  415.000000   

           

In [10]:
from sklearn.preprocessing import StandardScaler


# Feature columns, in the order they should appear in the processed frame.
feature_columns = ['Glucose','BMI','Age','Pregnancies','DiabetesPedigreeFunction','BloodPressure','Insulin','SkinThickness']

# Separate features and target columns.
# A single explicit selection plus .copy() gives X its own data, so the column assignment
# below writes into X itself rather than into a derived view of `df`
# (which can trigger SettingWithCopyWarning).
X = df[feature_columns].copy()
y = df["Outcome"]

# Standardized numerical features (StandardScaler: zero mean, unit variance per column)
scaler = StandardScaler()
numerical_features = X.select_dtypes(include=["float64", "int64"]).columns  # select only numeric features
X[numerical_features] = scaler.fit_transform(X[numerical_features])
# NOTE(review): the scaler is fitted on every row of `df`. If a train/test split is done
# downstream, fit it on the training rows only to avoid leaking test statistics -- confirm.

# Merge features and target columns (aligned on the shared row index)
df_p = pd.concat([X, y], axis=1)

print(df_p.head())

    Glucose       BMI       Age  Pregnancies  DiabetesPedigreeFunction  \
0  0.960533  0.237764  1.496973     0.667660                  0.695034   
1 -1.168393 -0.847429 -0.174926    -0.868097                 -0.341721   
2  2.143270 -1.359020 -0.086931     1.281963                  0.864069   
3 -1.033223 -0.614888 -1.054873    -0.868097                 -1.032890   
5 -0.120826 -1.002457 -0.262921     0.360509                 -0.905174   

   BloodPressure   Insulin  SkinThickness  Outcome  
0      -0.024713 -0.815112       0.936036        1  
1      -0.546843 -0.815112       0.544551        0  
2      -0.720886 -0.815112      -1.347628        1  
3      -0.546843  0.293655       0.153066        0  
5       0.149330 -0.815112      -1.347628        0  


In [11]:
# Rank every column by the absolute value of its correlation with the target column
correlation = (
    df_p.corr()["Outcome"]
    .abs()
    .sort_values(ascending=False)
)
print(correlation)

# Keep only the columns whose absolute correlation with "Outcome" is above 0.1.
# "Outcome" correlates 1.0 with itself, so the target column is retained as well.
selected_features = correlation.loc[correlation > 0.1].index
df_p = df_p.loc[:, selected_features]

# Recompute the ranking on the reduced frame to show what was kept
correlation = (
    df_p.corr()["Outcome"]
    .abs()
    .sort_values(ascending=False)
)
print()
print("After selecting features")
print(correlation)

# Last expression of the cell: rendered as the cell's rich output
df_p.head()

Outcome                     1.000000
Glucose                     0.489363
BMI                         0.280363
Age                         0.253404
Pregnancies                 0.230425
DiabetesPedigreeFunction    0.202910
BloodPressure               0.192617
Insulin                     0.099695
SkinThickness               0.036222
Name: Outcome, dtype: float64

After selecting features
Outcome                     1.000000
Glucose                     0.489363
BMI                         0.280363
Age                         0.253404
Pregnancies                 0.230425
DiabetesPedigreeFunction    0.202910
BloodPressure               0.192617
Name: Outcome, dtype: float64


Unnamed: 0,Outcome,Glucose,BMI,Age,Pregnancies,DiabetesPedigreeFunction,BloodPressure
0,1,0.960533,0.237764,1.496973,0.66766,0.695034,-0.024713
1,0,-1.168393,-0.847429,-0.174926,-0.868097,-0.341721,-0.546843
2,1,2.14327,-1.35902,-0.086931,1.281963,0.864069,-0.720886
3,0,-1.033223,-0.614888,-1.054873,-0.868097,-1.03289,-0.546843
5,0,-0.120826,-1.002457,-0.262921,0.360509,-0.905174,0.14933


In [12]:
from google.colab import files

# Write the processed frame to a CSV file (row index omitted) and then hand that same
# file to Colab's download helper so it is saved to the local machine.
output_csv = 'data1_p.csv'
df_p.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>