# 数据预处理

## 1 引入常用库

In [3]:
import pandas as pd 
import numpy as np 
import seaborn as sb 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## 2 使用Pandas加载并显示文件信息

In [4]:
# Read the dataset from the working directory into a DataFrame.
pima = pd.read_csv("diabetes.csv")
# Show each column's dtype, then the number of missing values per column.
print(pima.dtypes, pima.isna().sum(), sep="\n")

Pregnancies                   int64
Glucose                       int64
BloodPressure               float64
SkinThickness               float64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object
Pregnancies                 0
Glucose                     0
BloodPressure               1
SkinThickness               1
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    1
Age                         0
Outcome                     0
dtype: int64


## 3 数据查看

In [5]:
# Plain-text preview of the first ten rows.
print(pima.iloc[:10])

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148           72.0           35.0        0  33.6   
1            1       85           66.0           29.0        0  26.6   
2            8      183           64.0            0.0        0  23.3   
3            1       89           66.0           23.0       94  28.1   
4            0      137           40.0           35.0      168  43.1   
5            5      116           74.0            0.0        0  25.6   
6            3       78           50.0           32.0       88  31.0   
7           10      115            0.0            0.0        0  35.3   
8            2      197           70.0           45.0      543  30.5   
9            8      125           96.0            0.0        0   0.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   2

## 4 替换缺失值为0

In [7]:
# Replace missing values (NaN) with 0 in the selected measurement columns.
# DataFrame.replace returns a new object rather than modifying its input, so
# the result is written back into `pima`; without the assignment the NaNs
# would survive into every later cell (including the train/test split).
# NOTE(review): the null counts printed earlier also report one missing value
# in DiabetesPedigreeFunction, which is not in this list — confirm whether it
# should be filled as well.
zero_fill_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
pima[zero_fill_cols] = pima[zero_fill_cols].replace(np.nan, 0)
# Display the cleaned columns as the cell output.
pima[zero_fill_cols]

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI
0,148,72.0,35.0,0,33.6
1,85,66.0,29.0,0,26.6
2,183,64.0,0.0,0,23.3
3,89,66.0,23.0,94,28.1
4,137,40.0,35.0,168,43.1
...,...,...,...,...,...
763,101,76.0,48.0,180,32.9
764,122,70.0,27.0,0,36.8
765,121,72.0,23.0,112,26.2
766,126,60.0,0.0,0,30.1


## 5 将Pregnancies转化为分类变量

In [8]:
# Discretise Pregnancies into four ordered categories, labelled 0-3, in two ways.
# Fixed edges with left-closed intervals: [0, 1), [1, 5), [5, 10), [10, 250).
Pregnancies1 = pd.cut(
    pima['Pregnancies'],
    bins=[0, 1, 5, 10, 250],
    right=False,
    labels=[0, 1, 2, 3],
)
# Quantile-based: four bins with edges taken from the quartiles of the data.
Pregnancies2 = pd.qcut(pima['Pregnancies'], q=4, labels=[0, 1, 2, 3])
# Show the first ten codes from each scheme, one Series after the other.
print(Pregnancies1.head(10), Pregnancies2.head(10), sep="\n")

0    2
1    1
2    2
3    1
4    0
5    2
6    1
7    3
8    1
9    2
Name: Pregnancies, dtype: category
Categories (4, int64): [0 < 1 < 2 < 3]
0    2
1    0
2    3
3    0
4    0
5    2
6    1
7    3
8    1
9    3
Name: Pregnancies, dtype: category
Categories (4, int64): [0 < 1 < 2 < 3]


## 6 划分训练集和测试集

In [9]:
# Separate the target column from the feature columns.
y1 = pima['Outcome']
x1 = pima.drop('Outcome', axis=1)
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1,
    y1,
    test_size=0.25,
    random_state=0,
)
# Report the shapes of the full, training and test partitions.
print(
    x1.shape, x_train1.shape, x_test1.shape,
    y1.shape, y_train1.shape, y_test1.shape,
)

(768, 8) (576, 8) (192, 8) (768,) (576,) (192,)
