In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
USELESS_DATA_BOUNDARY= 0.3

In [3]:
class DataSet(object):
    def __init__(self, csv_path):
        self.data = pd.read_csv(csv_path)
        #print(self.data)w
        self.processed_data = self.data
        #self.missing_value_process()

        #self.raw_predictor = self.data.iloc[:,:-1].values
        #self.raw_response = self.data.iloc[:,-1].values

        self.useless_value_process()
        self.missing_value_process()
        self.tag_transfer()
        self.imputer_process()
        self.normalize()

    def useless_value_process(self):
        col_del = ['examide', 'citoglipton', 'glimepiride-pioglitazone','encounter_id','patient_nbr']
        self.processed_data.drop(col_del, axis=1, inplace = True)
    
    def missing_value_process(self):
        row_num = len(self.data)
        col_num = len(self.data.columns)
        cols_to_drop = []
        for col in range(col_num):
            qm_num = 0
            for row in range(row_num):
                if str(self.data.iat[row,col]) == "?":
                    qm_num += 1

            #print(qm_num/row_num)
            if qm_num/row_num >= USELESS_DATA_BOUNDARY:
                #print("true")
                cols_to_drop.append(self.data.columns[col])
        
        self.processed_data.drop(columns=cols_to_drop, axis=1, inplace = True)

    def imputer_process(self):
        # Replace the question marks with the most frequently appeared value for each feature
        imp = SimpleImputer(missing_values= -1, strategy='most_frequent')
        imp.fit(self.processed_data)


    def tag_transfer(self): 

        le = LabelEncoder()
        #self.processed_data.replace('?', -1, inplace=True)
        value_to_keep = "?"

        # 遍历DataFrame的每一列
        for column in self.processed_data.columns:
            # 如果列的数据类型是对象（通常意味着它是分类类型）
            if self.processed_data[column].dtype == 'object':
                # 对列应用 LabelEncoder
                values_to_encode = self.processed_data[column] != value_to_keep
                #original_column = self.processed_data[column].copy()
                # 对非特定值应用 LabelEncoder
                encoded_values = le.fit_transform(self.processed_data.loc[values_to_encode, column])
                self.processed_data[column] = le.fit_transform(self.processed_data[column])
                #self.processed_data[column] = pd.Series(encoded_values, index=self.processed_data.index[values_to_encode]).astype(int)
                #self.processed_data.loc[~values_to_encode, column] = original_column[~values_to_encode]
                self.processed_data[column].fillna(self.processed_data[column], inplace=True)

        #print(self.processed_data)
    #I have changed        

        #self.processed_data = self.raw_data.dropna()

    def normalize(self):

        # Normalize data using StandardScaler
        scaler = StandardScaler()
        normalized_data = scaler.fit_transform(self.processed_data)

        # Print normalized data
        print(normalized_data)




In [4]:
train_data = DataSet('diabetic_data.csv')
test_data = DataSet('diabetic_data_test.csv')

#train_data.data
train_data.processed_data, test_data.processed_data

train_data.processed_data.to_csv("test_csv.csv",sep=';',index=False)


[[ 0.42755739 -0.92739675 -3.82459963 ...  0.92659052 -1.82986799
   0.8368795 ]
 [ 0.42755739 -0.92739675 -3.19727694 ... -1.07922537  0.54648751
  -0.62496982]
 [-1.70370587 -0.92739675 -2.56995426 ...  0.92659052  0.54648751
   0.8368795 ]
 ...
 [ 0.42755739  1.07803147  0.56665918 ... -1.07922537  0.54648751
   0.8368795 ]
 [ 0.42755739 -0.92739675  1.19398187 ... -1.07922537  0.54648751
   0.8368795 ]
 [ 0.42755739  1.07803147  0.56665918 ...  0.92659052 -1.82986799
   0.8368795 ]]
[[ 0.43786174 -0.9237946  -3.83040391 ...  0.91850935 -1.81546287
   0.83385164]
 [-1.70347364 -0.9237946  -1.31839253 ...  0.91850935  0.55082371
  -0.63433316]
 [-2.77414133 -0.9237946  -0.69038969 ... -1.08872055  0.55082371
   0.83385164]
 ...
 [ 0.43786174  1.08249171 -1.31839253 ... -1.08872055  0.55082371
   0.83385164]
 [ 0.43786174 -0.9237946   1.19361885 ... -1.08872055  0.55082371
  -2.10251797]
 [-1.70347364 -0.9237946  -0.06238684 ... -1.08872055  0.55082371
  -0.63433316]]


In [11]:
# export the normalized data as an Excel file
df = pd.read_csv('test_csv.csv')
df.to_excel('normalized new dataset.xlsx')

In [9]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
train_data = pd.read_csv('diabetic_data.csv')
test_data = pd.read_csv('diabetic_data_test.csv')

predictor = train_data.iloc[:,:-1].values
response = train_data.iloc[:,-1].values

predictor,response 

(array([[149190, 55629189, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        [64410, 86047875, 'AfricanAmerican', ..., 'No', 'No', 'Yes'],
        [500364, 82442376, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        ...,
        [443854148, 41088789, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        [443857166, 31693671, 'Caucasian', ..., 'No', 'Ch', 'Yes'],
        [443867222, 175429310, 'Caucasian', ..., 'No', 'No', 'No']],
       dtype=object),
 array(['>30', 'NO', 'NO', ..., 'NO', 'NO', 'NO'], dtype=object))