In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Diabetes

In [34]:
df = pd.read_csv(r"data/original/CVD_cleaned.csv")
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [35]:
actual_types = df.applymap(type)

# Display a summary of the unique types for each column
column_types = df.apply(lambda x: x.apply(type).unique())
print(column_types)

  General_Health        Checkup       Exercise  Heart_Disease    Skin_Cancer  \
0  <class 'str'>  <class 'str'>  <class 'str'>  <class 'str'>  <class 'str'>   

    Other_Cancer     Depression       Diabetes      Arthritis            Sex  \
0  <class 'str'>  <class 'str'>  <class 'str'>  <class 'str'>  <class 'str'>   

    Age_Category      Height_(cm)      Weight_(kg)              BMI  \
0  <class 'str'>  <class 'float'>  <class 'float'>  <class 'float'>   

  Smoking_History Alcohol_Consumption Fruit_Consumption  \
0   <class 'str'>     <class 'float'>   <class 'float'>   

  Green_Vegetables_Consumption FriedPotato_Consumption  
0              <class 'float'>         <class 'float'>  


In [36]:
for column in df.columns:
    print(f"* {column}:")
    print(f"\t unique value = {df[column].unique()}")

* General_Health:
	 unique value = ['Poor' 'Very Good' 'Good' 'Fair' 'Excellent']
* Checkup:
	 unique value = ['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']
* Exercise:
	 unique value = ['No' 'Yes']
* Heart_Disease:
	 unique value = ['No' 'Yes']
* Skin_Cancer:
	 unique value = ['No' 'Yes']
* Other_Cancer:
	 unique value = ['No' 'Yes']
* Depression:
	 unique value = ['No' 'Yes']
* Diabetes:
	 unique value = ['No' 'Yes' 'No, pre-diabetes or borderline diabetes'
 'Yes, but female told only during pregnancy']
* Arthritis:
	 unique value = ['Yes' 'No']
* Sex:
	 unique value = ['Female' 'Male']
* Age_Category:
	 unique value = ['70-74' '60-64' '75-79' '80+' '65-69' '50-54' '45-49' '18-24' '30-34'
 '55-59' '35-39' '40-44' '25-29']
* Height_(cm):
	 unique value = [150. 165. 163. 180. 191. 183. 175. 160. 168. 178. 152. 157. 188. 185.
 170. 173. 155. 193. 196. 206. 198. 140. 135. 145. 147. 142. 201. 218.
 124. 203. 137. 122. 216. 224. 

In [37]:
# Replace specific values to simplify them
df['Diabetes'] = df['Diabetes'].replace({
    'No, pre-diabetes or borderline diabetes': 'No',
    'Yes, but female told only during pregnancy': 'Yes'
})

In [48]:
# # 1. Separate categorical (string) and numerical (float) columns
# categorical_cols = df.select_dtypes(include='object').columns
# numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
#
# # 2. One-hot encode the categorical columns
# df_onehot = pd.get_dummies(df[categorical_cols], drop_first=False)
#
# # 3. Concatenate numeric columns with the one-hot encoded categorical columns
# df_final = pd.concat([df[numeric_cols], df_onehot], axis=1)
#
# # 4. Display the new DataFrame
# df_final.head()
#
# # Save converted data
# df_final.to_csv('converted_data.csv', index=False)

In [38]:
# 1. Separate categorical (string) and numerical (float) columns
categorical_cols = df.select_dtypes(include='object').columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# 2. Initialize an empty DataFrame to store the processed categorical columns
df_processed_categorical = pd.DataFrame()

# 3. Iterate through each categorical column and decide how to encode it
for col in categorical_cols:
    if df[col].nunique() == 2:  # If the column has only two unique values
        # Use LabelEncoder for binary columns (0/1)
        le = LabelEncoder()
        df_processed_categorical[col] = le.fit_transform(df[col])
    else:
        # Use one-hot encoding for columns with more than two categories
        df_onehot = pd.get_dummies(df[col], prefix=col, drop_first=False)
        df_processed_categorical = pd.concat([df_processed_categorical, df_onehot], axis=1)

for col in numeric_cols:
    df[col] = df[col] / df[col].max()

# 4. Concatenate numeric columns with the processed categorical columns
df_final = pd.concat([df[numeric_cols], df_processed_categorical], axis=1)

# 5. Display the new DataFrame
df_final.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health_Excellent,General_Health_Fair,General_Health_Good,...,Age_Category_40-44,Age_Category_45-49,Age_Category_50-54,Age_Category_55-59,Age_Category_60-64,Age_Category_65-69,Age_Category_70-74,Age_Category_75-79,Age_Category_80+,Smoking_History
0,0.622407,0.11146,0.146381,0.0,0.25,0.125,0.09375,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.684647,0.263156,0.284808,0.0,0.25,0.0,0.03125,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.676349,0.301857,0.336958,0.133333,0.1,0.023438,0.125,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.746888,0.318886,0.289238,0.0,0.25,0.234375,0.0625,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.792531,0.301857,0.245344,0.0,0.066667,0.03125,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,1


In [40]:
df_positive = df_final[df_final['Diabetes'] == 1]
df_negative = df_final[df_final['Diabetes'] == 0]
print(len(df_positive))
print(len(df_negative))

42817
266037


In [41]:
sample_from_each = 10000
df_final = pd.concat([df_positive.sample(n=sample_from_each, random_state=1), df_negative.sample(n=sample_from_each, random_state=1)])

In [42]:
# 6. Save the converted data
df_final.to_csv('data/converted_diabetes_data.csv', index=False)

# Glass Dataset

In [18]:
glass_df = pd.read_csv('data/original/glass.csv')
glass_df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,4
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,4
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,4
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,4


In [19]:
glass_df['Type'] -= 1

In [20]:
range_df = glass_df.agg(['min', 'max'])
range_df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,6


In [21]:
for col in glass_df.columns[:-1]:
    glass_df[col] = glass_df[col] / glass_df[col].max()

In [22]:
glass_df.to_csv('data/converted_glass_data.csv', index=False)

# car_evaluation

In [4]:
car_df = pd.read_csv("data/original/car_evaluation.csv")
car_df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [5]:
for column in car_df.columns:
    print(f"* {column}:")
    print(f"\t unique value = {car_df[column].unique()}")

* vhigh:
	 unique value = ['vhigh' 'high' 'med' 'low']
* vhigh.1:
	 unique value = ['vhigh' 'high' 'med' 'low']
* 2:
	 unique value = ['2' '3' '4' '5more']
* 2.1:
	 unique value = ['2' '4' 'more']
* small:
	 unique value = ['small' 'med' 'big']
* low:
	 unique value = ['med' 'high' 'low']
* unacc:
	 unique value = ['unacc' 'acc' 'vgood' 'good']


In [6]:
converted_car_df = pd.DataFrame()
for col in car_df.columns[:-1]:
    df_onehot = pd.get_dummies(car_df[col], prefix=col, drop_first=False)
    converted_car_df = pd.concat([converted_car_df, df_onehot], axis=1)

converted_car_df = pd.concat([converted_car_df, car_df['unacc']], axis=1)
converted_car_df['unacc'] = pd.factorize(converted_car_df['unacc'])[0]

In [7]:
converted_car_df.head()

Unnamed: 0,vhigh_high,vhigh_low,vhigh_med,vhigh_vhigh,vhigh.1_high,vhigh.1_low,vhigh.1_med,vhigh.1_vhigh,2_2,2_3,...,2.1_2,2.1_4,2.1_more,small_big,small_med,small_small,low_high,low_low,low_med,unacc
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,0
1,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
3,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,0
4,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0


In [9]:
counts = converted_car_df['unacc'].value_counts()
counts

0    1209
1     384
3      69
2      65
Name: unacc, dtype: int64

In [10]:
converted_car_df.to_csv('data/converted_car_data.csv', index=False)

# winequality-red

In [23]:
wine_df = pd.read_csv("data/original/winequality-red.csv")
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [24]:
for col in wine_df.columns[:-1]:
    wine_df[col] = wine_df[col] / wine_df[col].max()

wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.465409,0.443038,0.0,0.122581,0.124386,0.152778,0.117647,0.994132,0.875312,0.28,0.630872,5
1,0.490566,0.556962,0.0,0.167742,0.160393,0.347222,0.231834,0.993135,0.798005,0.34,0.657718,5
2,0.490566,0.481013,0.04,0.148387,0.150573,0.208333,0.186851,0.993335,0.812968,0.325,0.657718,5
3,0.704403,0.177215,0.56,0.122581,0.12275,0.236111,0.207612,0.994331,0.78803,0.29,0.657718,6
4,0.465409,0.443038,0.0,0.122581,0.124386,0.152778,0.117647,0.994132,0.875312,0.28,0.630872,5


In [31]:
wine_df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [35]:
wine_df = wine_df[wine_df['quality'].isin([5, 6, 7])]
wine_df['quality'] -= 5

In [36]:
wine_df['quality'].value_counts()

0    681
1    638
2    199
Name: quality, dtype: int64

In [38]:
wine_df.to_csv('data/converted_wine_data.csv', index=False)