In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [17]:
import pandas as pd
diabetes = pd.read_csv('Diabetes.csv')

print("\n HEAD \n")
print(diabetes.head())

print("\n INFO \n ")
print(diabetes.info())

print("\n DESCRIBE\n ")
print(diabetes.describe())

print("\n COUNT \n")
total_count = diabetes['Urea'].value_counts()
print(total_count)

print("\n MISSING VALUE \n ")
missval = diabetes.isnull().sum()
print(missval[missval > 0])


 HEAD 

    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  

 INFO 
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   No_Pation  1000 non-null   int64  
 2   Gender     1000 non-null   object 
 3   AGE        1000 non-null   int64  
 4   Urea       1000 non-null   float64
 5   Cr         1000 non-null   int6

In [32]:
df_copy = diabetes

# Initialize OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=[["M", "F", "f"]])
# Fit and transform the data
df_copy["Gender_Encoded"] = ordinal_encoder.fit_transform(df_copy[["Gender"]])

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder()

# Fit and transform the "City" column
encoded_data = onehot_encoder.fit_transform(diabetes[["CLASS"]])

# Convert the sparse matrix to a dense array
encoded_array = encoded_data.toarray()

# Convert to DataFrame for better visualization
encoded_df = pd.DataFrame(encoded_array, columns=onehot_encoder.get_feature_names_out(["CLASS"]))
df_encoded = pd.concat([df_copy, encoded_df], axis=1)

df_encoded.drop("Gender", axis=1, inplace=True)
df_encoded.drop("CLASS", axis=1, inplace=True)

print(df_encoded. head())

    ID  No_Pation  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  \
0  502      17975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
1  735      34221   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6  23.0   
2  420      47975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
3  680      87656   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
4  504      34223   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4  21.0   

   Gender_Encoded  CLASS_N  CLASS_N   CLASS_P  CLASS_Y  CLASS_Y   
0             1.0      1.0       0.0      0.0      0.0       0.0  
1             0.0      1.0       0.0      0.0      0.0       0.0  
2             1.0      1.0       0.0      0.0      0.0       0.0  
3             1.0      1.0       0.0      0.0      0.0       0.0  
4             0.0      1.0       0.0      0.0      0.0       0.0  


In [36]:
#Data Transformation
# Min-Max Scaler/Normalization (range 0-1)
normalizer = MinMaxScaler()
df_encoded[['Cr']] = normalizer.fit_transform(df_encoded[['Cr']])
df_encoded.head()

Unnamed: 0,ID,No_Pation,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Gender_Encoded,CLASS_N,CLASS_N.1,CLASS_P,CLASS_Y,CLASS_Y.1
0,502,17975,50,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0
1,735,34221,26,4.5,0.070529,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0.0,1.0,0.0,0.0,0.0,0.0
2,420,47975,50,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0
3,680,87656,50,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0
4,504,34223,33,7.1,0.050378,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0.0,1.0,0.0,0.0,0.0,0.0


In [37]:
# Standardization (mean=0, variance=1)
scaler = StandardScaler()
df_encoded[['AGE']] = scaler.fit_transform(df_encoded[['AGE']])
df_encoded.head()

Unnamed: 0,ID,No_Pation,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Gender_Encoded,CLASS_N,CLASS_N.1,CLASS_P,CLASS_Y,CLASS_Y.1
0,502,17975,-0.401144,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0
1,735,34221,-3.130017,4.5,0.070529,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0.0,1.0,0.0,0.0,0.0,0.0
2,420,47975,-0.401144,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0
3,680,87656,-0.401144,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0
4,504,34223,-2.334096,7.1,0.050378,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0.0,1.0,0.0,0.0,0.0,0.0


In [38]:
#Removing Outliers
# Outlier Detection and Treatment using IQR
df_encoded_copy1=df_encoded
df_encoded_copy2=df_encoded
df_encoded_copy3=df_encoded

Q1 = df_encoded_copy1['Cr'].quantile(0.25)
Q3 = df_encoded_copy1['Cr'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_encoded_copy1['Cr'] = np.where(df_encoded_copy1['Cr'] > upper_bound, upper_bound,
                        np.where(df_encoded_copy1['Cr'] < lower_bound, lower_bound, df_encoded_copy1['Cr']))

print(df_encoded_copy1.head())

    ID  No_Pation       AGE  Urea        Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221 -3.130017   4.5  0.070529    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223 -2.334096   7.1  0.050378    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI  Gender_Encoded  CLASS_N  CLASS_N   CLASS_P  CLASS_Y  CLASS_Y   
0  24.0             1.0      1.0       0.0      0.0      0.0       0.0  
1  23.0             0.0      1.0       0.0      0.0      0.0       0.0  
2  24.0             1.0      1.0       0.0      0.0      0.0       0.0  
3  24.0             1.0      1.0       0.0      0.0      0.0       0.0  
4  21.0             0.0      1.0       0.0      0.0      0.0       0.0  


In [39]:
#Removing Outliers
# Z-score method

df_encoded_copy2['Cr_zscore'] = stats.zscore(df_encoded_copy2['Cr'])
df_encoded_copy2['Cr'] = np.where(df_encoded_copy2['Cr_zscore'].abs() > 3, np.nan, df_encoded_copy2['Cr'])  # Replace outliers with NaN
print(df_encoded_copy2.head())

    ID  No_Pation       AGE  Urea        Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221 -3.130017   4.5  0.070529    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223 -2.334096   7.1  0.050378    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI  Gender_Encoded  CLASS_N  CLASS_N   CLASS_P  CLASS_Y  CLASS_Y   \
0  24.0             1.0      1.0       0.0      0.0      0.0       0.0   
1  23.0             0.0      1.0       0.0      0.0      0.0       0.0   
2  24.0             1.0      1.0       0.0      0.0      0.0       0.0   
3  24.0             1.0      1.0       0.0      0.0      0.0       0.0   
4  21.0             0.0      1.0       0.0      0.0      0.0       0.0   

   Cr_zscore  
0  -0.805658  
1  -0.017005  
2  -0.805658  
3  -0.80

In [40]:
#Removing Outliers
# Median replacement for outliers
df_encoded_copy3['Cr_zscore'] = stats.zscore(df_encoded_copy3['Cr'])
median_salary = df_encoded_copy3['Cr'].median()
df_encoded_copy3['Cr'] = np.where(df_encoded_copy3['Cr_zscore'].abs() > 3, median_salary, df_encoded_copy3['Cr'])
print(df_encoded_copy3.head())

    ID  No_Pation       AGE  Urea        Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221 -3.130017   4.5  0.070529    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656 -0.401144   4.7  0.050378    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223 -2.334096   7.1  0.050378    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI  Gender_Encoded  CLASS_N  CLASS_N   CLASS_P  CLASS_Y  CLASS_Y   \
0  24.0             1.0      1.0       0.0      0.0      0.0       0.0   
1  23.0             0.0      1.0       0.0      0.0      0.0       0.0   
2  24.0             1.0      1.0       0.0      0.0      0.0       0.0   
3  24.0             1.0      1.0       0.0      0.0      0.0       0.0   
4  21.0             0.0      1.0       0.0      0.0      0.0       0.0   

   Cr_zscore  
0  -0.805658  
1  -0.017005  
2  -0.805658  
3  -0.80

In [18]:
import pandas as pd

adult = pd.read_csv('adult.csv')

print("\n HEAD \n")
print(adult.head())

print("\n INFO \n ")
print(adult.info())

print("\n DESCRIBE\n ")
print(adult.describe())

print("\n COUNT \n")
total_count = adult['age'].value_counts()
print(total_count)

print("\n MISSING VALUE \n ")
missval = adult.isnull().sum()
print(missval[missval > 0])


 HEAD 

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-coun

In [42]:
df_copy1 = adult

# Initialize OrdinalEncoder
ordinal_encoder1 = OrdinalEncoder(categories=[["Black", "White", 'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Other']])
# Fit and transform the data
df_copy1["race_Encoded"] = ordinal_encoder1.fit_transform(df_copy1[["race"]])

# Initialize OneHotEncoder
onehot_encoder1 = OneHotEncoder()

# Fit and transform the "City" column
encoded_data1 = onehot_encoder1.fit_transform(adult[["relationship"]])

# Convert the sparse matrix to a dense array
encoded_array1 = encoded_data1.toarray()

# Convert to DataFrame for better visualization
encoded_df1 = pd.DataFrame(encoded_array1, columns=onehot_encoder1.get_feature_names_out(["relationship"]))
df_encoded1 = pd.concat([df_copy1, encoded_df1], axis=1)

df_encoded1.drop("race", axis=1, inplace=True)
df_encoded1.drop("relationship", axis=1, inplace=True)

print(df_encoded1. head())

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation  gender  capital-gain  capital-loss  hours-per-week  \
0  Machine-op-inspct    Male             0             0              40   
1    Farming-fishing    Male             0             0              50   
2    Protective-serv    Male             0             0              40   
3  Machine-op-inspct    Male          7688             0              40   
4                  ?  Female             0             0              30   

  native-country income  race_Encoded  relationship_Husband  \

In [43]:
#Data Transformation
# Min-Max Scaler/Normalization (range 0-1)
normalizer1 = MinMaxScaler()
df_encoded1[['fnlwgt']] = normalizer1.fit_transform(df_encoded1[['fnlwgt']])
df_encoded1.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,gender,capital-gain,capital-loss,hours-per-week,native-country,income,race_Encoded,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
0,25,Private,0.145129,11th,7,Never-married,Machine-op-inspct,Male,0,0,40,United-States,<=50K,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,Private,0.052451,HS-grad,9,Married-civ-spouse,Farming-fishing,Male,0,0,50,United-States,<=50K,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,28,Local-gov,0.219649,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Male,0,0,40,United-States,>50K,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,44,Private,0.100153,Some-college,10,Married-civ-spouse,Machine-op-inspct,Male,7688,0,40,United-States,>50K,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,18,?,0.061708,Some-college,10,Never-married,?,Female,0,0,30,United-States,<=50K,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [44]:
# Standardization (mean=0, variance=1)
scaler = StandardScaler()
df_encoded[['AGE']] = scaler.fit_transform(df_encoded[['AGE']])
df_encoded.head()

Unnamed: 0,ID,No_Pation,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Gender_Encoded,CLASS_N,CLASS_N.1,CLASS_P,CLASS_Y,CLASS_Y.1,Cr_zscore
0,502,17975,-0.401144,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0,-0.805658
1,735,34221,-3.130017,4.5,0.070529,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.017005
2,420,47975,-0.401144,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0,-0.805658
3,680,87656,-0.401144,4.7,0.050378,4.9,4.2,0.9,2.4,1.4,0.5,24.0,1.0,1.0,0.0,0.0,0.0,0.0,-0.805658
4,504,34223,-2.334096,7.1,0.050378,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.805658


In [None]:
#Removing Outliers
# Outlier Detection and Treatment using IQR
df_encoded_copy11=df_encoded1
df_encoded_copy22=df_encoded1
df_encoded_copy33=df_encoded1

Q11 = df_encoded_copy11['fnlwgt'].quantile(0.25)
Q33 = df_encoded_copy11['fnlwgt'].quantile(0.75)
IQR1 = Q3 - Q1
lower_bound1 = Q11 - 1.5 * IQR
upper_bound1 = Q33 + 1.5 * IQR
df_encoded_copy11['fnlwgt'] = np.where(df_encoded_copy11['Cr'] > upper_bound1, upper_bound1,
                        np.where(df_encoded_copy11['Cr'] < lower_bound1, lower_bound1, df_encoded_copy11['Cr']))

print(df_encoded_copy11.head())

In [None]:
#Removing Outliers
# Z-score method

df_encoded_copy2['Cr_zscore'] = stats.zscore(df_encoded_copy2['Cr'])
df_encoded_copy2['Cr'] = np.where(df_encoded_copy2['Cr_zscore'].abs() > 3, np.nan, df_encoded_copy2['Cr'])  # Replace outliers with NaN
print(df_encoded_copy2.head())

In [None]:
#Removing Outliers
# Median replacement for outliers
df_encoded_copy3['Cr_zscore'] = stats.zscore(df_encoded_copy3['Cr'])
median_salary = df_encoded_copy3['Cr'].median()
df_encoded_copy3['Cr'] = np.where(df_encoded_copy3['Cr_zscore'].abs() > 3, median_salary, df_encoded_copy3['Cr'])
print(df_encoded_copy3.head())