In [None]:
import pandas as pd
import numpy as np

**Diabetes dataset**

In [None]:
diabetes=pd.read_csv('/content/Dataset of Diabetes .csv')
print("Head",diabetes.head(),sep="\n")
print("Info",diabetes.info(),sep="\n")
print("Describe",diabetes.describe(),sep="\n")

Head
    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   No_Pation  1000 non-null   int64  
 2   Gender     1000 non-null   object 
 3   AGE        1000 non-null   int64  
 4   Urea       1000 non-null   float64
 5   Cr         1000 non-null   int64  
 6   HbA1c

In [None]:
#missing values
mv=diabetes.isnull().sum()
print("Missing values",mv[mv>0],sep="\n")

Missing values
Series([], dtype: int64)


In [None]:
#Handling categorical data

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

diabetes = diabetes.replace(to_replace='f', value='F', regex=True)

# Initialize OrdinalEncoder for Gender column (F -> 0, M -> 1)
ordinal_encoder = OrdinalEncoder(categories=[["F", "M"]])

# Fit and transform the Gender column to create a new column 'Gender_Encoded'
diabetes["Gender_Encoded"] = ordinal_encoder.fit_transform(diabetes[["Gender"]])

# Initialize OneHotEncoder for CLASS column
onehot_encoder = OneHotEncoder(sparse_output=False)  # sparse_output=False gives a dense matrix

# Fit and transform the CLASS column
encoded_data = onehot_encoder.fit_transform(diabetes[["CLASS"]])

# Convert the resulting numpy array to a DataFrame with appropriate column names
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(["CLASS"]))

# Concatenate the encoded columns with the original DataFrame
diabetes_encoded = pd.concat([diabetes, encoded_df], axis=1)

# Drop the original categorical columns (Gender and CLASS) since we have encoded versions
diabetes_encoded.drop(["Gender", "CLASS"], axis=1, inplace=True)

# Display the first few rows of the updated DataFrame
print(diabetes_encoded.head())


    ID  No_Pation  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  \
0  502      17975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
1  735      34221   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6  23.0   
2  420      47975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
3  680      87656   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
4  504      34223   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4  21.0   

   Gender_Encoded  CLASS_N  CLASS_N   CLASS_P  CLASS_Y  CLASS_Y   
0             0.0      1.0       0.0      0.0      0.0       0.0  
1             1.0      1.0       0.0      0.0      0.0       0.0  
2             0.0      1.0       0.0      0.0      0.0       0.0  
3             0.0      1.0       0.0      0.0      0.0       0.0  
4             1.0      1.0       0.0      0.0      0.0       0.0  


In [None]:
#Handling Outliers
def detect_outliers(data):
    outliers = []
    threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    z_score=(data-mean)//std
    for i in range(len(data)):
        if np.abs(z_score[i])>threshold:
            outliers.append(i)
    return outliers

print("For Age Column:")
print(detect_outliers(diabetes['AGE']))


For Age Column:
[1, 86, 953, 979]


In [15]:
#Data Transformations min max scalar
def data_transformation(data):
  min=np.min(data)
  max=np.max(data)
  scaled=[]
  for i in data:
    scaled.append((i-min)/(max-min))
  print(scaled)

print("For BMI column, first 50 values")
print(data_transformation(diabetes['BMI'][:50]))

For BMI column, first 50 values
[1.0, 0.8, 1.0, 1.0, 0.4, 0.4, 1.0, 1.0, 0.4, 1.0, 0.8, 0.4, 0.6, 0.8, 1.0, 0.4, 0.4, 0.6, 1.0, 0.8, 1.0, 1.0, 1.0, 0.8, 0.4, 0.6, 0.0, 0.6, 0.6, 0.6, 0.8, 1.0, 0.6, 1.0, 0.8, 0.6, 0.4, 0.4, 1.0, 0.8, 1.0, 0.4, 0.6, 0.4, 1.0, 0.4, 0.8, 1.0, 0.4, 0.6]
None


**Adult income data sets**

In [None]:
income=pd.read_csv('/content/adult.csv')
print("Head",income.head(),sep="\n")
print("Info",income.info(),sep="\n")
print("Describe",income.describe(),sep="\n")

Head
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country 

In [None]:
#missing values
mv=income.isnull().sum()
print("Missing values",mv[mv>0],sep="\n")

Missing values
Series([], dtype: int64)


In [None]:
#Handling categorical data

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Initialize OrdinalEncoder for Gender column
ordinal_encoder = OrdinalEncoder(categories=[["Male", "Female"]])

# Fit and transform the Gender column
income["Gender_Encoded"] = ordinal_encoder.fit_transform(income[["gender"]])

# Initialize OneHotEncoder for Workclass column
onehot_encoder = OneHotEncoder()

# Fit and transform the Workclass column
encoded_data = onehot_encoder.fit_transform(income[["workclass"]])

# Convert the sparse matrix to a dense array
encoded_array = encoded_data.toarray()

# Convert to DataFrame for better visualization
encoded_df = pd.DataFrame(encoded_array, columns=onehot_encoder.get_feature_names_out(["workclass"]))

# Concatenate the encoded columns with the original DataFrame
income_encoded = pd.concat([income, encoded_df], axis=1)

# Drop the original categorical columns (gender and workclass)
income_encoded.drop(["gender", "workclass"], axis=1, inplace=True)

# Display the first few rows of the updated DataFrame
print(income_encoded.head())


   age  fnlwgt     education  educational-num      marital-status  \
0   25  226802          11th                7       Never-married   
1   38   89814       HS-grad                9  Married-civ-spouse   
2   28  336951    Assoc-acdm               12  Married-civ-spouse   
3   44  160323  Some-college               10  Married-civ-spouse   
4   18  103497  Some-college               10       Never-married   

          occupation relationship   race  capital-gain  capital-loss  ...  \
0  Machine-op-inspct    Own-child  Black             0             0  ...   
1    Farming-fishing      Husband  White             0             0  ...   
2    Protective-serv      Husband  White             0             0  ...   
3  Machine-op-inspct      Husband  Black          7688             0  ...   
4                  ?    Own-child  White             0             0  ...   

   Gender_Encoded workclass_? workclass_Federal-gov  workclass_Local-gov  \
0             0.0         0.0                 

In [None]:
#Handling Outliers
def detect_outliers(data):
    outliers = []
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    for i in range(len(data)):
        if data[i] < lower_bound or data[i] > upper_bound:
            outliers.append(i)
    return outliers

print("For Age Column:")
print(detect_outliers(income['age']))

For Age Column:
[193, 234, 899, 926, 951, 1079, 1398, 1834, 2085, 2290, 2982, 3496, 3668, 4455, 4646, 4658, 6402, 6577, 6757, 6915, 6959, 6976, 6979, 7160, 7170, 7414, 7419, 7539, 7547, 7937, 8206, 8313, 8427, 8955, 8982, 9018, 9038, 9081, 9279, 9769, 9888, 10039, 10199, 10223, 10735, 11289, 11328, 11410, 11837, 11871, 11881, 11940, 12060, 12229, 12446, 13025, 13958, 14033, 14263, 14299, 14431, 14568, 14591, 14740, 15088, 15098, 15408, 15934, 15963, 16003, 16106, 16148, 16251, 16355, 16503, 16711, 17199, 17321, 17449, 18216, 18584, 19035, 19172, 19187, 19492, 19619, 19818, 20058, 20244, 20351, 20390, 21001, 21115, 21385, 21553, 21572, 21651, 21687, 22281, 22454, 22495, 22513, 22720, 22905, 23029, 23762, 24001, 24153, 24457, 24662, 24712, 24803, 24975, 25087, 25244, 25254, 25752, 26405, 26491, 26826, 27380, 27519, 27793, 27813, 28012, 28277, 28732, 28773, 29111, 29256, 29306, 29307, 29576, 29977, 30209, 30385, 30440, 30885, 30992, 31037, 31184, 31637, 31943, 32173, 32583, 32804, 33043, 

In [16]:
#Data Transformations Standard Scaler
def data_transformation(data):
  mean=np.mean(data)
  std=np.std(data)
  scaled=[]
  for i in data:
    scaled.append((i-mean)/std)
  print(scaled)

print("For capital-gain column, first 50 values")
print(data_transformation(income['capital-gain'][:50]))

For capital-gain column, first 50 values
[-0.37686781529155805, -0.37686781529155805, -0.37686781529155805, 3.3181785602141023, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, 1.114512114884569, -0.37686781529155805, -0.37686781529155805, 2.7077845829908727, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, 1.114512114884569, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, 3.13073474043689, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, 3.3181785602141023, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.37686781529155805, -0.376