In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [2]:
def createdata(n_rows=20, seed=None):
  """Build a small synthetic customer DataFrame for the preprocessing demos.

  Args:
    n_rows: Number of rows to generate. Defaults to 20, the size that was
      previously hard-coded.
    seed: Optional integer seed. When given, a dedicated
      ``np.random.RandomState(seed)`` is used so every call returns the same
      frame. When None, the global NumPy random state is used (the original
      behaviour).

  Returns:
    A DataFrame with the columns Age, Salary, Purchased, Gender and City.
  """
  # Both np.random (module) and RandomState expose randint/choice, so the
  # column definitions below are identical for the seeded and unseeded paths.
  rng = np.random if seed is None else np.random.RandomState(seed)

  data = {
      'Age': rng.randint(18, 70, size=n_rows),
      'Salary': rng.randint(30000, 120000, size=n_rows),
      'Purchased': rng.choice([0, 1], size=n_rows),
      'Gender': rng.choice(['Male', 'Female'], size=n_rows),
      'City': rng.choice(['New York', 'San Francisco', 'Los Angeles'], size=n_rows)
  }

  return pd.DataFrame(data)

In [3]:
# Build the demo frame, report its shape, then preview the first rows.
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and the preview is kept as the final expression.
df = createdata()
print(df.shape)
df.head(10)

(20, 5)

In [4]:
# Introduce some missing values for demonstration.
# Age and Salary are generated as integers, and an integer column cannot hold
# NaN. Cast them to float explicitly instead of relying on pandas' implicit
# upcast during assignment, which is deprecated.
df = df.astype({'Age': 'float64', 'Salary': 'float64'})
df.loc[5, 'Age'] = np.nan
df.loc[10, 'Salary'] = np.nan

# Basic information about the dataset.
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        19 non-null     float64
 1   Salary     19 non-null     float64
 2   Purchased  20 non-null     int64  
 3   Gender     20 non-null     object 
 4   City       20 non-null     object 
dtypes: float64(2), int64(1), object(2)
memory usage: 932.0+ bytes
None


In [5]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
summary_stats = df.describe()
print(summary_stats)

             Age         Salary  Purchased
count  19.000000      19.000000  20.000000
mean   46.684211   68556.157895   0.500000
std    14.552788   26876.907296   0.512989
min    25.000000   35970.000000   0.000000
25%    32.000000   46756.500000   0.000000
50%    53.000000   55569.000000   0.500000
75%    58.500000   91856.500000   1.000000
max    64.000000  118636.000000   1.000000


In [7]:
# Find missing values: count the NaN entries in every column.
missing_values = df.isna().sum()

# Show only the columns that actually contain missing values.
has_missing = missing_values.gt(0)
print(missing_values.loc[has_missing])


Age       1
Salary    1
dtype: int64


In [8]:
# Fill missing values with a statistic of the column (zero, mean, median, ...).
# Step 1: create one SimpleImputer per column: median for "Age", mean for "Salary".
imputer1 = SimpleImputer(strategy="median")
imputer2 = SimpleImputer(strategy="mean")

# Work on a real copy. A plain `df_copy = df` only creates a second name for
# the same object, so the imputation would silently overwrite the raw frame.
df_copy = df.copy()

# Step 2: fit each imputer on its column.
# Note: SimpleImputer expects 2D input, hence the double brackets (a one-column frame).
imputer1.fit(df_copy[["Age"]])
imputer2.fit(df_copy[["Salary"]])

# Step 3: transform (fill) the missing values in the "Age" and "Salary" columns.
# transform() returns an (n, 1) array; ravel() flattens it to one value per row.
df_copy["Age"] = imputer1.transform(df_copy[["Age"]]).ravel()
df_copy["Salary"] = imputer2.transform(df_copy[["Salary"]]).ravel()

# Verify that there are no missing values left.
print(df_copy["Age"].isnull().sum())
print(df_copy["Salary"].isnull().sum())

0
0


In [9]:
# Handling categorical attributes:
# ordinal encoding for the Gender column and one-hot encoding for the City column.

# Ordinal encoder with an explicit category order: "Male" -> 0, "Female" -> 1.
ordinal_encoder = OrdinalEncoder(categories=[["Male", "Female"]])
# fit_transform() returns an (n, 1) array; ravel() flattens it to one value per row.
df_copy["Gender_Encoded"] = ordinal_encoder.fit_transform(df_copy[["Gender"]]).ravel()

# One-hot encode "City". Read it from df_copy, the same frame every other step
# of this cell works on, rather than from the raw df.
onehot_encoder = OneHotEncoder()
encoded_data = onehot_encoder.fit_transform(df_copy[["City"]])

# OneHotEncoder returns a sparse matrix by default; convert it to a dense array.
encoded_array = encoded_data.toarray()

# Wrap the array in a DataFrame. Reusing df_copy's index keeps the rows aligned
# in the concat below even when the index is not the default 0..n-1 range.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehot_encoder.get_feature_names_out(["City"]),
    index=df_copy.index,
)

# Attach the encoded columns and drop the original text columns in one
# non-mutating expression (no inplace=True).
df_encoded = pd.concat([df_copy, encoded_df], axis=1).drop(columns=["Gender", "City"])

print(df_encoded.head())

    Age    Salary  Purchased  Gender_Encoded  City_Los Angeles  City_New York  \
0  64.0  105465.0          1             0.0               0.0            1.0   
1  57.0   47883.0          0             1.0               0.0            1.0   
2  59.0   45630.0          1             0.0               0.0            0.0   
3  32.0   43850.0          0             1.0               0.0            1.0   
4  25.0   55131.0          1             0.0               0.0            1.0   

   City_San Francisco  
0                 0.0  
1                 0.0  
2                 1.0  
3                 0.0  
4                 0.0  


In [10]:
# Data transformation: min-max scaling / normalization (range 0-1).
# Pros: keeps all data between 0 and 1; ideal for distance-based models.
# Cons: can distort the data distribution, especially with extreme outliers.
normalizer = MinMaxScaler()
salary_scaled = normalizer.fit_transform(df_encoded[['Salary']])
# The scaler returns an (n, 1) array; flatten it back into the single column.
df_encoded['Salary'] = salary_scaled.ravel()
df_encoded.head()

Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,64.0,0.840672,1,0.0,0.0,1.0,0.0
1,57.0,0.14411,0,1.0,0.0,1.0,0.0
2,59.0,0.116856,1,0.0,0.0,0.0,1.0
3,32.0,0.095323,0,1.0,0.0,1.0,0.0
4,25.0,0.231788,1,0.0,0.0,1.0,0.0


In [11]:
# Standardization (mean = 0, variance = 1).
# Pros: works well for normally distributed data; suitable for many models.
# Cons: sensitive to outliers.
scaler = StandardScaler()
age_standardized = scaler.fit_transform(df_encoded[['Age']])
# The scaler returns an (n, 1) array; flatten it back into the single column.
df_encoded['Age'] = age_standardized.ravel()
df_encoded.head()


Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,1.225275,0.840672,1,0.0,0.0,1.0,0.0
1,0.72075,0.14411,0,1.0,0.0,1.0,0.0
2,0.8649,0.116856,1,0.0,0.0,0.0,1.0
3,-1.081125,0.095323,0,1.0,0.0,1.0,0.0
4,-1.58565,0.231788,1,0.0,0.0,1.0,0.0


In [12]:
# Removing outliers: detection and treatment using the IQR rule.
# Pros: simple and effective for mild outliers.
# Cons: may overly reduce variation if there are many extreme outliers.

# Each outlier demo gets its own independent copy. Plain assignment would make
# all three names refer to the same object as df_encoded, so every demo would
# overwrite the others' input.
df_encoded_copy1 = df_encoded.copy()
df_encoded_copy2 = df_encoded.copy()
df_encoded_copy3 = df_encoded.copy()

Q1 = df_encoded_copy1['Salary'].quantile(0.25)
Q3 = df_encoded_copy1['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap values outside [lower_bound, upper_bound] at the nearest bound.
df_encoded_copy1['Salary'] = df_encoded_copy1['Salary'].clip(lower=lower_bound, upper=upper_bound)

print(df_encoded_copy1.head())



        Age    Salary  Purchased  Gender_Encoded  City_Los Angeles  \
0  1.225275  0.840672          1             0.0               0.0   
1  0.720750  0.144110          0             1.0               0.0   
2  0.864900  0.116856          1             0.0               0.0   
3 -1.081125  0.095323          0             1.0               0.0   
4 -1.585650  0.231788          1             0.0               0.0   

   City_New York  City_San Francisco  
0            1.0                 0.0  
1            1.0                 0.0  
2            0.0                 1.0  
3            1.0                 0.0  
4            1.0                 0.0  


In [13]:
# Removing outliers: z-score method.
# Pros: good for normally distributed data.
# Cons: not suitable for non-normal data; may miss outliers in skewed distributions.

df_encoded_copy2['Salary_zscore'] = stats.zscore(df_encoded_copy2['Salary'])

# Rows whose absolute z-score exceeds 3 are treated as outliers and blanked to NaN.
salary_is_outlier = df_encoded_copy2['Salary_zscore'].abs() > 3
df_encoded_copy2['Salary'] = df_encoded_copy2['Salary'].mask(salary_is_outlier)
print(df_encoded_copy2.head())

        Age    Salary  Purchased  Gender_Encoded  City_Los Angeles  \
0  1.225275  0.840672          1             0.0               0.0   
1  0.720750  0.144110          0             1.0               0.0   
2  0.864900  0.116856          1             0.0               0.0   
3 -1.081125  0.095323          0             1.0               0.0   
4 -1.585650  0.231788          1             0.0               0.0   

   City_New York  City_San Francisco  Salary_zscore  
0            1.0                 0.0       1.447538  
1            1.0                 0.0      -0.810786  
2            0.0                 1.0      -0.899147  
3            1.0                 0.0      -0.968957  
4            1.0                 0.0      -0.526525  


In [14]:
# Removing outliers: median replacement.
# Pros: keeps distribution shape intact, useful when capping isn’t feasible.
# Cons: may distort data if outliers represent real phenomena.
df_encoded_copy3['Salary_zscore'] = stats.zscore(df_encoded_copy3['Salary'])
median_salary = df_encoded_copy3['Salary'].median()

# Rows whose absolute z-score exceeds 3 receive the column median instead.
salary_is_outlier = df_encoded_copy3['Salary_zscore'].abs() > 3
df_encoded_copy3['Salary'] = df_encoded_copy3['Salary'].mask(salary_is_outlier, median_salary)
print(df_encoded_copy3.head())

        Age    Salary  Purchased  Gender_Encoded  City_Los Angeles  \
0  1.225275  0.840672          1             0.0               0.0   
1  0.720750  0.144110          0             1.0               0.0   
2  0.864900  0.116856          1             0.0               0.0   
3 -1.081125  0.095323          0             1.0               0.0   
4 -1.585650  0.231788          1             0.0               0.0   

   City_New York  City_San Francisco  Salary_zscore  
0            1.0                 0.0       1.447538  
1            1.0                 0.0      -0.810786  
2            0.0                 1.0      -0.899147  
3            1.0                 0.0      -0.968957  
4            1.0                 0.0      -0.526525  


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

def preprocess_data(file_path):
    """Load a CSV file and run a fixed cleaning and scaling pipeline on it.

    Steps, in order:
      1. Impute missing values: median for numeric columns, most frequent
         value for all other columns.
      2. Encode the non-numeric columns: one-hot when a column has at most
         10 distinct values, ordinal codes otherwise.
      3. Replace numeric values whose absolute z-score exceeds 3 with the
         column median.
      4. Min-max scale and then standardize the originally numeric columns.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The transformed DataFrame, or None (after printing an error message)
        when ``file_path`` does not exist.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return None

    # 1. Data Cleaning
    numerical_cols = df.select_dtypes(include=np.number).columns
    categorical_cols = df.select_dtypes(exclude=np.number).columns

    # 1.1 Handling Missing Values (using SimpleImputer)
    # scikit-learn estimators reject input with zero columns, so each step is
    # skipped when the CSV has no column of the corresponding kind.
    if len(numerical_cols) > 0:
        imputer_num = SimpleImputer(strategy='median')  # Use median for numerical
        df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])

    if len(categorical_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')  # Use most frequent for categorical
        df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

    # 1.2 Handling Categorical Data
    for col in categorical_cols:
        if df[col].nunique() <= 10:
            # Low cardinality: one indicator column per distinct value.
            onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            encoded_data = onehot_encoder.fit_transform(df[[col]])
            # Reuse df's index so concat aligns row-for-row.
            encoded_df = pd.DataFrame(
                encoded_data,
                columns=onehot_encoder.get_feature_names_out([col]),
                index=df.index,
            )
            df = pd.concat([df, encoded_df], axis=1).drop(columns=col)
        else:
            # High cardinality: replace the column with ordinal codes.
            ordinal_encoder = OrdinalEncoder()
            df[col] = ordinal_encoder.fit_transform(df[[col]]).ravel()

    # 1.3 Handling Outliers (using Z-score method)
    # The z-scores live in a local variable rather than a temporary
    # "<col>_zscore" column, so a real column with that name is never clobbered.
    for col in numerical_cols:
        abs_zscore = np.abs(stats.zscore(df[col]))
        df[col] = np.where(abs_zscore > 3, df[col].median(), df[col])

    # 2. Data Transformations
    if len(numerical_cols) > 0:
        # 2.1 Min-Max Scaling
        min_max_scaler = MinMaxScaler()
        df[numerical_cols] = min_max_scaler.fit_transform(df[numerical_cols])

        # 2.2 Standard Scaling
        # NOTE: this is applied on top of the min-max result, so the returned
        # numeric columns are standardized rather than confined to [0, 1].
        standard_scaler = StandardScaler()
        df[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])

    return df


# Run the preprocessing pipeline on the Diabetes dataset.
diabetes_df = preprocess_data('/content/Dataset of Diabetes .csv')

# preprocess_data returns None when the file is missing; report only on success.
if diabetes_df is not None:
    print("Preprocessed Diabetes Data:", diabetes_df.head(), sep="\n")

Preprocessed Diabetes Data:
         ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
0  0.672140  -0.157106 -0.424738 -0.067182 -0.677551 -1.341068 -0.537604   
1  1.641852  -0.124010  0.156936 -0.175155 -0.050153 -1.341068 -0.978321   
2  0.330868  -0.095991 -0.424738 -0.067182 -0.677551 -1.341068 -0.537604   
3  1.412950  -0.015153 -0.424738 -0.067182 -0.677551 -1.341068 -0.537604   
4  0.680463  -0.124006 -2.402429  1.228490 -0.677551 -1.341068  0.079400   

         TG       HDL       LDL      VLDL       BMI  Gender_F  Gender_M  \
0 -1.185936  3.266253 -1.158715 -0.497302 -1.136091       1.0       0.0   
1 -0.744645 -0.119680 -0.465934 -0.435617 -1.341671       0.0       1.0   
2 -1.185936  3.266253 -1.158715 -0.497302 -1.136091       1.0       0.0   
3 -1.185936  3.266253 -1.158715 -0.497302 -1.136091       1.0       0.0   
4 -1.097678 -0.901049 -0.564903 -0.558987 -1.752832       0.0       1.0   

   Gender_f  CLASS_N  CLASS_N   CLASS_P  CLASS_Y  CLASS_Y   
0  

In [22]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

def preprocess_data(file_path):
    """Load a CSV file and run a fixed cleaning and scaling pipeline on it.

    Steps, in order:
      1. Impute missing values: median for numeric columns, most frequent
         value for all other columns.
      2. Encode the non-numeric columns: one-hot when a column has at most
         10 distinct values, ordinal codes otherwise.
      3. Replace numeric values whose absolute z-score exceeds 3 with the
         column median.
      4. Min-max scale and then standardize the originally numeric columns.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The transformed DataFrame, or None (after printing an error message)
        when ``file_path`` does not exist.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return None

    # 1. Data Cleaning
    numerical_cols = df.select_dtypes(include=np.number).columns
    categorical_cols = df.select_dtypes(exclude=np.number).columns

    # 1.1 Handling Missing Values (using SimpleImputer)
    # scikit-learn estimators reject input with zero columns, so each step is
    # skipped when the CSV has no column of the corresponding kind.
    if len(numerical_cols) > 0:
        imputer_num = SimpleImputer(strategy='median')  # Use median for numerical
        df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])

    if len(categorical_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')  # Use most frequent for categorical
        df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

    # 1.2 Handling Categorical Data
    for col in categorical_cols:
        if df[col].nunique() <= 10:
            # Low cardinality: one indicator column per distinct value.
            onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            encoded_data = onehot_encoder.fit_transform(df[[col]])
            # Reuse df's index so concat aligns row-for-row.
            encoded_df = pd.DataFrame(
                encoded_data,
                columns=onehot_encoder.get_feature_names_out([col]),
                index=df.index,
            )
            df = pd.concat([df, encoded_df], axis=1).drop(columns=col)
        else:
            # High cardinality: replace the column with ordinal codes.
            ordinal_encoder = OrdinalEncoder()
            df[col] = ordinal_encoder.fit_transform(df[[col]]).ravel()

    # 1.3 Handling Outliers (using Z-score method)
    # The z-scores live in a local variable rather than a temporary
    # "<col>_zscore" column, so a real column with that name is never clobbered.
    for col in numerical_cols:
        abs_zscore = np.abs(stats.zscore(df[col]))
        df[col] = np.where(abs_zscore > 3, df[col].median(), df[col])

    # 2. Data Transformations
    if len(numerical_cols) > 0:
        # 2.1 Min-Max Scaling
        min_max_scaler = MinMaxScaler()
        df[numerical_cols] = min_max_scaler.fit_transform(df[numerical_cols])

        # 2.2 Standard Scaling
        # NOTE: this is applied on top of the min-max result, so the returned
        # numeric columns are standardized rather than confined to [0, 1].
        standard_scaler = StandardScaler()
        df[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])

    return df


# Run the preprocessing pipeline on the Adult Income dataset.
adult_df = preprocess_data('/content/adult.csv')

# preprocess_data returns None when the file is missing; report only on success.
if adult_df is not None:
    print("\nPreprocessed Adult Income Data:", adult_df.head(), sep="\n")


Preprocessed Adult Income Data:
        age    fnlwgt  education  educational-num  occupation  capital-gain  \
0 -1.003723  0.444012        1.0        -1.265060         7.0     -0.236000   
1 -0.034522 -1.009761       11.0        -0.457705         5.0     -0.236000   
2 -0.780061  1.612959        7.0         0.753327        11.0     -0.236000   
3  0.412802 -0.261490       15.0        -0.054028         7.0      3.191145   
4 -1.525600 -0.864551       15.0        -0.054028         0.0     -0.236000   

   capital-loss  hours-per-week  native-country  workclass_?  ...  \
0      -0.03383       -0.000355            39.0          0.0  ...   
1      -0.03383        0.897615            39.0          0.0  ...   
2      -0.03383       -0.000355            39.0          0.0  ...   
3      -0.03383       -0.000355            39.0          0.0  ...   
4      -0.03383       -0.898325            39.0          1.0  ...   

   relationship_Wife  race_Amer-Indian-Eskimo  race_Asian-Pac-Islander  \
0  