# Importing packages

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

# Data cleaning methods implemented as a class

In [2]:
class DataCleaning:
    """Cleaning steps applied to a private copy of a DataFrame.

    Every method updates ``self.df`` and returns ``None``; call
    ``get_cleaned_data()`` to retrieve the current state. The frame passed to
    the constructor is never modified.
    """

    def __init__(self, df):
        """Keep a copy of ``df`` as the working frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data. It is copied, so later steps do not affect it.
        """
        self.df = df.copy()  # Avoid modifying the original dataset

    def remove_duplicates(self):
        """Drop fully duplicated rows, keeping the first occurrence.

        The surviving rows keep their original index labels.
        """
        # Reassign instead of inplace=True: same result, but the working frame
        # is replaced explicitly rather than mutated behind the scenes.
        self.df = self.df.drop_duplicates()

    def encode_label(self, columns):
        """Replace categorical values with integer codes using LabelEncoder.

        Parameters
        ----------
        columns : label or list of labels
            A single column name, or a list / Index / array of column names.
            Each column is encoded independently.
        """
        # LabelEncoder only accepts 1-D input, so a list of columns has to be
        # encoded one column at a time with a fresh encoder for each.
        if isinstance(columns, (list, pd.Index, np.ndarray)):
            targets = list(columns)
        else:
            targets = [columns]
        for column in targets:
            self.df[column] = LabelEncoder().fit_transform(self.df[column])

    def encode_one_hot(self, columns):
        """One-hot encode ``columns``, dropping the first level of each.

        Parameters
        ----------
        columns : label, list-like of labels, or None
            Columns to encode. A single column name is accepted and wrapped
            in a list. ``None`` is passed straight through to
            ``pandas.get_dummies``.
        """
        # get_dummies rejects a bare string for ``columns``; wrap it.
        if columns is not None and not pd.api.types.is_list_like(columns):
            columns = [columns]
        self.df = pd.get_dummies(self.df, columns=columns, drop_first=True)

    def normalize_columns(self, columns):
        """Apply min-max normalization to the given numerical column(s).

        Parameters
        ----------
        columns : label or list of labels
            Column(s) to rescale to the [0, 1] range.

        Notes
        -----
        A constant column (max == min) is mapped to 0.0 rather than NaN.
        """
        block = self.df[columns]
        lowest = block.min()
        span = block.max() - lowest
        # Guard against 0/0 for constant columns: dividing by 1 leaves the
        # already-zero numerator as 0.0.
        if isinstance(span, pd.Series):
            span = span.replace(0, 1)
        elif span == 0:
            span = 1
        self.df[columns] = (block - lowest) / span

    def fill_numerical_median(self):
        """Fill missing values in int64/float64 columns with the column median."""
        numerical_features = self.df.select_dtypes(include=['int64', 'float64']).columns
        for feature in numerical_features:
            median_value = self.df[feature].median()
            if pd.isna(median_value):
                # Column has no observed values; there is nothing to fill with.
                continue
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: the chained in-place form is deprecated
            # and does not update the frame under pandas copy-on-write.
            self.df[feature] = self.df[feature].fillna(median_value)

    def fill_categorical_mode(self):
        """Fill missing values in object/category columns with the column mode.

        When several values tie for most frequent, the first one returned by
        ``Series.mode()`` is used. Columns with no observed values are left
        unchanged.
        """
        categorical_features = self.df.select_dtypes(include=['object', 'category']).columns
        for feature in categorical_features:
            modes = self.df[feature].mode()
            if modes.empty:
                # Entirely missing column: mode() is empty, so indexing it
                # would raise. Skip it.
                continue
            self.df[feature] = self.df[feature].fillna(modes.iloc[0])

    def validate_data(self):
        """Print the frame's ``info()`` summary and its first five rows."""
        print("\nDataset Info:")
        # info() prints its report itself and returns None, so it is not
        # wrapped in print() (that would emit a stray "None" line).
        self.df.info()
        print("\nFirst 5 Rows:")
        print(self.df.head())

    def get_cleaned_data(self):
        """Return the current working DataFrame."""
        return self.df

# Run the cleaning pipeline on the dataset

In [3]:
# Load the dataset.
# The location can be overridden with the SMART_PREMIUM_TRAIN_CSV environment
# variable so the notebook is not tied to a single machine; the original local
# path remains the default.
TRAIN_CSV_PATH = os.environ.get(
    "SMART_PREMIUM_TRAIN_CSV",
    "C:\\Users\\sarav\\Smart_Premium\\Smart_premium_ML\\Research_Data\\train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

# Creating DataCleaning Object (works on its own copy of df)
cleaner = DataCleaning(df)

# Applying Data Cleaning Steps: de-duplicate first so the median/mode used for
# imputation are not skewed by repeated rows.
cleaner.remove_duplicates()
cleaner.fill_numerical_median()
cleaner.fill_categorical_mode()
cleaner.validate_data()

# Get the cleaned dataset
cleaned_df = cleaner.get_cleaned_data()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1200000 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1200000 non-null  float64
 4   Marital Status        1200000 non-null  object 
 5   Number of Dependents  1200000 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            1200000 non-null  object 
 8   Health Score          1200000 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       1200000 non-null  float64
 12  Vehicle Age           1200000 non-null  float64
 13  Credit Score          1200000 non-null  float64
 14  Insurance Duration 

In [4]:
# Display the cleaned frame; the notebook renders a truncated preview
# (first/last rows, with the middle rows and columns elided).
cleaned_df

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,Employed,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,595.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,...,1.0,5.0,372.0,3.0,2023-05-03 15:21:39.257696,Poor,No,Daily,Apartment,1303.0
1199996,1199996,54.0,Male,35786.0,Divorced,2.0,Master's,Self-Employed,11.483482,Rural,...,1.0,10.0,597.0,4.0,2022-09-10 15:21:39.134960,Poor,No,Weekly,Apartment,821.0
1199997,1199997,19.0,Male,51884.0,Divorced,0.0,Master's,Employed,14.724469,Suburban,...,0.0,19.0,595.0,6.0,2021-05-25 15:21:39.106582,Good,No,Monthly,Condo,371.0
1199998,1199998,55.0,Male,23911.0,Single,1.0,PhD,Employed,18.547381,Suburban,...,1.0,7.0,407.0,4.0,2021-09-19 15:21:39.190215,Poor,No,Daily,Apartment,596.0


In [5]:
# Verify if any missing values remain

# Count nulls per column once, then keep only the columns that still have some.
remaining_nulls = cleaned_df.isnull().sum()
print(" Missing values after Imputation:\n", remaining_nulls.loc[remaining_nulls > 0])

 Missing values after Imputation:
 Series([], dtype: int64)


In [6]:
# Bind the cleaned data to `data` by passing it through the DataFrame constructor.
# NOTE(review): cleaned_df is already a DataFrame, so this wrap looks
# redundant — confirm whether an independent copy was intended.
data = pd.DataFrame(cleaned_df)

# Display the frame.
data

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,Employed,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,595.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,...,1.0,5.0,372.0,3.0,2023-05-03 15:21:39.257696,Poor,No,Daily,Apartment,1303.0
1199996,1199996,54.0,Male,35786.0,Divorced,2.0,Master's,Self-Employed,11.483482,Rural,...,1.0,10.0,597.0,4.0,2022-09-10 15:21:39.134960,Poor,No,Weekly,Apartment,821.0
1199997,1199997,19.0,Male,51884.0,Divorced,0.0,Master's,Employed,14.724469,Suburban,...,0.0,19.0,595.0,6.0,2021-05-25 15:21:39.106582,Good,No,Monthly,Condo,371.0
1199998,1199998,55.0,Male,23911.0,Single,1.0,PhD,Employed,18.547381,Suburban,...,1.0,7.0,407.0,4.0,2021-09-19 15:21:39.190215,Poor,No,Daily,Apartment,596.0


In [7]:
# Convert DataFrame to CSV
# Writes `data` to 'Data_Cleaning.csv' in the current working directory,
# omitting the index column.
# NOTE(review): DataFrame.to_csv returns None when it is given a file path, so
# `Data_cleaning` presumably ends up as None — confirm nothing relies on it.

Data_cleaning = data.to_csv('Data_Cleaning.csv', index=False)
