In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the raw customer dataset from its CSV file
file_path = './dataset/Dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the load
data.head(5)

Unnamed: 0,Customer_ID,Name,Age,Annual_Income,Education_Level,Occupation,City,Customer_Reviews,Purchase_Frequency,Product_Category,Target_Variable
0,CUST0001,Customer_1,64.0,77964.61,Bachelor,Engineer,Phoenix,Amazing! Highly recommended. *&,11,,High
1,CUST0002,Customer_2,24.0,68414.99,High School,,Chicago,"Decent quality, but shipping took too long. ##*",17,Books,Low
2,CUST0003,Customer_3,66.0,93824.94,High School,Engineer,Los Angeles,"It's okay, not the best, but does the job. %!",2,Toys,Low
3,CUST0004,Customer_4,31.0,39551.43,PhD,Engineer,Houston,Great product! Would definitely buy again. ~^#,29,,Low
4,CUST0005,Customer_5,19.0,41216.05,Bachelor,Teacher,New York,"Decent quality, but shipping took too long. $#&",16,Clothing,Medium


In [3]:
# Explore the structure of the dataset: column names, dtypes and non-null
# counts (info() prints its report directly)
data.info()

# Summary statistics (count, mean, std, min, quartiles, max); called without
# arguments on this mixed-type frame, describe() reports the numerical columns only
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer_ID         1000 non-null   object 
 1   Name                1000 non-null   object 
 2   Age                 950 non-null    float64
 3   Annual_Income       950 non-null    float64
 4   Education_Level     950 non-null    object 
 5   Occupation          950 non-null    object 
 6   City                1000 non-null   object 
 7   Customer_Reviews    950 non-null    object 
 8   Purchase_Frequency  1000 non-null   int64  
 9   Product_Category    950 non-null    object 
 10  Target_Variable     1000 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 86.1+ KB


Unnamed: 0,Age,Annual_Income,Purchase_Frequency
count,950.0,950.0,1000.0
mean,42.56,70111.414084,15.352
std,15.506825,28647.356266,8.666095
min,18.0,20131.03,1.0
25%,29.0,45670.7475,8.0
50%,41.0,71288.33,16.0
75%,56.0,93790.89,23.0
max,70.0,119922.8,30.0


In [4]:
# Handle missing values
# Numerical columns: fill gaps with the column median.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

# Categorical (object) columns: fill gaps with the most frequent value.
# A column with no observed values has an empty mode(), so taking its first
# element would raise; such a column is left untouched instead.
# NOTE(review): this covers every object column, including free-text and
# identifier columns such as Customer_Reviews -- confirm that imputing those
# with the most common value is intended.
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = data[col].mode()
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

# Verify missing values are handled: remaining null count per column
data.isnull().sum()

Customer_ID           0
Name                  0
Age                   0
Annual_Income         0
Education_Level       0
Occupation            0
City                  0
Customer_Reviews      0
Purchase_Frequency    0
Product_Category      0
Target_Variable       0
dtype: int64

In [5]:
# Flag rows that repeat an earlier row (duplicated() with its default arguments)
duplicates = data.duplicated()
duplicate_count = duplicates.sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Keep only the rows that were not flagged
data = data.loc[~duplicates]
print(f"Dataset shape after removing duplicates: {data.shape}")

Number of duplicate rows: 0
Dataset shape after removing duplicates: (1000, 11)


In [6]:
# Remove outliers using the IQR method (keep values within 1.5 * IQR of Q1/Q3).
# Every column's bounds are computed on the same, unfiltered data and combined
# into a single row mask, so the result does not depend on the order of the
# columns (filtering inside the loop would shrink the data that each later
# column's quartiles are computed from).
within_bounds = pd.Series(True, index=data.index)
for col in numerical_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # between() is inclusive on both ends; rows with a missing value fail the test
    within_bounds &= data[col].between(lower_bound, upper_bound)

# .copy() hands later cells an independent frame, so assigning to one of its
# columns does not raise SettingWithCopyWarning once rows have been dropped.
data = data[within_bounds].copy()

# Verify the dataset shape after removing outliers
print(f"Dataset shape after removing outliers: {data.shape}")

Dataset shape after removing outliers: (1000, 11)


In [7]:
# Clean the Customer_Reviews column
# 1. Drop every character that is not a letter, digit or whitespace.
# 2. Collapse the whitespace runs left behind by removed characters into a
#    single space.
# 3. Strip both ends last, so whitespace exposed at the start or end by step 1
#    is removed as well (stripping before step 1 would miss it).
data['Customer_Reviews'] = (
    data['Customer_Reviews']
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)




In [8]:
# Write the cleaned frame to a new CSV file; index=False leaves the row index out
output_file_path = './dataset/Cleaned_Dataset.csv'
data.to_csv(path_or_buf=output_file_path, index=False)
print(f"Cleaned dataset saved to {output_file_path}")

# Preview the first five rows of the cleaned data
data.head(5)

Cleaned dataset saved to ./dataset/Cleaned_Dataset.csv


Unnamed: 0,Customer_ID,Name,Age,Annual_Income,Education_Level,Occupation,City,Customer_Reviews,Purchase_Frequency,Product_Category,Target_Variable
0,CUST0001,Customer_1,64.0,77964.61,Bachelor,Engineer,Phoenix,Amazing Highly recommended,11,Home Appliances,High
1,CUST0002,Customer_2,24.0,68414.99,High School,Artist,Chicago,Decent quality but shipping took too long,17,Books,Low
2,CUST0003,Customer_3,66.0,93824.94,High School,Engineer,Los Angeles,Its okay not the best but does the job,2,Toys,Low
3,CUST0004,Customer_4,31.0,39551.43,PhD,Engineer,Houston,Great product Would definitely buy again,29,Home Appliances,Low
4,CUST0005,Customer_5,19.0,41216.05,Bachelor,Teacher,New York,Decent quality but shipping took too long,16,Clothing,Medium
