In [29]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the dataset
file_path = './dataset/Cleaned_Dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,Customer_ID,Name,Age,Annual_Income,Education_Level,Occupation,City,Customer_Reviews,Purchase_Frequency,Product_Category,Target_Variable
0,CUST0001,Customer_1,64.0,77964.61,Bachelor,Engineer,Phoenix,Amazing Highly recommended,11,Home Appliances,High
1,CUST0002,Customer_2,24.0,68414.99,High School,Artist,Chicago,Decent quality but shipping took too long,17,Books,Low
2,CUST0003,Customer_3,66.0,93824.94,High School,Engineer,Los Angeles,Its okay not the best but does the job,2,Toys,Low
3,CUST0004,Customer_4,31.0,39551.43,PhD,Engineer,Houston,Great product Would definitely buy again,29,Home Appliances,Low
4,CUST0005,Customer_5,19.0,41216.05,Bachelor,Teacher,New York,Decent quality but shipping took too long,16,Clothing,Medium


### Feature Engineering

In [30]:
data = data.drop(['Customer_ID','Name'],axis=1)

<b>Justification</b>: As customer_id and name doen't contribute to anything to target varible.

In [31]:
# Normalize and standardize numerical columns
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
# Numerical Columns are 'Age', 'Annual_Income', 'Purchase_Frequency'

# Standardize numerical columns using StandardScaler
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Display the updated dataset
data.head()

Unnamed: 0,Age,Annual_Income,Education_Level,Occupation,City,Customer_Reviews,Purchase_Frequency,Product_Category,Target_Variable
0,1.424085,0.279283,Bachelor,Engineer,Phoenix,Amazing Highly recommended,-0.502438,Home Appliances,High
1,-1.223159,-0.062894,High School,Artist,Chicago,Decent quality but shipping took too long,0.190262,Books,Low
2,1.556447,0.847581,High School,Engineer,Los Angeles,Its okay not the best but does the job,-1.541488,Toys,Low
3,-0.759891,-1.097116,PhD,Engineer,Houston,Great product Would definitely buy again,1.575661,Home Appliances,Low
4,-1.554065,-1.03747,Bachelor,Teacher,New York,Decent quality but shipping took too long,0.074812,Clothing,Medium


<b>Justification</b>: As customer_id and name doen't contribute to anything to target varible.

In [32]:
# Encode categorical variables
# Apply ordinal encoding to Education_Level
ordinal_encoder = LabelEncoder()
data['Education_Level'] = ordinal_encoder.fit_transform(data['Education_Level'].fillna('Unknown'))

# Apply one-hot encoding to City, Occupation, Product_Category
categorical_cols = ['City', 'Occupation', 'Product_Category']
one_hot_encoder = OneHotEncoder( drop='first')
encoded_features = one_hot_encoder.fit_transform(data[categorical_cols].fillna('Unknown'))
# Convert sparse matrix to DataFrame
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=one_hot_encoder.get_feature_names_out(categorical_cols))
# Display the encoded features
encoded_df.head()


# Drop original categorical columns and concatenate encoded features
data = data.drop(categorical_cols, axis=1)
data = pd.concat([data, encoded_df], axis=1)

# Display the updated dataset
data.head()

Unnamed: 0,Age,Annual_Income,Education_Level,Customer_Reviews,Purchase_Frequency,Target_Variable,City_Houston,City_Los Angeles,City_New York,City_Phoenix,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Scientist,Occupation_Teacher,Product_Category_Clothing,Product_Category_Electronics,Product_Category_Home Appliances,Product_Category_Toys
0,1.424085,0.279283,0,Amazing Highly recommended,-0.502438,High,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.223159,-0.062894,1,Decent quality but shipping took too long,0.190262,Low,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.556447,0.847581,1,Its okay not the best but does the job,-1.541488,Low,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.759891,-1.097116,3,Great product Would definitely buy again,1.575661,Low,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.554065,-1.03747,0,Decent quality but shipping took too long,0.074812,Medium,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [33]:
# Extract features from Customer_Reviews using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=100,stop_words='english')
# What is the purpose of max_features=100?
# max_features=100 limits the number of features to the top 100 based on TF-IDF scores.
# What is the purpose of stop_words='english'?
# stop_words='english' removes common English stop words from the analysis.
tfidf_features = tfidf_vectorizer.fit_transform(data['Customer_Reviews'].fillna('')).toarray()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f"TFIDF_{word}" for word in tfidf_feature_names])
data = pd.concat([data, tfidf_df], axis=1)

data = data.drop(['Customer_Reviews'], axis=1)

data.head()

Unnamed: 0,Age,Annual_Income,Education_Level,Purchase_Frequency,Target_Variable,City_Houston,City_Los Angeles,City_New York,City_Phoenix,Occupation_Doctor,...,TFIDF_product,TFIDF_purchase,TFIDF_quality,TFIDF_recommended,TFIDF_service,TFIDF_shipping,TFIDF_shopping,TFIDF_terrible,TFIDF_took,TFIDF_worst
0,1.424085,0.279283,0,-0.502438,High,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.223159,-0.062894,1,0.190262,Low,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0
2,1.556447,0.847581,1,-1.541488,Low,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.759891,-1.097116,3,1.575661,Low,1.0,0.0,0.0,0.0,0.0,...,0.413863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.554065,-1.03747,0,0.074812,Medium,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0


### Dimensionality Reduction using PCA

In [34]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data.drop(columns=['Target_Variable']))
pca_df = pd.DataFrame()
pca_df['PCA1'] = pca_result[:, 0]
pca_df['PCA2'] = pca_result[:, 1]

pca_df.head()


Unnamed: 0,PCA1,PCA2
0,-0.929067,-0.388588
1,-0.835004,0.3457
2,-0.050297,0.727317
3,1.214745,-1.075572
4,-2.061733,-0.219367
