# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#!pip install scikit-surprise

In [2]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# Sales Data

## Explore the data

### Duplicates

In [3]:
sales_data = pd.read_csv("E-commerece_sales_data_2024.csv")
sales_data.describe()
sales_data.head()
sales_data.columns

Index(['user id', 'product id', 'Interaction type', 'Time stamp',
       'Unnamed: 4'],
      dtype='object')

In [4]:
sales_data = sales_data.drop(columns=['Unnamed: 4'])

In [5]:
sales_data.head()

Unnamed: 0,user id,product id,Interaction type,Time stamp
0,1.0,4c69b61db1fc16e7013b43fc926e502d,purchase,10/10/2023 8:00
1,2.0,66d49bbed043f5be260fa9f7fbff5957,view,11/10/2023 8:00
2,3.0,2c55cae269aebf53838484b0d7dd931a,like,12/10/2023 8:00
3,4.0,18018b6bc416dab347b1b7db79994afa,view,13/10/2023 8:00
4,5.0,e04b990e95bf73bbe6a3fa09785d7cd0,like,14/10/2023 8:00


In [6]:
sales_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3289     True
3290     True
3291     True
3292     True
3293     True
Length: 3294, dtype: bool

In [7]:
sales_data.duplicated().sum()

294

In [8]:
sales_data = sales_data.drop_duplicates()

In [9]:
len(sales_data)

3000

### Missing data

In [10]:
sales_data["user id"]= sales_data["user id"].round().astype( int)
sales_data.head()

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

# Customer Details Data

## Explore the data

### Duplicates

In [11]:
customer_data = pd.read_csv("customer_details.csv")
customer_data.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [12]:
customer_data.columns

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')

In [13]:
customer_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3895    False
3896    False
3897    False
3898    False
3899    False
Length: 3900, dtype: bool

In [14]:
sales_data.duplicated().sum()

0

### Missing data

In [15]:
customer_data.isnull().sum().sort_values(ascending=False)

Customer ID               0
Age                       0
Payment Method            0
Previous Purchases        0
Promo Code Used           0
Discount Applied          0
Shipping Type             0
Subscription Status       0
Review Rating             0
Season                    0
Color                     0
Size                      0
Location                  0
Purchase Amount (USD)     0
Category                  0
Item Purchased            0
Gender                    0
Frequency of Purchases    0
dtype: int64

In [16]:
customer_data.columns

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')

In [17]:
customer_data.rename(columns={'Customer ID': 'user id'}, inplace=True)
customer_data.head()

Unnamed: 0,user id,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


# Products data

## Explore the data

### Duplicates

In [18]:
products_data = pd.read_csv("product_details.csv")
products_data.shape

(10002, 28)

In [19]:
products_data.columns

Index(['Uniqe Id', 'Product Name', 'Brand Name', 'Asin', 'Category',
       'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity',
       'Model Number', 'About Product', 'Product Specification',
       'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image',
       'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details',
       'Dimensions', 'Color', 'Ingredients', 'Direction To Use',
       'Is Amazon Seller', 'Size Quantity Variant', 'Product Description'],
      dtype='object')

In [20]:
products_data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
9997     False
9998     False
9999     False
10000    False
10001    False
Length: 10002, dtype: bool

In [21]:
products_data.duplicated().sum()

0

### Missing data

In [24]:
products_data.isnull().sum().sort_values(ascending=False)

Product Description      10002
Sku                      10002
Brand Name               10002
Asin                     10002
Size Quantity Variant    10002
List Price               10002
Direction To Use         10002
Quantity                 10002
Ingredients              10002
Color                    10002
Dimensions               10002
Product Details          10002
Stock                    10002
Upc Ean Code              9968
Product Dimensions        9523
Variants                  7524
Model Number              1770
Product Specification     1632
Shipping Weight           1138
Category                   830
Technical Details          790
About Product              273
Selling Price              107
Image                        0
Product Name                 0
Product Url                  0
Is Amazon Seller             0
Uniqe Id                     0
dtype: int64

In [22]:
columns_to_drop = ['Product Description', 'Sku', 'Brand Name', 'Asin',
                   'Size Quantity Variant', 'List Price', 'Direction To Use',
                   'Quantity', 'Ingredients', 'Color', 'Dimensions',
                   'Product Details', 'Stock', "Upc Ean Code", "Product Dimensions", "Variants", "Model Number",
                   "Product Specification", "Shipping Weight"]

products_data.drop(columns=columns_to_drop, inplace=True)

In [23]:
products_data.head()

Unnamed: 0,Uniqe Id,Product Name,Category,Selling Price,About Product,Technical Details,Image,Product Url,Is Amazon Seller
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,$237.68,Make sure this fits by entering your model num...,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,$99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Electronic-Circuits-Cla...,Y
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,$34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/3Doodler-Plastic-Innova...,Y
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,$28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Guillow-Airplane-Design...,Y
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,$17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Woodstock-Collage-500-p...,Y


In [25]:
products_data.columns

Index(['Uniqe Id', 'Product Name', 'Category', 'Selling Price',
       'About Product', 'Technical Details', 'Image', 'Product Url',
       'Is Amazon Seller'],
      dtype='object')

# Merging the datasets :

In [26]:
products_data['Product_Description'] = products_data['Product Name'] + ' ' + products_data['Technical Details'] + ' ' + products_data['Category']+ ' ' + products_data['About Product']

In [27]:
merged_data = pd.merge(sales_data, customer_data, on='user id', how='inner')
merged_data = pd.merge(merged_data, products_data, left_on='product id', right_on='Uniqe Id', how='left')
merged_data

Unnamed: 0,user id,product id,Interaction type,Time stamp,Age,Gender,Item Purchased,Category_x,Purchase Amount (USD),Location,...,Uniqe Id,Product Name,Category_y,Selling Price,About Product,Technical Details,Image,Product Url,Is Amazon Seller,Product_Description
0,1.0,4c69b61db1fc16e7013b43fc926e502d,purchase,10/10/2023 8:00,55,Male,Blouse,Clothing,53,Kentucky,...,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,$237.68,Make sure this fits by entering your model num...,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y,
1,2.0,66d49bbed043f5be260fa9f7fbff5957,view,11/10/2023 8:00,19,Male,Sweater,Clothing,64,Maine,...,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,$99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Electronic-Circuits-Cla...,Y,"Electronic Snap Circuits Mini Kits Classpack, ..."
2,3.0,2c55cae269aebf53838484b0d7dd931a,like,12/10/2023 8:00,50,Male,Jeans,Clothing,73,Massachusetts,...,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,$34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/3Doodler-Plastic-Innova...,Y,3Doodler Create Flexy 3D Printing Filament Ref...
3,4.0,18018b6bc416dab347b1b7db79994afa,view,13/10/2023 8:00,21,Male,Sandals,Footwear,90,Rhode Island,...,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,$28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Guillow-Airplane-Design...,Y,Guillow Airplane Design Studio with Travel Cas...
4,5.0,e04b990e95bf73bbe6a3fa09785d7cd0,like,14/10/2023 8:00,45,Male,Blouse,Clothing,49,Oregon,...,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,$17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Woodstock-Collage-500-p...,Y,Woodstock- Collage 500 pc Puzzle show up to 2 ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,2995.0,f5149cfb8e04d7b30bd7b4eaed6713b8,like,15/12/2023 8:00,69,Female,Sweater,Clothing,100,Nevada,...,f5149cfb8e04d7b30bd7b4eaed6713b8,Bandito MT 2.8 1/10 RC Monster Truck Tires wit...,Toys & Games | Hobbies | Remote & App Controll...,$27.31,Make sure this fits by entering your model num...,"Size:Mounted, 1/2"" Offset | Style:Bandito |...",https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Duratrax-Bandito-Monste...,Y,Bandito MT 2.8 1/10 RC Monster Truck Tires wit...
2995,2996.0,82318e8acf79bbeb3cf685a2732fb630,view,16/12/2023 8:00,29,Female,Jeans,Clothing,96,Virginia,...,82318e8acf79bbeb3cf685a2732fb630,Steiff Baby Teddy & Me Teddy Bear Boy with Paj...,Toys & Games | Stuffed Animals & Plush Toys | ...,$45.95,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Steiff-Baby-Teddy-Pajam...,Y,Steiff Baby Teddy & Me Teddy Bear Boy with Paj...
2996,2997.0,19d150365d798db47cccf1622b6ad754,purchase,17/12/2023 8:00,70,Female,Jewelry,Accessories,91,Utah,...,19d150365d798db47cccf1622b6ad754,uxcell 1500 A12121200ux0003-10000RPM DC3-12V H...,Toys & Games | Hobbies | Remote & App Controll...,$5.33,,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/a12121200ux0003-10000RP...,Y,
2997,2998.0,136c0aff3ab56bd3d138775fe90ef970,view,18/12/2023 8:00,41,Female,Sweater,Clothing,40,Idaho,...,136c0aff3ab56bd3d138775fe90ef970,Zvezda 5023 - 1/72 German King Tiger Ausf B He...,Toys & Games | Hobbies | Models & Model Kits |...,$14.90,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/German-Tiger-Henschel-T...,Y,Zvezda 5023 - 1/72 German King Tiger Ausf B He...


In [28]:
identical_ids = (merged_data['Uniqe Id'] == merged_data['product id']).all()

if identical_ids:
    print("The 'Uniqe Id' and 'product id' are identical for every row.")
else:
    print("There are discrepancies between 'Uniqe Id' and 'product id'.")

The 'Uniqe Id' and 'product id' are identical for every row.


In [29]:
#Remove the duplicated 'Uniqe Id' column
merged_data.drop(columns='Uniqe Id', inplace=True)
merged_data['Selling Price'] = merged_data['Selling Price'].str.replace('$', '')
merged_data[['Selling Price']]

  merged_data['Selling Price'] = merged_data['Selling Price'].str.replace('$', '')


Unnamed: 0,Selling Price
0,237.68
1,99.95
2,34.99
3,28.91
4,17.49
...,...
2994,27.31
2995,45.95
2996,5.33
2997,14.90


In [30]:
import re
import pandas as pd
import numpy as np

# Step 1: Remove strange characters and spaces
def clean_price(price):
    # Check if the value is NaN
    if pd.isna(price):
        return np.nan
    # Remove non-numeric characters and spaces
    price = re.sub(r'[^0-9.]', '', str(price))
    return price

# Clean 'Selling Price' column
merged_data['Selling Price'] = merged_data['Selling Price'].apply(clean_price)

# Step 2: Convert to numeric
merged_data['Selling Price'] = pd.to_numeric(merged_data['Selling Price'], errors='coerce')  # 'coerce' to handle any conversion errors

In [31]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2999 entries, 0 to 2998
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user id                 2999 non-null   float64
 1   product id              2999 non-null   object 
 2   Interaction type        2871 non-null   object 
 3   Time stamp              2999 non-null   object 
 4   Age                     2999 non-null   int64  
 5   Gender                  2999 non-null   object 
 6   Item Purchased          2999 non-null   object 
 7   Category_x              2999 non-null   object 
 8   Purchase Amount (USD)   2999 non-null   int64  
 9   Location                2999 non-null   object 
 10  Size                    2999 non-null   object 
 11  Color                   2999 non-null   object 
 12  Season                  2999 non-null   object 
 13  Review Rating           2999 non-null   float64
 14  Subscription Status     2999 non-null   

In [32]:
merged_data['Selling Price'].unique

<bound method Series.unique of 0       237.68
1        99.95
2        34.99
3        28.91
4        17.49
         ...  
2994     27.31
2995     45.95
2996      5.33
2997     14.90
2998      9.44
Name: Selling Price, Length: 2999, dtype: float64>

In [33]:
import numpy as np

# Calculate quartiles
Q1 = np.percentile(merged_data['Selling Price'], 25)
Q3 = np.percentile(merged_data['Selling Price'], 75)

# Calculate IQR
IQR = Q3 - Q1

# Define threshold for outliers
threshold = 1.5 * IQR

# Identify outliers
outliers = merged_data['Selling Price'][(merged_data['Selling Price'] < Q1 - threshold) | (merged_data['Selling Price'] > Q3 + threshold)]

print("Outliers:", outliers)

Outliers: Series([], Name: Selling Price, dtype: float64)


In [34]:
products_data.head()

Unnamed: 0,Uniqe Id,Product Name,Category,Selling Price,About Product,Technical Details,Image,Product Url,Is Amazon Seller,Product_Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,$237.68,Make sure this fits by entering your model num...,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y,
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,$99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Electronic-Circuits-Cla...,Y,"Electronic Snap Circuits Mini Kits Classpack, ..."
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,$34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/3Doodler-Plastic-Innova...,Y,3Doodler Create Flexy 3D Printing Filament Ref...
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,$28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Guillow-Airplane-Design...,Y,Guillow Airplane Design Studio with Travel Cas...
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,$17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Woodstock-Collage-500-p...,Y,Woodstock- Collage 500 pc Puzzle show up to 2 ...


In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder

In [36]:
df = merged_data[['Location','Age','Gender','Product Name','Selling Price','About Product','Technical Details','Category_y','Interaction type']]
df

Unnamed: 0,Location,Age,Gender,Product Name,Selling Price,About Product,Technical Details,Category_y,Interaction type
0,Kentucky,55,Male,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",237.68,Make sure this fits by entering your model num...,,Sports & Outdoors | Outdoor Recreation | Skate...,purchase
1,Maine,19,Male,"Electronic Snap Circuits Mini Kits Classpack, ...",99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,Toys & Games | Learning & Education | Science ...,view
2,Massachusetts,50,Male,3Doodler Create Flexy 3D Printing Filament Ref...,34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,Toys & Games | Arts & Crafts | Craft Kits,like
3,Rhode Island,21,Male,Guillow Airplane Design Studio with Travel Cas...,28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,Toys & Games | Hobbies | Models & Model Kits |...,view
4,Oregon,45,Male,Woodstock- Collage 500 pc Puzzle,17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,Toys & Games | Puzzles | Jigsaw Puzzles,like
...,...,...,...,...,...,...,...,...,...
2994,Nevada,69,Female,Bandito MT 2.8 1/10 RC Monster Truck Tires wit...,27.31,Make sure this fits by entering your model num...,"Size:Mounted, 1/2"" Offset | Style:Bandito |...",Toys & Games | Hobbies | Remote & App Controll...,like
2995,Virginia,29,Female,Steiff Baby Teddy & Me Teddy Bear Boy with Paj...,45.95,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,Toys & Games | Stuffed Animals & Plush Toys | ...,view
2996,Utah,70,Female,uxcell 1500 A12121200ux0003-10000RPM DC3-12V H...,5.33,,,Toys & Games | Hobbies | Remote & App Controll...,purchase
2997,Idaho,41,Female,Zvezda 5023 - 1/72 German King Tiger Ausf B He...,14.90,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,Toys & Games | Hobbies | Models & Model Kits |...,view


In [37]:
df['Product_Description'] = df['About Product'] + ' ' + df['Technical Details'] + ' ' + df['Category_y']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Product_Description'] = df['About Product'] + ' ' + df['Technical Details'] + ' ' + df['Category_y']


In [38]:
df

Unnamed: 0,Location,Age,Gender,Product Name,Selling Price,About Product,Technical Details,Category_y,Interaction type,Product_Description
0,Kentucky,55,Male,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",237.68,Make sure this fits by entering your model num...,,Sports & Outdoors | Outdoor Recreation | Skate...,purchase,
1,Maine,19,Male,"Electronic Snap Circuits Mini Kits Classpack, ...",99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,Toys & Games | Learning & Education | Science ...,view,Make sure this fits by entering your model num...
2,Massachusetts,50,Male,3Doodler Create Flexy 3D Printing Filament Ref...,34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,Toys & Games | Arts & Crafts | Craft Kits,like,Make sure this fits by entering your model num...
3,Rhode Island,21,Male,Guillow Airplane Design Studio with Travel Cas...,28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,Toys & Games | Hobbies | Models & Model Kits |...,view,Make 8 different Planes at one time. | Experim...
4,Oregon,45,Male,Woodstock- Collage 500 pc Puzzle,17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,Toys & Games | Puzzles | Jigsaw Puzzles,like,Make sure this fits by entering your model num...
...,...,...,...,...,...,...,...,...,...,...
2994,Nevada,69,Female,Bandito MT 2.8 1/10 RC Monster Truck Tires wit...,27.31,Make sure this fits by entering your model num...,"Size:Mounted, 1/2"" Offset | Style:Bandito |...",Toys & Games | Hobbies | Remote & App Controll...,like,Make sure this fits by entering your model num...
2995,Virginia,29,Female,Steiff Baby Teddy & Me Teddy Bear Boy with Paj...,45.95,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,Toys & Games | Stuffed Animals & Plush Toys | ...,view,Make sure this fits by entering your model num...
2996,Utah,70,Female,uxcell 1500 A12121200ux0003-10000RPM DC3-12V H...,5.33,,,Toys & Games | Hobbies | Remote & App Controll...,purchase,
2997,Idaho,41,Female,Zvezda 5023 - 1/72 German King Tiger Ausf B He...,14.90,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,Toys & Games | Hobbies | Models & Model Kits |...,view,Make sure this fits by entering your model num...


In [39]:
df.drop(columns=['About Product', 'Technical Details', 'Category_y'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['About Product', 'Technical Details', 'Category_y'], inplace=True)


In [40]:
df

Unnamed: 0,Location,Age,Gender,Product Name,Selling Price,Interaction type,Product_Description
0,Kentucky,55,Male,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",237.68,purchase,
1,Maine,19,Male,"Electronic Snap Circuits Mini Kits Classpack, ...",99.95,view,Make sure this fits by entering your model num...
2,Massachusetts,50,Male,3Doodler Create Flexy 3D Printing Filament Ref...,34.99,like,Make sure this fits by entering your model num...
3,Rhode Island,21,Male,Guillow Airplane Design Studio with Travel Cas...,28.91,view,Make 8 different Planes at one time. | Experim...
4,Oregon,45,Male,Woodstock- Collage 500 pc Puzzle,17.49,like,Make sure this fits by entering your model num...
...,...,...,...,...,...,...,...
2994,Nevada,69,Female,Bandito MT 2.8 1/10 RC Monster Truck Tires wit...,27.31,like,Make sure this fits by entering your model num...
2995,Virginia,29,Female,Steiff Baby Teddy & Me Teddy Bear Boy with Paj...,45.95,view,Make sure this fits by entering your model num...
2996,Utah,70,Female,uxcell 1500 A12121200ux0003-10000RPM DC3-12V H...,5.33,purchase,
2997,Idaho,41,Female,Zvezda 5023 - 1/72 German King Tiger Ausf B He...,14.90,view,Make sure this fits by entering your model num...


In [41]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd


X = df[['Location', 'Age', 'Gender', 'Product Name', 'Selling Price', 'Product_Description']]
y = df['Interaction type']  

categorical_features = ['Gender', 'Location', 'Product Name']
numeric_features = ['Age', 'Selling Price']

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#preprocessor = ColumnTransformer(
 ##   transformers=[
#        ('num', numeric_transformer, numeric_features),
 #       ('cat', categorical_transformer, categorical_features)
 #   ])

#X_preprocessed = preprocessor.fit_transform(X)


In [52]:
pca_transformer = PCA(n_components=2)

# Include PCA in the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('pca', pca_transformer, numeric_features + categorical_features)
    ])

# Preprocess the data
X_preprocessed = preprocessor.fit_transform(X)

# Fit KMeans on preprocessed data
kmeans = KMeans(n_clusters=3, random_state=42)  
kmeans.fit(X_preprocessed)
cluster_labels = kmeans.labels_

# Map cluster labels to interaction types
cluster_mapping = {0: 'view', 1: 'purchase', 2: 'like'}  
y_pred = pd.Series(cluster_labels).map(cluster_mapping)

# Calculate accuracy
accuracy = accuracy_score(y_pred, y)

ValueError: could not convert string to float: 'Male'

In [48]:
# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()

# y_true_numeric = label_encoder.fit_transform(y)


In [49]:
# kmeans = KMeans(n_clusters=3, random_state=42)  
# kmeans.fit(X_preprocessed)

In [50]:
# cluster_labels = kmeans.labels_

# cluster_mapping = {0: 'view', 1: 'purchase', 2: 'like'}  

# y_pred = pd.Series(cluster_labels).map(cluster_mapping)

In [51]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y)

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
# X = df[['Location', 'Age', 'Gender', 'Product Name', 'Selling Price', 'Interaction type', 'Product_Description']]
# y = df['Interaction type']


# categorical_features = ['Gender', 'Location', 'Product Name']
# numeric_features = ['Age', 'Selling Price']

# numeric_transformer = Pipeline(steps=[
#     ('imputer', KNNImputer(n_neighbors=5)),
#     ('scaler', StandardScaler())
# ])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# target_transformer = Pipeline(steps=[
#     ('encoder', LabelEncoder())
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features),
#         ('target', target_transformer, ['Interaction type'])
#     ])

# X_preprocessed = preprocessor.fit_transform(X)

In [None]:

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

# Assuming df is your DataFrame

# Splitting data into features and target
X = df.drop(columns='Interaction type')
y = df['Interaction type']

# Label encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Defining preprocessing for numerical and categorical features
numeric_features = ['Age', 'Selling Price']
categorical_features = ['Gender', 'Location', 'Product Name']
text_feature = 'Product_Description'  # Ensure this is consistent with your DataFrame

# Preprocessing for numerical features
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define the ColumnTransformer with updated transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# TF-IDF Vectorizer for text data
tfidf_transformer = TfidfVectorizer()

# Preprocess the non-text features
X_train_preprocessed = preprocessor.fit_transform(X_train.drop(columns=text_feature))
X_test_preprocessed = preprocessor.transform(X_test.drop(columns=text_feature))

# Process text feature separately
text_data_train_tfidf = tfidf_transformer.fit_transform(X_train[text_feature])
text_data_test_tfidf = tfidf_transformer.transform(X_test[text_feature])

# Concatenate the TF-IDF vectorized text feature with the other preprocessed features
X_train_final = hstack([X_train_preprocessed, text_data_train_tfidf])
X_test_final = hstack([X_test_preprocessed, text_data_test_tfidf])


