# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#!pip install scikit-surprise

In [2]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# Sales Data

## Explore the data

### Duplicates

In [3]:
sales_data = pd.read_csv("E-commerece_sales_data_2024.csv")
sales_data.describe()
sales_data.head()
sales_data.columns

Index(['user id', 'product id', 'Interaction type', 'Time stamp',
       'Unnamed: 4'],
      dtype='object')

In [4]:
sales_data = sales_data.drop(columns=['Unnamed: 4'])

In [5]:
sales_data.head()

Unnamed: 0,user id,product id,Interaction type,Time stamp
0,1.0,4c69b61db1fc16e7013b43fc926e502d,purchase,10/10/2023 8:00
1,2.0,66d49bbed043f5be260fa9f7fbff5957,view,11/10/2023 8:00
2,3.0,2c55cae269aebf53838484b0d7dd931a,like,12/10/2023 8:00
3,4.0,18018b6bc416dab347b1b7db79994afa,view,13/10/2023 8:00
4,5.0,e04b990e95bf73bbe6a3fa09785d7cd0,like,14/10/2023 8:00


In [6]:
sales_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3289     True
3290     True
3291     True
3292     True
3293     True
Length: 3294, dtype: bool

In [7]:
sales_data.duplicated().sum()

294

In [8]:
sales_data = sales_data.drop_duplicates()

In [9]:
len(sales_data)

3000

### Missing data

### Visualizing the data

In [10]:
sales_data[['Date', 'Time']] = sales_data['Time stamp'].str.split(' ', expand=True)
sales_data[['Date', 'Time']]

Unnamed: 0,Date,Time
0,10/10/2023,8:00
1,11/10/2023,8:00
2,12/10/2023,8:00
3,13/10/2023,8:00
4,14/10/2023,8:00
...,...,...
2995,16/12/2023,8:00
2996,17/12/2023,8:00
2997,18/12/2023,8:00
2998,19/12/2023,8:00


In [11]:
sales_data["user id"]= sales_data["user id"].round().astype( int)
sales_data.head()

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

# Customer Details Data

## Explore the data

### Duplicates

In [12]:
customer_data = pd.read_csv("customer_details.csv")
customer_data.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [13]:
customer_data.columns

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')

In [14]:
customer_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3895    False
3896    False
3897    False
3898    False
3899    False
Length: 3900, dtype: bool

In [15]:
sales_data.duplicated().sum()

0

### Missing data

In [16]:
customer_data.isnull().sum().sort_values(ascending=False)

Customer ID               0
Age                       0
Payment Method            0
Previous Purchases        0
Promo Code Used           0
Discount Applied          0
Shipping Type             0
Subscription Status       0
Review Rating             0
Season                    0
Color                     0
Size                      0
Location                  0
Purchase Amount (USD)     0
Category                  0
Item Purchased            0
Gender                    0
Frequency of Purchases    0
dtype: int64

In [17]:
customer_data.columns

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')

In [18]:
customer_data.rename(columns={'Customer ID': 'user id'}, inplace=True)
customer_data.head()

Unnamed: 0,user id,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


# Products data

## Explore the data

### Duplicates

In [19]:
products_data = pd.read_csv("product_details.csv")
products_data.shape

(10002, 28)

In [20]:
products_data.columns

Index(['Uniqe Id', 'Product Name', 'Brand Name', 'Asin', 'Category',
       'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity',
       'Model Number', 'About Product', 'Product Specification',
       'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image',
       'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details',
       'Dimensions', 'Color', 'Ingredients', 'Direction To Use',
       'Is Amazon Seller', 'Size Quantity Variant', 'Product Description'],
      dtype='object')

In [21]:

products_data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
9997     False
9998     False
9999     False
10000    False
10001    False
Length: 10002, dtype: bool

In [22]:
products_data.duplicated().sum()

0

### Missing data

In [23]:
products_data.isnull().sum().sort_values(ascending=False)

Product Description      10002
Sku                      10002
Brand Name               10002
Asin                     10002
Size Quantity Variant    10002
List Price               10002
Direction To Use         10002
Quantity                 10002
Ingredients              10002
Color                    10002
Dimensions               10002
Product Details          10002
Stock                    10002
Upc Ean Code              9968
Product Dimensions        9523
Variants                  7524
Model Number              1770
Product Specification     1632
Shipping Weight           1138
Category                   830
Technical Details          790
About Product              273
Selling Price              107
Image                        0
Product Name                 0
Product Url                  0
Is Amazon Seller             0
Uniqe Id                     0
dtype: int64

In [24]:
columns_to_drop = ['Product Description', 'Sku', 'Brand Name', 'Asin',
                   'Size Quantity Variant', 'List Price', 'Direction To Use',
                   'Quantity', 'Ingredients', 'Color', 'Dimensions',
                   'Product Details', 'Stock', "Upc Ean Code", "Product Dimensions", "Variants", "Model Number",
                   "Product Specification", "Shipping Weight"]

products_data.drop(columns=columns_to_drop, inplace=True)

In [25]:
products_data.head()

Unnamed: 0,Uniqe Id,Product Name,Category,Selling Price,About Product,Technical Details,Image,Product Url,Is Amazon Seller
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,$237.68,Make sure this fits by entering your model num...,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,$99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Electronic-Circuits-Cla...,Y
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,$34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/3Doodler-Plastic-Innova...,Y
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,$28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Guillow-Airplane-Design...,Y
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,$17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Woodstock-Collage-500-p...,Y


In [26]:
columns_to_fill = ['Category', 'Technical Details', 'About Product', 'Selling Price']

products_data[columns_to_fill] = products_data[columns_to_fill].fillna(products_data[columns_to_fill].mode().iloc[0])

In [27]:
products_data.isnull().sum().sort_values(ascending=False)

Uniqe Id             0
Product Name         0
Category             0
Selling Price        0
About Product        0
Technical Details    0
Image                0
Product Url          0
Is Amazon Seller     0
dtype: int64

In [28]:
products_data.columns

Index(['Uniqe Id', 'Product Name', 'Category', 'Selling Price',
       'About Product', 'Technical Details', 'Image', 'Product Url',
       'Is Amazon Seller'],
      dtype='object')

# Merging the datasets :

In [29]:
#products_data['Product_Description'] = products_data['Product Name'] + ' ' + products_data['Technical Details'] + ' ' + products_data['Category']+ ' ' + products_data['About Product']

In [30]:
merged_data = pd.merge(sales_data, customer_data, on='user id', how='inner')
merged_data = pd.merge(merged_data, products_data, left_on='product id', right_on='Uniqe Id', how='left')
merged_data

Unnamed: 0,user id,product id,Interaction type,Time stamp,Date,Time,Age,Gender,Item Purchased,Category_x,...,Frequency of Purchases,Uniqe Id,Product Name,Category_y,Selling Price,About Product,Technical Details,Image,Product Url,Is Amazon Seller
0,1.0,4c69b61db1fc16e7013b43fc926e502d,purchase,10/10/2023 8:00,10/10/2023,8:00,55,Male,Blouse,Clothing,...,Fortnightly,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,$237.68,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y
1,2.0,66d49bbed043f5be260fa9f7fbff5957,view,11/10/2023 8:00,11/10/2023,8:00,19,Male,Sweater,Clothing,...,Fortnightly,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,$99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Electronic-Circuits-Cla...,Y
2,3.0,2c55cae269aebf53838484b0d7dd931a,like,12/10/2023 8:00,12/10/2023,8:00,50,Male,Jeans,Clothing,...,Weekly,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,$34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/3Doodler-Plastic-Innova...,Y
3,4.0,18018b6bc416dab347b1b7db79994afa,view,13/10/2023 8:00,13/10/2023,8:00,21,Male,Sandals,Footwear,...,Weekly,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,$28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Guillow-Airplane-Design...,Y
4,5.0,e04b990e95bf73bbe6a3fa09785d7cd0,like,14/10/2023 8:00,14/10/2023,8:00,45,Male,Blouse,Clothing,...,Annually,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,$17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Woodstock-Collage-500-p...,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,2995.0,f5149cfb8e04d7b30bd7b4eaed6713b8,like,15/12/2023 8:00,15/12/2023,8:00,69,Female,Sweater,Clothing,...,Quarterly,f5149cfb8e04d7b30bd7b4eaed6713b8,Bandito MT 2.8 1/10 RC Monster Truck Tires wit...,Toys & Games | Hobbies | Remote & App Controll...,$27.31,Make sure this fits by entering your model num...,"Size:Mounted, 1/2"" Offset | Style:Bandito |...",https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Duratrax-Bandito-Monste...,Y
2995,2996.0,82318e8acf79bbeb3cf685a2732fb630,view,16/12/2023 8:00,16/12/2023,8:00,29,Female,Jeans,Clothing,...,Bi-Weekly,82318e8acf79bbeb3cf685a2732fb630,Steiff Baby Teddy & Me Teddy Bear Boy with Paj...,Toys & Games | Stuffed Animals & Plush Toys | ...,$45.95,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Steiff-Baby-Teddy-Pajam...,Y
2996,2997.0,19d150365d798db47cccf1622b6ad754,purchase,17/12/2023 8:00,17/12/2023,8:00,70,Female,Jewelry,Accessories,...,Every 3 Months,19d150365d798db47cccf1622b6ad754,uxcell 1500 A12121200ux0003-10000RPM DC3-12V H...,Toys & Games | Hobbies | Remote & App Controll...,$5.33,Hand Wash in Cold Water. | Line Dry. | No Bleach.,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/a12121200ux0003-10000RP...,Y
2997,2998.0,136c0aff3ab56bd3d138775fe90ef970,view,18/12/2023 8:00,18/12/2023,8:00,41,Female,Sweater,Clothing,...,Every 3 Months,136c0aff3ab56bd3d138775fe90ef970,Zvezda 5023 - 1/72 German King Tiger Ausf B He...,Toys & Games | Hobbies | Models & Model Kits |...,$14.90,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/German-Tiger-Henschel-T...,Y


In [31]:
identical_ids = (merged_data['Uniqe Id'] == merged_data['product id']).all()

if identical_ids:
    print("The 'Uniqe Id' and 'product id' are identical for every row.")
else:
    print("There are discrepancies between 'Uniqe Id' and 'product id'.")

The 'Uniqe Id' and 'product id' are identical for every row.


In [32]:
#Remove the duplicated 'Uniqe Id' column
merged_data.drop(columns='Uniqe Id', inplace=True)
merged_data['Selling Price'] = merged_data['Selling Price'].str.replace('$', '')
merged_data[['Selling Price']]

  merged_data['Selling Price'] = merged_data['Selling Price'].str.replace('$', '')


Unnamed: 0,Selling Price
0,237.68
1,99.95
2,34.99
3,28.91
4,17.49
...,...
2994,27.31
2995,45.95
2996,5.33
2997,14.90


In [33]:
#merged_data = pd.merge(sales_data, customer_data, left_on='user id', right_on='Customer ID', how='left')
#merged_data = pd.merge(merged_data, products_data, left_on='product id', right_on='Uniqe Id', how='left')
#merged_data

In [34]:
import re
import pandas as pd
import numpy as np

# Step 1: Remove strange characters and spaces
def clean_price(price):
    # Check if the value is NaN
    if pd.isna(price):
        return np.nan
    # Remove non-numeric characters and spaces
    price = re.sub(r'[^0-9.]', '', str(price))
    return price

# Clean 'Selling Price' column
merged_data['Selling Price'] = merged_data['Selling Price'].apply(clean_price)

# Step 2: Convert to numeric
merged_data['Selling Price'] = pd.to_numeric(merged_data['Selling Price'], errors='coerce')  # 'coerce' to handle any conversion errors

In [35]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2999 entries, 0 to 2998
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user id                 2999 non-null   float64
 1   product id              2999 non-null   object 
 2   Interaction type        2871 non-null   object 
 3   Time stamp              2999 non-null   object 
 4   Date                    2999 non-null   object 
 5   Time                    2999 non-null   object 
 6   Age                     2999 non-null   int64  
 7   Gender                  2999 non-null   object 
 8   Item Purchased          2999 non-null   object 
 9   Category_x              2999 non-null   object 
 10  Purchase Amount (USD)   2999 non-null   int64  
 11  Location                2999 non-null   object 
 12  Size                    2999 non-null   object 
 13  Color                   2999 non-null   object 
 14  Season                  2999 non-null   

In [36]:
merged_data['Selling Price'].unique

<bound method Series.unique of 0       237.68
1        99.95
2        34.99
3        28.91
4        17.49
         ...  
2994     27.31
2995     45.95
2996      5.33
2997     14.90
2998      9.44
Name: Selling Price, Length: 2999, dtype: float64>

In [37]:
import numpy as np

# Calculate quartiles
Q1 = np.percentile(merged_data['Selling Price'], 25)
Q3 = np.percentile(merged_data['Selling Price'], 75)

# Calculate IQR
IQR = Q3 - Q1

# Define threshold for outliers
threshold = 1.5 * IQR

# Identify outliers
outliers = merged_data['Selling Price'][(merged_data['Selling Price'] < Q1 - threshold) | (merged_data['Selling Price'] > Q3 + threshold)]

print("Outliers:", outliers)

Outliers: Series([], Name: Selling Price, dtype: float64)


In [38]:
products_data.head()

Unnamed: 0,Uniqe Id,Product Name,Category,Selling Price,About Product,Technical Details,Image,Product Url,Is Amazon Seller
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,$237.68,Make sure this fits by entering your model num...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,$99.95,Make sure this fits by entering your model num...,The snap circuits mini kits classpack provides...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Electronic-Circuits-Cla...,Y
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,$34.99,Make sure this fits by entering your model num...,show up to 2 reviews by default No longer are ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/3Doodler-Plastic-Innova...,Y
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,$28.91,Make 8 different Planes at one time. | Experim...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Guillow-Airplane-Design...,Y
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,$17.49,Make sure this fits by entering your model num...,show up to 2 reviews by default 100% Officiall...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Woodstock-Collage-500-p...,Y


# The TF-IDF Matrix :

In [None]:
#products_data["Product_Description"] = products_data["Product_Description"].str.replace(r"[^\w\s]", " ", regex=True).str.replace(r"[\d]", " ", regex=True)

In [39]:
tfidf = TfidfVectorizer(stop_words="english", min_df=4)

In [40]:
tfidf_matrix = tfidf.fit_transform(products_data["Product Name"])

In [41]:
similarity = cosine_similarity(tfidf_matrix,tfidf_matrix)
similarity.shape

(10002, 10002)

In [42]:
index = products_data[products_data["Uniqe Id"] == '1a22f23576bfdfe5ed6c887dc117aab6'].index[0]


In [43]:
feature_names = tfidf.get_feature_names_out()

In [44]:
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
similarity_scores = pd.DataFrame(similarity[index],
                                  columns=["similarity"])

In [46]:
product_indices = similarity_scores.sort_values("similarity", ascending=False)[1:11].index

In [47]:
similar_product_names = products_data['Product Name'].iloc[product_indices]
similar_product_names

6991    Remedia Publications REM501 Addition Timed Mat...
230     EVAN-MOOR 4545 Skill Sharpeners Math Book, Gra...
8161    American Educational Products 480 eleMENTALS A...
8842    On the Mark Press OTM119 Mapping Skills Activi...
8362    On the Mark Press OTM1132 Multiplication & Div...
4283    Constructive Playthings SVL-468 Gingerbread Ma...
8577    Learning Advantage 4524"Where's Wilma?" Game, ...
7357    Dick Martin Sports MASP12 Parachute with 12 Ha...
7838    American Educational Products 6-740126 Koontz ...
5741    Constructive Playthings TYE-31 Soft Flexible B...
Name: Product Name, dtype: object

# Model-Based Recommender System

## Baseline Model

In [None]:
#df = merged_data[["user id", "product id", "Interaction type", "Product_Description", "Product Name", "Date", "Time", 'Age', 'Gender', 'Location', 'Category_y', 'Selling Price']]

In [48]:
df = merged_data[['Product Name', 'Category_y', 'Selling Price', 'About Product', 'Interaction type', 'Age', 'Gender', 'Location']]

In [49]:
df.head()

Unnamed: 0,Product Name,Category_y,Selling Price,About Product,Interaction type,Age,Gender,Location
0,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,237.68,Make sure this fits by entering your model num...,purchase,55,Male,Kentucky
1,"Electronic Snap Circuits Mini Kits Classpack, ...",Toys & Games | Learning & Education | Science ...,99.95,Make sure this fits by entering your model num...,view,19,Male,Maine
2,3Doodler Create Flexy 3D Printing Filament Ref...,Toys & Games | Arts & Crafts | Craft Kits,34.99,Make sure this fits by entering your model num...,like,50,Male,Massachusetts
3,Guillow Airplane Design Studio with Travel Cas...,Toys & Games | Hobbies | Models & Model Kits |...,28.91,Make 8 different Planes at one time. | Experim...,view,21,Male,Rhode Island
4,Woodstock- Collage 500 pc Puzzle,Toys & Games | Puzzles | Jigsaw Puzzles,17.49,Make sure this fits by entering your model num...,like,45,Male,Oregon


In [50]:
df.columns

Index(['Product Name', 'Category_y', 'Selling Price', 'About Product',
       'Interaction type', 'Age', 'Gender', 'Location'],
      dtype='object')

In [None]:
#import re
#def clean_text(text):
    # Remove strange characters
    #text = re.sub(r'[^a-zA-Z0-9\s-]', '', text)
    # Remove extra spaces and dashes
    #text = re.sub(r'\s+', ' ', text)
    #text = re.sub(r'-+', '-', text)
    # Remove leading and trailing spaces
    #text = text.strip()
    #return text

#columns_to_clean = ['Product_Description', 'Gender', 'Location', 'Product Name', 'Interaction type', 'Selling Price']
#for column in columns_to_clean:
    #df.loc[:, column] = df[column].apply(clean_text)

# Assuming your DataFrame is named df
#df['Selling Price'] = df['Selling Price'].apply(lambda x: float(x.split(' - ')[0]) if '-' in x else float(x))


In [51]:
interactions = {'purchase': 5, 'like': 4, 'view': 3}
df['rating'] = df['Interaction type'].map(interactions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'] = df['Interaction type'].map(interactions)


In [52]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user id', 'product id', 'rating']], reader)

KeyError: "['user id', 'product id'] not in index"

In [None]:
trainset, testset = train_test_split(data, test_size=.25)
svd_model = SVD()
svd_model.fit(trainset)
predictions = svd_model.test(testset)

In [None]:
accuracy.rmse(predictions)

In [None]:
# svd_model.predict(uid=1, iid=541, verbose=True)

In [None]:
# product_ids = ["4c69b61db1fc16e7013b43fc926e502d", "2c55cae269aebf53838484b0d7dd931a", "f5149cfb8e04d7b30bd7b4eaed6713b8"]

In [None]:
# sample_df = df[df["product id"].isin(product_ids)]

In [None]:
# sample_df.head()

In [None]:
# sample_df[sample_df["user id"] ==1 ]

## Model Tunning

In [None]:
# param_grid = {'n_epochs': [5, 10, 20],
#               'lr_all': [0.002, 0.005, 0.007]}

In [None]:
# gs = GridSearchCV(SVD,
#                   param_grid,
#                   measures=['rmse', 'mae'],
#                   cv=3,
#                   n_jobs=-1,
#                   joblib_verbose=True)

In [None]:
# gs.fit(data)

In [None]:
# gs.best_score['rmse']

In [None]:
# gs.best_params['rmse']

## Predict

In [None]:
# svd_model.n_epochs

In [None]:
# svd_model = SVD(**gs.best_params['rmse'])

In [None]:
# data = data.build_full_trainset()

In [None]:
# svd_model.fit(data)

In [None]:
# svd_model.predict(uid=1.0, iid=541, verbose=True)

In [None]:
# def suggest(df, user_id, sug):
#     didnt_interact = df["product id"][~(df["user id"] == user_id)].drop_duplicates().values.tolist()
#     temp_dict = {}

#     for i in didnt_interact:
#         temp_dict[i] = svd_model.predict(uid=user_id, iid=i)[3]

#     suggestions = pd.DataFrame(temp_dict.items(), columns=["product id", 'possible_rate']).sort_values(by="possible_rate", ascending=False).head(sug)
#     merged = pd.merge(suggestions, df[["product id", "Product Name"]], how="inner", on="product id")

#     return merged

In [None]:
# suggest(df,1,15).sort_values(by="possible_rate", ascending=False)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder

In [None]:
df.rename(columns={'Product Name': 'Product_Name'}, inplace=True)
df.rename(columns={'Selling Price': 'Selling_Price'}, inplace=True)
df.rename(columns={'Interaction type': 'Interaction_type'}, inplace=True)
df.rename(columns={'About Product': 'About_Product'}, inplace=True)


In [None]:
df

In [None]:

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

# Assuming df is your DataFrame

# Splitting data into features and target
X = df.drop(columns='Interaction_type')
y = df['Interaction_type']

# Label encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Defining preprocessing for numerical and categorical features
numeric_features = ['Age', 'Selling_Price']
categorical_features = ['Gender', 'Location']
text_feature = 'Product_Name', 'Category_y', 'About_Product'   # Ensure this is consistent with your DataFrame

# Preprocessing for numerical features
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define the ColumnTransformer with updated transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# TF-IDF Vectorizer for text data
tfidf_transformer = TfidfVectorizer()

# Preprocess the non-text features
X_train_preprocessed = preprocessor.fit_transform(X_train.drop(columns=text_feature))
X_test_preprocessed = preprocessor.transform(X_test.drop(columns=text_feature))

# Process text feature separately
text_data_train_tfidf = tfidf_transformer.fit_transform(X_train[text_feature])
text_data_test_tfidf = tfidf_transformer.transform(X_test[text_feature])

# Concatenate the TF-IDF vectorized text feature with the other preprocessed features
X_train_final = hstack([X_train_preprocessed, text_data_train_tfidf])
X_test_final = hstack([X_test_preprocessed, text_data_test_tfidf])




In [None]:
m

In [None]:
X_train_final.shape

In [None]:
X_test_final.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
df

In [None]:
df.drop(['Date', 'Time', 'rating'], axis=1, inplace=True)

# Add 'Category_y' inside 'Product_Description' column
df['Product_Description'] = df['Product_Description'] + ' ' + df['Category_y']

# Rename 'Interaction type' to 'Interaction_type'
df.rename(columns={'Interaction type': 'Interaction_type'}, inplace=True)

In [None]:
df.drop(['user id', 'product id', 'Category_y'], axis=1, inplace=True)

In [None]:
df.rename(columns={'Product Name': 'Product_Name'}, inplace=True)
df.rename(columns={'Selling Price': 'Selling_Price'}, inplace=True)


In [None]:
df

In [None]:
models = [
    ('Random Forest', RandomForestClassifier(), {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [None, 5, 10]
    }),
    ('Gradient Boosting', GradientBoostingClassifier(), {
        'clf__n_estimators': [50, 100, 200],
        'clf__learning_rate': [0.01, 0.1, 0.5]
    }),
    ('Support Vector Machine', SVC(), {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    }),
    ('K-Nearest Neighbors', KNeighborsClassifier(), {
        'clf__n_neighbors': [3, 5, 10],
        'clf__weights': ['uniform', 'distance']
    })
]

In [None]:

results = {}
for model_name, model, param_grid in models:
    print(f"Performing GridSearchCV for {model_name}...")
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', model)
    ])
    gs = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    gs.fit(X_train, y_train)
    results[model_name] = gs

# Display results
for model_name, gs_result in results.items():
    print(f"Best {model_name} Parameters: {gs_result.best_params_}")
    print(f"Best {model_name} Mean Accuracy: {gs_result.best_score_}")


In [None]:
import pickle
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(gs.best_estimator_, f)

In [None]:
!pip install fastapi

In [None]:
pip install --upgrade fastapi uvicorn

In [None]:
!pip install --upgrade typing-extensions

In [None]:
# !pip install tpot

In [None]:
# !pip install --upgrade scikit-learn

In [None]:
# !pip install --upgrade tpot

In [None]:
# from tpot import TPOTClassifier
# from sklearn.pipeline import make_pipeline
# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT sparse')

# # Fit TPOTClassifier to your data
# tpot.fit(X_train_final, y_train)

# # Evaluate the best pipeline found by TPOT
# print(tpot.score(X_test_final, y_test))

# # Export the best pipeline as a Python script
# # tpot.export('tpot_pipeline.py')

In [None]:
# df.tail()

In [None]:
from fastapi import FastAPI
import pandas as pd
import pickle

app = FastAPI()

# Define a root `/` endpoint
@app.get('/')
def index():
    return {'ok': True}

@app.get("/predict")
def predict(
        Location: object,
        Age: int,
        Gender: object,
        Product_Name: object,
        Selling_Price: float,
        Interaction_type: object,
        Product_Description: object,
    ):
    """
    Make a single course prediction.
    Assumes `pickup_datetime` is provided as a string by the user in "%Y-%m-%d %H:%M:%S" format
    Assumes `pickup_datetime` implicitly refers to the "US/Eastern" timezone (as any user in New York City would naturally write)
    """
    data_dict = {
        'Location': [Location],
        'Age': [Age],
        'Gender': [Gender],
        'Product_Name': [Product_Name],
        'Selling_Price': [Selling_Price],
        'Interaction_type': [Interaction_type],
    }

    X_pred = pd.DataFrame(data_dict, index=[0])

    with open('models/preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)

    preprocessed_data = pd.DataFrame(preprocessor.transform(X_pred))

    with open('models/harp_model.pkl', 'rb') as f:
        model = pickle.load(f)

    # Make prediction using the loaded model
    prediction = model.predict(preprocessed_data)

    # Return the prediction result
    return {'result': prediction[0]}
