In [1]:
pip install pandas numpy scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
     ---------------------------------------- 0.0/154.4 kB ? eta -:--:--
     -- ------------------------------------- 10.2/154.4 kB ? eta -:--:--
     -- ------------------------------------- 10.2/154.4 kB ? eta -:--:--
     ------- ----------------------------- 30.7/154.4 kB 325.1 kB/s eta 0:00:01
     --------- --------------------------- 41.0/154.4 kB 217.9 kB/s eta 0:00:01
     -------------- ---------------------- 61.4/154.4 kB 297.7 kB/s eta 0:00:01
     ------------------- ----------------- 81.9/154.4 kB 327.3 kB/s eta 0:00:01
     -------------------------- --------- 112.6/154.4 kB 364.4 kB/s eta 0:00:01
     -----------------------------------  153.6/154.4 kB 437.1 kB/s eta 0:00:01
     ------------------------------------ 154.4/154.4 kB 439.6 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel:

In [2]:
import pandas as pd

# Load dataset
file_path = 'E:/AI-Integrated-E-Commerce-Platform/backend/ratings_Beauty.csv'  # Update with the path to your dataset
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())


           UserId   ProductId  Rating   Timestamp
0  A39HTATAQ9V7YF  0205616461     5.0  1369699200
1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200
2  A1Z513UWSAAO0F  0558925278     5.0  1404691200
3  A1WMRR494NWEWV  0733001998     4.0  1382572800
4  A3IAAVS479H7M7  0737104473     1.0  1274227200


In [4]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# Rename columns for consistency
df = df.rename(columns={
    'user_id_column_name': 'UserId',     
    'product_id_column_name': 'ProductId',  
    'rating_column_name': 'Rating'      
})

# Ensure data types are correct
df['UserId'] = df['UserId'].astype(str)
df['ProductId'] = df['ProductId'].astype(str)
df['Rating'] = df['Rating'].astype(float)

# Normalize ratings (optional, if necessary)
# df['rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

print(df.head())


UserId       0
ProductId    0
Rating       0
Timestamp    0
dtype: int64
           UserId   ProductId  Rating   Timestamp
0  A39HTATAQ9V7YF  0205616461     5.0  1369699200
1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200
2  A1Z513UWSAAO0F  0558925278     5.0  1404691200
3  A1WMRR494NWEWV  0733001998     4.0  1382572800
4  A3IAAVS479H7M7  0737104473     1.0  1274227200


In [6]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Prepare the dataset for Surprise
reader = Reader(rating_scale=(df['Rating'].min(), df['Rating'].max()))
data = Dataset.load_from_df(df[['UserId', 'ProductId', 'Rating']], reader)

# Split data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train the SVD model
model = SVD()
model.fit(trainset)

# Evaluate the model
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")


RMSE: 1.2472
RMSE: 1.2472468761255235
