In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load the dataset
data = pd.read_csv("C:/Users/Admin/Downloads/user_personalized_features.csv")

In [2]:
data.head()

Unnamed: 0.1,Unnamed: 0,User_ID,Age,Gender,Location,Income,Interests,Last_Login_Days_Ago,Purchase_Frequency,Average_Order_Value,Total_Spending,Product_Category_Preference,Time_Spent_on_Site_Minutes,Pages_Viewed,Newsletter_Subscription
0,0,#1,56,Male,Suburban,38037,Sports,5,7,18,2546,Books,584,38,True
1,1,#2,46,Female,Rural,103986,Technology,15,7,118,320,Electronics,432,40,False
2,2,#3,32,Female,Suburban,101942,Sports,28,1,146,3766,Apparel,306,1,True
3,3,#4,60,Female,Suburban,71612,Fashion,18,3,163,4377,Apparel,527,29,False
4,4,#5,25,Male,Suburban,49725,Travel,2,5,141,4502,Health & Beauty,53,10,True


In [3]:
data.shape

(1000, 15)

In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Unnamed: 0                   1000 non-null   int64 
 1   User_ID                      1000 non-null   object
 2   Age                          1000 non-null   int64 
 3   Gender                       1000 non-null   object
 4   Location                     1000 non-null   object
 5   Income                       1000 non-null   int64 
 6   Interests                    1000 non-null   object
 7   Last_Login_Days_Ago          1000 non-null   int64 
 8   Purchase_Frequency           1000 non-null   int64 
 9   Average_Order_Value          1000 non-null   int64 
 10  Total_Spending               1000 non-null   int64 
 11  Product_Category_Preference  1000 non-null   object
 12  Time_Spent_on_Site_Minutes   1000 non-null   int64 
 13  Pages_Viewed                 1000 

In [5]:
#Handle categorical features
label_encoder = LabelEncoder()

In [6]:
# Encode Gender, Location, Interests, Product_Category_Preference
for col in ['Gender', 'Location', 'Interests', 'Product_Category_Preference']:
    data[col] = label_encoder.fit_transform(data[col])

In [7]:
print(data[col])

0      1
1      2
2      0
3      0
4      3
      ..
995    0
996    0
997    1
998    2
999    3
Name: Product_Category_Preference, Length: 1000, dtype: int32


In [8]:
# Normalize numeric columns
scaler = MinMaxScaler()

In [9]:
numeric_cols = ['Age', 'Income', 'Last_Login_Days_Ago', 'Purchase_Frequency', 
                'Average_Order_Value', 'Total_Spending', 'Time_Spent_on_Site_Minutes', 'Pages_Viewed']
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

In [10]:
print(data[numeric_cols])

          Age    Income  Last_Login_Days_Ago  Purchase_Frequency  \
0    0.826087  0.137770             0.142857            0.777778   
1    0.608696  0.645867             0.500000            0.777778   
2    0.304348  0.630120             0.964286            0.111111   
3    0.913043  0.396445             0.607143            0.333333   
4    0.152174  0.227819             0.035714            0.555556   
..        ...       ...                  ...                 ...   
995  0.086957  0.647223             0.607143            0.777778   
996  0.478261  0.607476             0.535714            0.333333   
997  0.195652  0.402478             0.250000            0.888889   
998  0.934783  0.305117             0.000000            0.333333   
999  0.021739  0.417247             0.392857            0.777778   

     Average_Order_Value  Total_Spending  Time_Spent_on_Site_Minutes  \
0               0.042328        0.498056                    0.974874   
1               0.571429        0.04256

In [11]:
print(data.head())

   Unnamed: 0 User_ID       Age  Gender  Location    Income  Interests  \
0           0      #1  0.826087       1         1  0.137770          2   
1           1      #2  0.608696       0         0  0.645867          3   
2           2      #3  0.304348       0         1  0.630120          2   
3           3      #4  0.913043       0         1  0.396445          0   
4           4      #5  0.152174       1         1  0.227819          4   

   Last_Login_Days_Ago  Purchase_Frequency  Average_Order_Value  \
0             0.142857            0.777778             0.042328   
1             0.500000            0.777778             0.571429   
2             0.964286            0.111111             0.719577   
3             0.607143            0.333333             0.809524   
4             0.035714            0.555556             0.693122   

   Total_Spending  Product_Category_Preference  Time_Spent_on_Site_Minutes  \
0        0.498056                            1                    0.974874

Content-Based Filtering
We'll recommend products based on user preferences and interests

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [13]:
# Use features for content-based filtering
features = ['Interests', 'Product_Category_Preference', 'Age', 'Gender', 'Income', 
            'Time_Spent_on_Site_Minutes', 'Pages_Viewed']
user_profiles = data[features]

In [14]:
# Compute cosine similarity between users
similarity_matrix = cosine_similarity(user_profiles)

In [15]:
# Function to recommend similar products
def recommend_content_based(user_id, top_n=5):
    user_index = data[data['User_ID'] == user_id].index[0]
    similarity_scores = list(enumerate(similarity_matrix[user_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get top similar users (excluding self)
    top_users = [data.iloc[x[0]] for x in similarity_scores[1:top_n+1]]
    return pd.DataFrame(top_users)[['User_ID', 'Product_Category_Preference']]

# Example: Recommend for User #1
recommendations = recommend_content_based("#1", top_n=5)
print("Recommendations for User #1:")
print(recommendations)

Recommendations for User #1:
    User_ID  Product_Category_Preference
702    #703                            1
362    #363                            1
781    #782                            1
213    #214                            1
154    #155                            1


Collaborative Filtering
We'll use the purchase frequency and preferences to recommend items.

In [17]:
from sklearn.neighbors import NearestNeighbors

In [18]:
# Create a user-item interaction matrix
interaction_matrix = data.pivot(index='User_ID', columns='Product_Category_Preference', values='Purchase_Frequency').fillna(0)

In [19]:
# Build a KNN model for collaborative filtering
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(interaction_matrix)

In [20]:
# Recommend items for a user
def recommend_collaborative(user_id, top_n=5):
    user_index = interaction_matrix.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(interaction_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=top_n+1)
    
    # Get recommended items
    recommended_users = [interaction_matrix.index[i] for i in indices.flatten()[1:]]
    return recommended_users

# Example: Recommend for User #1
collab_recommendations = recommend_collaborative("#1", top_n=5)
print("Collaborative Recommendations for User #1:")
print(collab_recommendations)

Collaborative Recommendations for User #1:
['#864', '#870', '#865', '#872', '#489']


In [21]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Example: Evaluate using RMSE
true_values = [4, 3, 5]  # Replace with actual ratings or interactions
predicted_values = [3.8, 3.1, 4.9]  # Replace with predicted values
rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
print("RMSE:", rmse)

RMSE: 0.14142135623730953
