## ECommerce Recommendation System

### Aju Thomas: 48329426
### Devarsh Rajesh Bende : 80060804
### Dhruv Kumar Boothu : 24121668
### Shreyas Subramanya : 41103539

In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import random

# 1. Data Loading (Using pyarrow)
df = pd.read_parquet('/Users/Bobby/Documents/ML Project/test.parquet', engine='pyarrow')  

#2. Display basic information about the dataset
print(df.info())

#Displaying the first 5 rows of the DataFrame:

print("First 5 rows:")
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2781480 entries, 0 to 2781479
Data columns (total 19 columns):
 #   Column        Dtype              
---  ------        -----              
 0   event_time    object             
 1   event_type    object             
 2   product_id    object             
 3   brand         object             
 4   price         object             
 5   user_id       object             
 6   user_session  object             
 7   target        int64              
 8   cat_0         object             
 9   cat_1         object             
 10  cat_2         object             
 11  cat_3         object             
 12  timestamp     datetime64[us, UTC]
 13  ts_hour       int16              
 14  ts_minute     int16              
 15  ts_weekday    int16              
 16  ts_day        int16              
 17  ts_month      int16              
 18  ts_year       int16              
dtypes: datetime64[us, UTC](1), int16(6), int64(1), object(11)
memor

In [2]:
# 2. Sampling
sample_df = df.sample(n=10000, random_state=42)
sample_df.reset_index(drop=True, inplace=True)

#3. Display basic information about the dataset
print(sample_df.info())
#Displaying the first 5 rows of the sampled DataFrame:
print("First 5 rows:")
print(sample_df.head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   event_time    10000 non-null  object             
 1   event_type    10000 non-null  object             
 2   product_id    10000 non-null  object             
 3   brand         10000 non-null  object             
 4   price         10000 non-null  object             
 5   user_id       10000 non-null  object             
 6   user_session  10000 non-null  object             
 7   target        10000 non-null  int64              
 8   cat_0         10000 non-null  object             
 9   cat_1         10000 non-null  object             
 10  cat_2         10000 non-null  object             
 11  cat_3         10000 non-null  object             
 12  timestamp     10000 non-null  datetime64[us, UTC]
 13  ts_hour       10000 non-null  int16              
 14  ts_minu

In [3]:
# 4. Data Preprocessing
sample_df['brand'] = sample_df['brand'].astype(str).str.lower()
sample_df['cat_0'] = sample_df['cat_0'].astype(str).str.lower()
sample_df['cat_1'] = sample_df['cat_1'].astype(str).str.lower()
sample_df['cat_2'] = sample_df['cat_2'].astype(str).str.lower()

In [4]:
# Rule-based recommendation engine
# 5. Feature Encoding
# Create separate binarizers for each column
brand_mlb = MultiLabelBinarizer()
cat0_mlb = MultiLabelBinarizer()
cat1_mlb = MultiLabelBinarizer()
cat2_mlb = MultiLabelBinarizer()

# Fit and transform each column separately 
encoded_brand = brand_mlb.fit_transform(sample_df['brand'].values.reshape(-1, 1))
encoded_cat0 = cat0_mlb.fit_transform(sample_df['cat_0'].values.reshape(-1, 1))
encoded_cat1 = cat1_mlb.fit_transform(sample_df['cat_1'].values.reshape(-1, 1))
encoded_cat2 = cat2_mlb.fit_transform(sample_df['cat_2'].values.reshape(-1, 1))

# Create separate DataFrames for each encoded column using the correct binarizer
encoded_brand_df = pd.DataFrame(encoded_brand, columns=[f"brand_{c}" for c in brand_mlb.classes_], index=sample_df.index)
encoded_cat0_df = pd.DataFrame(encoded_cat0, columns=[f"cat_0_{c}" for c in cat0_mlb.classes_], index=sample_df.index)
encoded_cat1_df = pd.DataFrame(encoded_cat1, columns=[f"cat_1_{c}" for c in cat1_mlb.classes_], index=sample_df.index)
encoded_cat2_df = pd.DataFrame(encoded_cat2, columns=[f"cat_2_{c}" for c in cat2_mlb.classes_], index=sample_df.index)

# Combine all encoded features
encoded_df = pd.concat([encoded_brand_df, encoded_cat0_df, encoded_cat1_df, encoded_cat2_df], axis=1)

# Combine with original dataframe 
sample_df = sample_df.join(encoded_df)

# 6. Recommendation Function
def recommend_items(keyword, df=sample_df, num_recs=5):
    relevant_cols = ['brand', 'cat_0', 'cat_1', 'cat_2'] + list(encoded_df.columns)

    # Filter relevant rows based on keyword
    filtered_df = df[df[relevant_cols].apply(lambda row: keyword in row.values, axis=1)]

    recommendations = []
    for brand in filtered_df['brand'].unique():
        for cat1 in filtered_df[filtered_df['brand'] == brand]['cat_1'].unique():
            for cat2 in filtered_df[(filtered_df['brand'] == brand) & (filtered_df['cat_1'] == cat1)]['cat_2'].unique():
                group = filtered_df[(filtered_df['brand'] == brand) & 
                                   (filtered_df['cat_1'] == cat1) & 
                                   (filtered_df['cat_2'] == cat2)]

                # Sample and get relevant columns as dictionaries
                recs = group.sample(min(num_recs, len(group)))[['product_id', 'brand', 'price', 'cat_0', 'cat_1', 'cat_2']].to_dict(orient='records')
                recommendations.append(recs)

    return recommendations[:3]  # Get top 3 recommendations
# 7. User Interaction
keyword = input("Enter your search keyword: ")
recommendations = recommend_items(keyword.lower())

if recommendations:
    for i, rec_list in enumerate(recommendations):
        print(f"\nRecommendation {i+1}:")
        for item in rec_list:
            print(f"  Product ID: {item['product_id']}")
            print(f"  Brand: {item['brand']}")
            print(f"  Price: {item['price']}")
            print(f"  Category 0: {item['cat_0']}")
            print(f"  Category 1: {item['cat_1']}")
            print(f"  Category 2: {item['cat_2']}")
else:
    print("No recommendations found for this keyword.")


Enter your search keyword:  toster



Recommendation 1:
  Product ID: 12202118
  Brand: novatrack
  Price: 100.13
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster
  Product ID: 12202118
  Brand: novatrack
  Price: 100.13
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster

Recommendation 2:
  Product ID: 100197355
  Brand: forward
  Price: 157.02
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster
  Product ID: 100056116
  Brand: forward
  Price: 105.54
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster
  Product ID: 100197355
  Brand: forward
  Price: 157.02
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster
  Product ID: 100056590
  Brand: forward
  Price: 267.7
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster

Recommendation 3:
  Product ID: 12202336
  Brand: na
  Price: 77.22
  Category 0: appliances
  Category 1: kitchen
  Category 2: toster
  Product ID: 100027737
  Brand: na
  Price: 154.44
  Category 0: applianc

In [5]:
# Content-Based Filtering Recommendatoin

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Feature Selection and Encoding (Modify as needed for your specific features)
relevant_cols = ['brand', 'cat_0', 'cat_1', 'cat_2']

# Combine all features into one string per product for TF-IDF
sample_df['content'] = sample_df[relevant_cols].apply(lambda x: ' '.join(x), axis=1)

# 2. TF-IDF Vectorization
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(sample_df['content'])

# 3. Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


# 4. Recommendation Function
def recommend_items(keyword, df=sample_df, cosine_sim_matrix=cosine_sim, num_recs=5):
    relevant_cols = ['brand', 'cat_0', 'cat_1', 'cat_2']
    filtered_df = df[df[relevant_cols].apply(lambda row: keyword in row.values, axis=1)]

    recommendations = []
    for idx in filtered_df.index:
        # Get the indices of the most similar items
        sim_scores = list(enumerate(cosine_sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:num_recs+1]  # Exclude the item itself

        # Get the product IDs of the most similar items
        item_indices = [i[0] for i in sim_scores]
        recs = df.iloc[item_indices][['product_id', 'brand', 'price', 'cat_0', 'cat_1', 'cat_2']].to_dict(orient='records')
        recommendations.append(recs)

    # Flatten and de-duplicate recommendations
    flattened_recs = [item for sublist in recommendations for item in sublist]
    seen = set()
    unique_recs = []
    for item in flattened_recs:
        if item['product_id'] not in seen:
            seen.add(item['product_id'])
            unique_recs.append(item)

    return unique_recs[:5]  # Get top 5 unique recommendations

# 5. User Interaction (Example)
keyword = input("Enter your search keyword: ")
recommendations = recommend_items(keyword.lower())

if recommendations:
    for i, rec in enumerate(recommendations):
        print(f"\nRecommendation {i+1}:")
        print(f"  Product ID: {rec['product_id']}")
        print(f"  Brand: {rec['brand']}")
        print(f"  Price: {rec['price']}")
        print(f"  Category 0: {rec['cat_0']}")
        print(f"  Category 1: {rec['cat_1']}")
        print(f"  Category 2: {rec['cat_2']}")
else:
    print("No recommendations found for this keyword.")


Enter your search keyword:  headphone



Recommendation 1:
  Product ID: 100051366
  Brand: hp
  Price: 820.87
  Category 0: electronics
  Category 1: audio
  Category 2: headphone

Recommendation 2:
  Product ID: 100170577
  Brand: hp
  Price: 306.06
  Category 0: electronics
  Category 1: audio
  Category 2: headphone

Recommendation 3:
  Product ID: 100170834
  Brand: hp
  Price: 247.08
  Category 0: electronics
  Category 1: audio
  Category 2: headphone

Recommendation 4:
  Product ID: 100055405
  Brand: asus
  Price: 592.01
  Category 0: electronics
  Category 1: audio
  Category 2: headphone

Recommendation 5:
  Product ID: 100003338
  Brand: asus
  Price: 849.42
  Category 0: electronics
  Category 1: audio
  Category 2: headphone


In [10]:
# Count the occurrences of each user ID for further Recommendation
user_id_counts = sample_df['user_id'].value_counts()

# Get the top 20 user IDs and their counts
top_10_user_ids = user_id_counts.head(20)  # Use head(10) to get the top 10

# Display the top  user IDs and their counts
print("Top 20 User ID Occurrences:")
print(top_10_user_ids)



Top 20 User ID Occurrences:
user_id
592816742    5
616119853    3
561804169    3
640663684    3
640529380    3
635361405    2
576947743    2
612892292    2
543176993    2
635786036    2
512746568    2
615564020    2
525119354    2
596815932    2
567389498    2
525164158    2
575954873    2
637519148    2
601602231    2
536908726    2
Name: count, dtype: int64


In [7]:
#Collaborative Filtering
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# 1. Create User-Item Interaction Matrix
user_item_matrix = sample_df.pivot_table(index='user_id', columns='product_id', values='target', fill_value=0)

# Convert to sparse CSR matrix for efficiency
user_item_sparse = csr_matrix(user_item_matrix.values)

# 2. Matrix Factorization (SVD)
num_factors = 10  # Adjust the number of latent factors as needed
U, sigma, Vt = svds(user_item_sparse, k=num_factors)
sigma = np.diag(sigma)

# 3. Predictions
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

# 4. Recommendation Function (Modified)
def recommend_items(user_id, df=predicted_ratings, num_recs=5):
    user_ratings = df.loc[user_id].sort_values(ascending=False)
    
    # Exclude items the user has already interacted with and limit to top 5
    recommendations = user_ratings[user_ratings > 0].head(num_recs)

    # Retrieve product details from the original DataFrame
    product_details = sample_df[sample_df['product_id'].isin(recommendations.index)][['product_id', 'brand', 'price', 'cat_0', 'cat_1', 'cat_2']]
    return product_details.to_dict(orient='records')  # Return the top 5 recommendations

# 5. User Interaction (Example with Top 5 Limit)
# Get user input
user_id = input("Enter user ID: ") 

# Validate user ID and make recommendations
if user_id in user_item_matrix.index:
    recommendations = recommend_items(user_id)  
    
    # Print only top 5 recommendations
    num_recommendations_to_print = min(5, len(recommendations)) 
    
    if num_recommendations_to_print > 0:
        print(f"\nTop {num_recommendations_to_print} Recommendations for User {user_id}:")
        for i, rec in enumerate(recommendations[:num_recommendations_to_print]):
            print(f"\nRecommendation {i+1}:")
            print(f"  Product ID: {rec['product_id']}")
            print(f"  Brand: {rec['brand']}")
            print(f"  Price: {rec['price']}")
            print(f"  Category 0: {rec['cat_0']}")
            print(f"  Category 1: {rec['cat_1']}")
            print(f"  Category 2: {rec['cat_2']}")
    else:
        print(f"No recommendations found for user {user_id}.")
else:
    print(f"User ID {user_id} not found in the dataset.")
    


Enter user ID:  616119853



Top 5 Recommendations for User 616119853:

Recommendation 1:
  Product ID: 100068493
  Brand: samsung
  Price: 310.15
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 2:
  Product ID: 1005115
  Brand: apple
  Price: 925.88
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 3:
  Product ID: 1005212
  Brand: samsung
  Price: 168.32
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 4:
  Product ID: 1005212
  Brand: samsung
  Price: 167.8
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 5:
  Product ID: 1005212
  Brand: samsung
  Price: 169.35
  Category 0: construction
  Category 1: tools
  Category 2: light


In [8]:
# Hybrid Recommendation system

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Content-Based Filtering 
relevant_cols = ['brand', 'cat_0', 'cat_1', 'cat_2']
sample_df['content'] = sample_df[relevant_cols].apply(lambda x: ' '.join(x), axis=1)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(sample_df['content'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 2. Collaborative Filtering (Similar to previous response)
user_item_matrix = sample_df.pivot_table(index='user_id', columns='product_id', values='target', fill_value=0)
user_item_sparse = csr_matrix(user_item_matrix.values)

num_factors = 10
U, sigma, Vt = svds(user_item_sparse, k=num_factors)
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

# 3. Hybrid Recommendation Function
def recommend_items_hybrid(user_id, df=sample_df, num_recs=5, content_weight=0.5, collab_weight=0.5):
    user_interactions = sample_df[sample_df['user_id'] == user_id]['product_id']
    
    # Content-based recommendations
    content_recs = []
    for item_id in user_interactions:
        sim_scores = list(enumerate(cosine_sim[df[df['product_id'] == item_id].index[0]]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:num_recs+1]  # Exclude the item itself
        content_recs.extend([(df['product_id'][i], s) for i, s in sim_scores])
    
    # Collaborative filtering recommendations
    collab_recs = predicted_ratings.loc[user_id].sort_values(ascending=False)[:num_recs].index.tolist()

    # Combine and score recommendations
    all_recs = content_recs + [(rec, 1.0) for rec in collab_recs] # Assign collab score of 1.0
    scored_recs = {}
    for item, score in all_recs:
        scored_recs[item] = scored_recs.get(item, 0) + score * content_weight  # Weighted average
        if item in collab_recs:  
            scored_recs[item] += collab_weight  # Add extra weight for collaborative filtering

    # Sort by score and get product details
    top_recs = sorted(scored_recs.items(), key=lambda x: x[1], reverse=True)[:num_recs]
    product_details = sample_df[sample_df['product_id'].isin([rec[0] for rec in top_recs])][['product_id', 'brand', 'price', 'cat_0', 'cat_1', 'cat_2']].to_dict(orient='records')

    return product_details

# 4. User Interaction (Example with Top 5 Limit)
user_id = input("Enter user ID: ")
if user_id in user_item_matrix.index:
    recommendations = recommend_items(user_id)  
    
    # Print only top 5 recommendations
    num_recommendations_to_print = min(5, len(recommendations)) 
    
    if num_recommendations_to_print > 0:
        print(f"\nTop {num_recommendations_to_print} Recommendations for User {user_id}:")
        for i, rec in enumerate(recommendations[:num_recommendations_to_print]):
            print(f"\nRecommendation {i+1}:")
            print(f"  Product ID: {rec['product_id']}")
            print(f"  Brand: {rec['brand']}")
            print(f"  Price: {rec['price']}")
            print(f"  Category 0: {rec['cat_0']}")
            print(f"  Category 1: {rec['cat_1']}")
            print(f"  Category 2: {rec['cat_2']}")
    else:
        print(f"No recommendations found for user {user_id}.")
else:
    print(f"User ID {user_id} not found in the dataset.")


Enter user ID:  543176993



Top 5 Recommendations for User 543176993:

Recommendation 1:
  Product ID: 1005160
  Brand: xiaomi
  Price: 183.51
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 2:
  Product ID: 1002544
  Brand: apple
  Price: 407.07
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 3:
  Product ID: 100068493
  Brand: samsung
  Price: 310.15
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 4:
  Product ID: 1005115
  Brand: apple
  Price: 925.88
  Category 0: construction
  Category 1: tools
  Category 2: light

Recommendation 5:
  Product ID: 1004836
  Brand: samsung
  Price: 208.47
  Category 0: construction
  Category 1: tools
  Category 2: light
