In [None]:
!pip install lightfm
import pandas as pd
import numpy as np
import os
from scipy.sparse import coo_matrix, csr_matrix
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import files, drive



In [None]:
# Attach Google Drive to the Colab filesystem so files stored there can be read and written.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Read the review sample (one row per app review) from the mounted Drive folder.
reviews_csv_path = '/content/drive/MyDrive/DOCUMENTS_COLLEGE/Internships/Samsung_PRISM/Mobile_rec_dataset/testing_sample-1.csv'
test_sample_with_recommendations = pd.read_csv(reviews_csv_path)

In [None]:
# Quick sanity check of the loaded frame: dimensions, column names, then the first rows.
for overview in (
    test_sample_with_recommendations.shape,
    test_sample_with_recommendations.columns,
    test_sample_with_recommendations.head(),
):
    print(overview)

(15003, 10)
Index(['app_package', 'review', 'rating', 'votes', 'date', 'uid',
       'formated_date', 'unix_timestamp', 'app_category', 'sentiment_score'],
      dtype='object')
                       app_package  \
0            com.cleverapps.heroes   
1  com.outfit7.mytalkingangelafree   
2      com.playhardlab.herofactory   
3          com.jumpgames.RealSteel   
4            com.szckhd.jwgly.azyw   

                                              review  rating  votes  \
0                             It's really a fun game       5      1   
1  I really love your updated version. it's. bett...       5    718   
2           Very good but the fary is a bit annoying       4      0   
3                 Bad game ever I want my money back       1      0   
4                                  Problem resolved.       4      2   

               date               uid formated_date  unix_timestamp  \
0  October 21, 2018  shqoc6X1fcJRLEmx    21-10-2018      1540094400   
1    August 3, 2019  sml

In [None]:
# Step 2: Preprocess the dataset
# Encode every distinct user and app as an integer code so they can index a matrix.
user_codes = test_sample_with_recommendations['uid'].astype('category').cat.codes
item_codes = test_sample_with_recommendations['app_package'].astype('category').cat.codes
test_sample_with_recommendations['user_id'] = user_codes
test_sample_with_recommendations['item_id'] = item_codes

# The interaction weight is the review's sentiment score with negative values raised to 0.
sentiment = test_sample_with_recommendations['sentiment_score']
test_sample_with_recommendations['interaction'] = sentiment.clip(lower=0)

# Sparse user-item matrix: rows are user codes, columns are item codes,
# values are the interaction weights.
matrix_values = test_sample_with_recommendations['interaction']
matrix_rows = test_sample_with_recommendations['user_id']
matrix_cols = test_sample_with_recommendations['item_id']
interaction_matrix = coo_matrix((matrix_values, (matrix_rows, matrix_cols)))

In [None]:
# Step 3: Train LightFM model
# Logistic-loss LightFM model with a fixed random seed.
model = LightFM(random_state=42, loss='logistic')

# Fit on the user-item interaction matrix. `fit` is left as the cell's last
# expression so the notebook displays the fitted model object.
fit_options = {'epochs': 10, 'num_threads': 4}
model.fit(interaction_matrix, **fit_options)

<lightfm.lightfm.LightFM at 0x7b9e05d159c0>

In [None]:
# Step 4: Generate recommendations
# Lookup tables from integer category codes back to the original identifiers.
app_categories = test_sample_with_recommendations['app_package'].astype('category').cat.categories
uid_categories = test_sample_with_recommendations['uid'].astype('category').cat.categories
item_mapping = {code: app for code, app in enumerate(app_categories)}
user_mapping = {code: uid for code, uid in enumerate(uid_categories)}

# Top-K recommendation helper
def get_recommendations(user_id, k=5):
    """Return the names of the k highest-scoring apps for one user.

    Scores every item column of `interaction_matrix` for `user_id` with the
    notebook-level `model`, then translates the best item codes into app
    package names through `item_mapping`.

    Parameters
    ----------
    user_id : int
        Integer user code (a row index of `interaction_matrix`).
    k : int, default 5
        Number of apps to return.

    Returns
    -------
    list
        App package names ordered from highest to lowest predicted score.
    """
    n_items = interaction_matrix.shape[1]
    candidate_items = np.arange(n_items)
    # predict() is given one user id per candidate item, so repeat the id n_items times.
    repeated_user = np.full(n_items, user_id)
    scores = model.predict(user_ids=repeated_user, item_ids=candidate_items)

    # Negating the scores makes argsort list the highest score first.
    ranked_items = np.argsort(-scores)
    best_items = ranked_items[:k]

    app_names = []
    for item in best_items:
        app_names.append(item_mapping[item])
    return app_names

# Build the top-5 list once per distinct user code.
unique_users = test_sample_with_recommendations['user_id'].unique()
recommendations = {}
for user in unique_users:
    recommendations[user] = get_recommendations(user, k=5)

# Attach each user's list to every one of that user's review rows.
per_row_recommendations = test_sample_with_recommendations['user_id'].map(recommendations)
test_sample_with_recommendations['recommendations'] = per_row_recommendations

# Persist the enriched dataset next to the input file on Drive.
output_path = '/content/drive/MyDrive/DOCUMENTS_COLLEGE/Internships/Samsung_PRISM/Mobile_rec_dataset/testing_sample_with_recommendations.csv'
test_sample_with_recommendations.to_csv(output_path, index=False)

# Show a few rows as a visual check.
preview_columns = ['uid', 'app_package', 'recommendations']
print(test_sample_with_recommendations[preview_columns].head())


                uid                      app_package  \
0  shqoc6X1fcJRLEmx            com.cleverapps.heroes   
1  smlNgCSD1z66dtpP  com.outfit7.mytalkingangelafree   
2  smnwlsh9CjgHk8Ul      com.playhardlab.herofactory   
3  snCCzP0FvzSb0p8A          com.jumpgames.RealSteel   
4  snCCzP0FvzSb0p8A            com.szckhd.jwgly.azyw   

                                     recommendations  
0  [com.bigcool.puzzle.solitairegenies, com.linkd...  
1  [com.bigcool.puzzle.solitairegenies, com.linkd...  
2  [com.bigcool.puzzle.solitairegenies, com.linkd...  
3  [com.bigcool.puzzle.solitairegenies, com.linkd...  
4  [com.bigcool.puzzle.solitairegenies, com.linkd...  


In [None]:
# Step 5: Evaluation Metrics
# Define evaluation functions for Hit@K and NDCG@K
def hit_at_k(test_data, recommendations, k=5):
    """Fraction of users with at least one ground-truth item in their top-k list.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the columns 'user_id', 'item_id' and 'app_package'.
        A user's ground truth is the set of 'item_id' values in that user's rows.
    recommendations : mapping
        Maps each user_id to an ordered sequence of app package names.
    k : int, default 5
        Only the first k recommended apps per user are considered.

    Returns
    -------
    float
        hits / number of distinct users, or 0.0 when `test_data` has no users.

    Raises
    ------
    KeyError
        If a user present in `test_data` has no entry in `recommendations`.
    """
    total_users = test_data['user_id'].nunique()
    if total_users == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # Build the package -> item_id lookup once, from the frame that was passed in
    # (not from a notebook global). Keeping the first occurrence per package
    # mirrors a `.values[0]` lookup on the filtered frame.
    app_to_item = (
        test_data.drop_duplicates('app_package')
        .set_index('app_package')['item_id']
        .to_dict()
    )

    hits = 0
    for user_id, group in test_data.groupby('user_id'):
        relevant_items = set(group['item_id'])
        # Apps that do not occur in test_data cannot be hits and are skipped.
        top_k_items = [
            app_to_item[app]
            for app in recommendations[user_id][:k]
            if app in app_to_item
        ]
        if any(item in relevant_items for item in top_k_items):
            hits += 1

    return hits / total_users

def ndcg_at_k(test_data, recommendations, k=5):
    """Mean NDCG@k over users, with binary relevance.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the columns 'user_id', 'item_id' and 'app_package'.
        A user's relevant items are the distinct 'item_id' values in that
        user's rows.
    recommendations : mapping
        Maps each user_id to an ordered sequence of app package names.
    k : int, default 5
        Only the first k recommended apps per user are considered.

    Returns
    -------
    float
        Sum of per-user NDCG divided by the number of distinct users,
        or 0.0 when `test_data` has no users.

    Raises
    ------
    KeyError
        If a user present in `test_data` has no entry in `recommendations`.
    """
    total_users = test_data['user_id'].nunique()
    if total_users == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # Build the package -> item_id lookup once, from the frame that was passed in
    # (not from a notebook global). First occurrence per package is kept.
    app_to_item = (
        test_data.drop_duplicates('app_package')
        .set_index('app_package')['item_id']
        .to_dict()
    )

    ndcg = 0.0
    for user_id, group in test_data.groupby('user_id'):
        # Distinct relevant items: repeated reviews of one app must not inflate IDCG.
        relevant_items = set(group['item_id'])

        # DCG: enumerate the recommendation list itself so that an app unknown to
        # test_data still occupies its rank instead of shifting later apps upward.
        dcg = 0.0
        for rank, app in enumerate(recommendations[user_id][:k]):
            if app_to_item.get(app) in relevant_items:
                dcg += 1 / np.log2(rank + 2)

        # IDCG: best achievable DCG, i.e. all relevant items packed at the top.
        ideal_hits = min(k, len(relevant_items))
        idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))

        if idcg > 0:
            ndcg += dcg / idcg

    return ndcg / total_users

# Evaluate the model
# `recommendations` was built with k=5, so slicing those lists with k=10 would
# score the same 5 apps again and the @10 metrics would not measure a top-10 list.
# Build 10-app lists for evaluation; the k=5 metrics then use the first five
# entries of each list.
eval_recommendations = {user: get_recommendations(user, k=10) for user in unique_users}

hit5 = hit_at_k(test_sample_with_recommendations, eval_recommendations, k=5)
hit10 = hit_at_k(test_sample_with_recommendations, eval_recommendations, k=10)
ndcg5 = ndcg_at_k(test_sample_with_recommendations, eval_recommendations, k=5)
ndcg10 = ndcg_at_k(test_sample_with_recommendations, eval_recommendations, k=10)

In [None]:
# Report the four evaluation metrics, one per line.
metric_lines = [
    f"Hit@5: {hit5}",
    f"Hit@10: {hit10}",
    f"NDCG@5: {ndcg5}",
    f"NDCG@10: {ndcg10}",
]
print("\n".join(metric_lines))

Hit@5: 0.032701660238135165
Hit@10: 0.032701660238135165
NDCG@5: 0.011237904070615092
NDCG@10: 0.011176401928033725
