In [None]:
pip install LightFM

Collecting LightFM
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m204.8/316.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: LightFM
  Building wheel for LightFM (setup.py) ... [?25l[?25hdone
  Created wheel for LightFM: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808329 sha256=e4d26497026c5d19b9fef6066d0b6a81227e8d1ea052a89a1d9f843000eeff60
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built LightFM
Installing collected packages: LightFM
Successfully installed LightFM-1.17


In [None]:
# Colab-only helpers; `files` is imported here but is not used in the visible cells.
from google.colab import files, drive

# Mount Google Drive so the dataset paths under /content/drive/MyDrive resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import coo_matrix, csr_matrix
from lightfm import LightFM
from sklearn.feature_extraction.text import TfidfVectorizer

def _build_app_feature_table(appmeta_file):
    """Load the app metadata CSV and return one numeric feature row per app.

    Returns:
        pandas.DataFrame: float-valued frame indexed by ``app_package``. Columns
        are the one-hot encoded ``app_category`` / ``content_rating`` /
        ``price_category`` followed by the TF-IDF weights of ``description``.
    """
    appmeta_data = pd.read_csv(appmeta_file)

    # Keep exactly one metadata row per app so that looking features up by
    # app_package is unambiguous (duplicate rows would otherwise be ambiguous).
    appmeta_data = (
        appmeta_data.dropna(subset=['app_package'])
        .drop_duplicates(subset='app_package')
        .reset_index(drop=True)
    )

    # Preprocess app metadata
    appmeta_data['price_category'] = appmeta_data['price'].apply(
        lambda x: 'Free' if x == 'Install' else 'Paid'
    )
    # NOTE: num_reviews and avg_rating are cleaned here but are not included in
    # the feature table built below.
    appmeta_data['num_reviews'] = appmeta_data['num_reviews'].replace(',', '', regex=True).astype(float)
    appmeta_data['avg_rating'] = appmeta_data['avg_rating'].fillna(appmeta_data['avg_rating'].mean())

    # One-hot encode categorical features. dtype=float keeps the columns
    # numeric; the default dtype is bool on recent pandas releases.
    categorical_features = pd.get_dummies(
        appmeta_data[['app_category', 'content_rating', 'price_category']].fillna('Unknown'),
        columns=['app_category', 'content_rating', 'price_category'],
        dtype=float,
    )

    # Textual embedding for descriptions
    vectorizer = TfidfVectorizer(max_features=500)
    description_embeddings = vectorizer.fit_transform(appmeta_data['description'].fillna(''))
    text_features = pd.DataFrame(description_embeddings.toarray(), index=appmeta_data.index)

    feature_table = pd.concat([categorical_features, text_features], axis=1)
    feature_table.index = pd.Index(appmeta_data['app_package'], name='app_package')
    return feature_table


def _top_k_items(model, user_id, item_features, item_labels, k):
    """Return the labels of the ``k`` highest-scoring items for one user code."""
    n_items = len(item_labels)
    scores = model.predict(
        user_ids=np.full(n_items, user_id, dtype=np.int32),  # same user for every item
        item_ids=np.arange(n_items, dtype=np.int32),
        item_features=item_features,
    )
    top_items = np.argsort(-scores)[:k]
    return [item_labels[item] for item in top_items]


def train_and_recommend_batchwise_with_metadata(input_file, appmeta_file, output_file, rows_per_batch, k=5, num_epochs=10):
    """
    Process the input dataset in batches, train LightFM, and append recommendations.
    Incorporates app metadata into the recommendation pipeline.

    Each batch is handled independently: a fresh model is trained on the batch's
    interactions, the top-``k`` apps are predicted for every user in the batch,
    and the batch (with a ``recommendations`` column) is appended to
    ``output_file``. Users whose ``uid`` is already present in ``output_file``
    (from a previous run or an earlier batch) are skipped, so the function can
    be re-run to resume.

    Args:
        input_file (str): Path to the input CSV file (interaction data). Must
            provide ``uid``, ``app_package``, ``rating`` and ``sentiment_score``.
        appmeta_file (str): Path to the app metadata CSV file.
        output_file (str): Path to the output CSV file with recommendations.
        rows_per_batch (int): Number of rows to process per batch.
        k (int): Number of recommendations to generate for each user.
        num_epochs (int): Number of epochs for LightFM training.

    Returns:
        None. Results are written to ``output_file``.
    """
    # Per-app feature rows, looked up by app_package for every batch.
    app_features = _build_app_feature_table(appmeta_file)

    # Load the already-processed user IDs once; the set is kept up to date in
    # memory below instead of re-reading the whole output file for every batch.
    if os.path.exists(output_file):
        processed_uids = set(pd.read_csv(output_file, usecols=['uid'])['uid'])
    else:
        processed_uids = set()

    # Load the dataset in chunks
    for chunk in pd.read_csv(input_file, chunksize=rows_per_batch):
        chunk = chunk[~chunk['uid'].isin(processed_uids)]
        if chunk.empty:
            print("All rows in this batch are already processed. Moving to the next batch.")
            continue

        # Rows without a user or an app cannot be placed in the interaction
        # matrix (their category code would be -1). Copy so that the column
        # assignments below act on an independent frame, not on a slice.
        chunk = chunk.dropna(subset=['uid', 'app_package']).copy()

        # Map users and items to IDs. The categoricals are built once so the
        # integer codes and the code -> label lookup always agree.
        user_categories = chunk['uid'].astype('category')
        item_categories = chunk['app_package'].astype('category')
        chunk['user_id'] = user_categories.cat.codes.astype(int)
        chunk['item_id'] = item_categories.cat.codes.astype(int)
        item_labels = list(item_categories.cat.categories)
        n_users = len(user_categories.cat.categories)
        n_items = len(item_labels)

        # Create interaction weights. Inputs are coerced to numbers before the
        # arithmetic so malformed cells become weight 0 instead of raising.
        rating = pd.to_numeric(chunk['rating'], errors='coerce')
        sentiment = pd.to_numeric(chunk['sentiment_score'], errors='coerce')
        chunk['interaction_weight'] = (rating * (1 + sentiment)).fillna(0).clip(lower=0)

        # Check that the batch has at least one positive interaction. (Counting
        # stored matrix entries would also count explicit zeros.)
        if not (chunk['interaction_weight'] > 0).any():
            print("No valid interactions in this batch. Skipping...")
            continue

        # Build interaction matrix
        interaction_matrix = coo_matrix(
            (
                chunk['interaction_weight'].to_numpy(dtype=np.float32),
                (chunk['user_id'].to_numpy(), chunk['item_id'].to_numpy()),
            ),
            shape=(n_users, n_items),
        )

        # Build the item features matrix: exactly one row per item, in item_id
        # order, containing only app metadata. Apps missing from the metadata
        # get an all-zero row.
        item_features_matrix = csr_matrix(
            app_features.reindex(item_labels).fillna(0.0).to_numpy(dtype=np.float32)
        )

        # Train LightFM
        model = LightFM(loss='warp', no_components=20, random_state=42)
        model.fit(interaction_matrix, item_features=item_features_matrix, epochs=num_epochs, num_threads=4)

        # Generate recommendations
        recommendations = {
            user: _top_k_items(model, user, item_features_matrix, item_labels, k)
            for user in chunk['user_id'].unique()
        }

        # Add recommendations to the chunk
        chunk['recommendations'] = chunk['user_id'].map(recommendations)

        # Append results; the header is written only when the file is created.
        write_header = not os.path.exists(output_file)
        chunk.to_csv(output_file, mode='a', index=False, header=write_header)
        processed_uids.update(chunk['uid'])

        print(f"Processed and saved {len(chunk)} rows to {output_file}.")

# Pipeline configuration and entry-point call.
# All dataset files live in one Drive folder; change DATA_DIR to relocate them.
DATA_DIR = '/content/drive/MyDrive/DOCUMENTS_COLLEGE/Internships/Samsung_PRISM/Mobile_rec_dataset'
input_data = os.path.join(DATA_DIR, 'game_data_sample_with_sentiment.csv')
appmeta_data = os.path.join(DATA_DIR, 'app_meta.csv')
output_data = os.path.join(DATA_DIR, 'game_data_with_recommendations.csv')
rows_per_batch = 1000

train_and_recommend_batchwise_with_metadata(input_data, appmeta_data, output_data, rows_per_batch, k=5, num_epochs=10)


All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next batch.
All rows in this batch are already processed. Moving to the next