# Step 1. Setup and Path Definitions
- This initial block handles imports, mounts Google Drive, and defines every file and directory path in one place. Centralizing the paths keeps the notebook easy to manage: to run it against a different location, only `BASE_DIR` needs to change.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pickle
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

# --- Mount Google Drive ---
# Necessary for running in Google Colab to access stored files.
# This import only resolves inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# --- Define Absolute Paths ---
# BASE_DIR is an absolute path under the mounted Drive, so the notebook does not
# depend on the current working directory. Change this one value to relocate the project.
BASE_DIR = '/content/drive/MyDrive/Amazon_Recommender'

# Define paths to key directories (all derived from BASE_DIR).
DATA_DIR = os.path.join(BASE_DIR, 'data')
MODELS_DIR = os.path.join(BASE_DIR, 'models')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')
OUTPUTS_DIR = os.path.join(BASE_DIR, 'outputs')

# Define specific file paths for loading data, models, and saving results.
# Inputs: candidate feature table, pickled XGBoost model, NCF weights and ID mappings.
XGB_FEATURES_FILE = os.path.join(PROCESSED_DATA_DIR, '10_xgb_features_data.csv')
XGB_MODEL_FILE = os.path.join(MODELS_DIR, '10_xgb_ranker.pkl')
NCF_MODEL_FILE = os.path.join(MODELS_DIR, '11_ncf_model.pt')
NCF_MAPPINGS_FILE = os.path.join(MODELS_DIR, '11_ncf_mappings.pkl') # Updated name for consistency
# Outputs written by this notebook.
NCF_SCORES_OUTPUT_FILE = os.path.join(OUTPUTS_DIR, '12_ncf_scores.csv')
FINAL_HYBRID_OUTPUT_FILE = os.path.join(OUTPUTS_DIR, '12_final_hybrid_scores.csv')

Mounted at /content/drive


# Step 2. XGBoost Ranking and Top-K Selection
- In this section, we load the dataset containing all features for the XGBoost model. We then load the pre-trained XGBoost classifier to predict a "like" probability for each user-item pair and select the top 5 recommendations for each user based on this score.

In [None]:
# --- Load Dataset with All Features ---
try:
    xgb_features_df = pd.read_csv(XGB_FEATURES_FILE)
    print(f"XGBoost features data loaded successfully. Shape: {xgb_features_df.shape}")
except FileNotFoundError:
    print(f"Error: XGBoost features file not found at {XGB_FEATURES_FILE}")
    xgb_features_df = None

# Bind the model name before any conditional branch. Without this, a missing
# features file skips the loading block entirely and the check further down
# fails with a NameError instead of being skipped cleanly.
xgb_ranker_model = None

if xgb_features_df is not None:
    # --- Load the Trained XGBoost Model ---
    try:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this project.
        with open(XGB_MODEL_FILE, 'rb') as f:
            xgb_ranker_model = pickle.load(f)
        print("Pre-trained XGBoost ranker model loaded.")
    except FileNotFoundError:
        print(f"Error: XGBoost model not found at {XGB_MODEL_FILE}")

if xgb_ranker_model is not None:
    # --- Define Features for Prediction ---
    # The column order is passed to the model as-is.
    # NOTE(review): presumably this matches the training feature order — confirm
    # against the training notebook. The recorded output of this cell shows
    # top-ranked scores around 1e-5, which is worth double-checking.
    feature_columns = [
        'svd_rating', 'sentiment_score', 'bert_similarity',
        'user_ave_rating', 'product_ave_rating'
    ]
    X_features = xgb_features_df[feature_columns]

    # --- Predict 'Like' Probability on the Full Dataset ---
    # Column 1 of predict_proba is the probability of the positive class (class 1).
    xgb_features_df['xgb_pred_score'] = xgb_ranker_model.predict_proba(X_features)[:, 1]
    print("Generated prediction scores with XGBoost model.")

    # --- Get Top-K Recommendations Per User ---
    # Sort each user's candidates by descending score, then keep the first TOP_K rows per user.
    TOP_K = 5
    top_k_recommendations_df = (
        xgb_features_df.sort_values(['user_id', 'xgb_pred_score'], ascending=[True, False])
        .groupby('user_id')
        .head(TOP_K)
        .reset_index(drop=True)
    )
    print(f"Extracted Top-{TOP_K} recommendations per user based on XGBoost score.")
    display(top_k_recommendations_df.head())

XGBoost features data loaded successfully. Shape: (4832, 8)
Pre-trained XGBoost ranker model loaded.
Generated prediction scores with XGBoost model.
Extracted Top-5 recommendations per user based on XGBoost score.


Unnamed: 0,user_id,asin,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,xgb_pred_score
0,A0273990TGLE0LLF0H0B,B00RKNN322,4.564091,0.510794,0.32077,5.0,4.142857,5.0,2.9e-05
1,A0273990TGLE0LLF0H0B,B0058TUZTO,4.20633,0.437074,0.552906,5.0,4.211838,5.0,1.8e-05
2,A0273990TGLE0LLF0H0B,106171327X,4.960576,0.463607,0.514674,5.0,4.646667,5.0,6e-06
3,A0273990TGLE0LLF0H0B,B00PBOHAR6,4.742276,0.397193,0.492232,5.0,4.373518,5.0,5e-06
4,A0273990TGLE0LLF0H0B,B007JPVYFO,4.786439,0.466295,0.558465,5.0,4.512,5.0,2e-06


# Step 3. NCF Scoring Engine
- Here, we use the pre-trained Neural Collaborative Filtering (NCF) model to generate a predicted rating for every candidate pair. This involves loading the model, the user/item ID mappings, and running a forward pass to get the scores.

In [None]:
# --- Define the NCF Model Class ---
# The class definition must match the one used during training to load the state_dict correctly.
class NCF(nn.Module):
    """Neural Collaborative Filtering (NCF) model.

    Looks up one embedding per user index and one per item index, concatenates
    them along the last axis, and feeds the result through a three-layer MLP
    that ends in a single output unit.

    The attribute names and the layout of ``fc_layers`` define the
    ``state_dict`` keys, so they must stay as they are for saved weights to load.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector (default 64).
    """
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user_idx, item_idx):
        """Score user/item index pairs.

        Args:
            user_idx: Long tensor of user indices.
            item_idx: Long tensor of item indices, same shape as ``user_idx``.

        Returns:
            Tensor of scores with the same shape as the index tensors.
        """
        user_emb = self.user_embedding(user_idx)
        item_emb = self.item_embedding(item_idx)
        x = torch.cat([user_emb, item_emb], dim=-1)
        # Drop only the trailing size-1 output axis. A bare squeeze() would also
        # collapse the batch axis when a single pair is scored, returning a 0-d tensor.
        return self.fc_layers(x).squeeze(-1)

# Bind the result name up front so later cells can test `ncf_scores_df is not None`
# even when the features table failed to load and the block below is skipped.
ncf_scores_df = None

if xgb_features_df is not None:
    # --- Load User/Item Mappings ---
    try:
        # NOTE: pickle.load can execute arbitrary code; only load mapping files
        # that were produced by this project.
        with open(NCF_MAPPINGS_FILE, 'rb') as f:
            user_id_to_idx, item_id_to_idx = pickle.load(f)
        print("NCF user and item mappings loaded.")

        # --- Prepare Data for NCF Prediction ---
        # Map string IDs to the integer indices used by the NCF model. IDs missing
        # from the mappings become NaN; those rows are dropped before the index
        # columns are cast to int.
        ncf_input_df = (
            xgb_features_df
            .assign(
                user_idx=xgb_features_df['user_id'].map(user_id_to_idx),
                item_idx=xgb_features_df['asin'].map(item_id_to_idx),
            )
            .dropna(subset=['user_idx', 'item_idx'])
            .astype({'user_idx': int, 'item_idx': int})
        )

        # --- Load Trained NCF Model ---
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # The embedding tables are sized from the mappings so they match the saved weights.
        ncf_prediction_model = NCF(num_users=len(user_id_to_idx), num_items=len(item_id_to_idx))
        # NOTE(review): consider torch.load(..., weights_only=True) if the installed
        # torch version supports it, since the file only needs to contain tensors.
        ncf_prediction_model.load_state_dict(torch.load(NCF_MODEL_FILE, map_location=device))
        ncf_prediction_model.to(device)
        ncf_prediction_model.eval() # Set model to evaluation mode.
        print("Pre-trained NCF model loaded and set to evaluation mode.")

        # --- Generate NCF Predictions ---
        user_tensor = torch.tensor(ncf_input_df['user_idx'].values, dtype=torch.long).to(device)
        item_tensor = torch.tensor(ncf_input_df['item_idx'].values, dtype=torch.long).to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            ncf_scores = ncf_prediction_model(user_tensor, item_tensor).cpu().numpy()

        ncf_input_df['ncf_score'] = ncf_scores
        print("Generated prediction scores with NCF model.")

        # --- Save NCF Scores ---
        ncf_scores_df = ncf_input_df[['user_id', 'asin', 'ncf_score']]
        # Create the output directory if needed so the save does not fail on a fresh project tree.
        os.makedirs(OUTPUTS_DIR, exist_ok=True)
        ncf_scores_df.to_csv(NCF_SCORES_OUTPUT_FILE, index=False)
        print(f"NCF scores saved to: {NCF_SCORES_OUTPUT_FILE}")
        display(ncf_scores_df.head())

    except FileNotFoundError as e:
        print(f"Error loading NCF artifacts: {e}. Cannot generate NCF scores.")
        ncf_scores_df = None

NCF user and item mappings loaded.
Pre-trained NCF model loaded and set to evaluation mode.
Generated prediction scores with NCF model.
NCF scores saved to: /content/drive/MyDrive/Amazon_Recommender/outputs/12_ncf_scores.csv


Unnamed: 0,user_id,asin,ncf_score
0,AAP7PPBU72QFM,B00005AXIV,4.981597
1,AAP7PPBU72QFM,B00017IX10,4.928514
2,AAP7PPBU72QFM,B00006B7TL,4.288648
3,AAP7PPBU72QFM,B000RZ8WHG,4.841337
4,AAP7PPBU72QFM,0151004714,4.345903


# Step 4. Final Hybrid Score Aggregation
- Finally, we merge the NCF scores with the top-K recommendations generated by the XGBoost ranker. The score columns are then min-max normalized to a 0–1 range, and the result is saved as a final, enriched dataset that includes scores from both models, forming the basis of the hybrid recommendation engine.

In [None]:
# Look both inputs up by name so that a skipped or failed upstream cell yields
# None here instead of a NameError.
_top_k_df = globals().get('top_k_recommendations_df')
_ncf_df = globals().get('ncf_scores_df')

if _top_k_df is not None and _ncf_df is not None:
    # --- Merge XGBoost Top-K with NCF Scores ---
    # Keep one NCF row per (user_id, asin). If a pair is repeated on both sides,
    # a plain merge multiplies those rows and the result ends up with more rows
    # than the top-K table it is supposed to enrich.
    ncf_unique_df = (
        _ncf_df[['user_id', 'asin', 'ncf_score']]
        .drop_duplicates(subset=['user_id', 'asin'], keep='first')
    )

    # Left merge: keep exactly the top-K candidates from XGBoost and attach the
    # NCF score where one exists (NaN otherwise). `validate` makes pandas raise
    # if the right-hand keys are ever non-unique again.
    final_hybrid_scores_df = pd.merge(
        _top_k_df,
        ncf_unique_df,
        on=['user_id', 'asin'],
        how='left',
        validate='many_to_one'
    )
    print("\nMerged XGBoost Top-K recommendations with NCF scores.")

    # --- Data Normalization ---
    # Normalize all scores to a 0-1 range for consistent comparison and final ranking.
    scaler = MinMaxScaler()
    score_columns = ['svd_rating', 'bert_similarity', 'sentiment_score', 'xgb_pred_score', 'ncf_score']

    # Ensure all score columns exist before trying to normalize.
    columns_to_normalize = [col for col in score_columns if col in final_hybrid_scores_df.columns]

    if columns_to_normalize:
        final_hybrid_scores_df[columns_to_normalize] = scaler.fit_transform(final_hybrid_scores_df[columns_to_normalize])
        print("Normalized all model scores to a 0-1 scale.")

    # --- Save the Final Hybrid Dataset ---
    # Create the output directory if needed so the save does not fail on a fresh project tree.
    os.makedirs(OUTPUTS_DIR, exist_ok=True)
    final_hybrid_scores_df.to_csv(FINAL_HYBRID_OUTPUT_FILE, index=False)
    print(f"Final hybrid scores saved to: {FINAL_HYBRID_OUTPUT_FILE}")
    display(final_hybrid_scores_df.head())
    print("\nFinal hybrid dataset description:")
    display(final_hybrid_scores_df.describe())


Merged XGBoost Top-K recommendations with NCF scores.
Normalized all model scores to a 0-1 scale.
Final hybrid scores saved to: /content/drive/MyDrive/Amazon_Recommender/outputs/12_final_hybrid_scores.csv


Unnamed: 0,user_id,asin,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,xgb_pred_score,ncf_score
0,A0273990TGLE0LLF0H0B,B00RKNN322,0.880023,0.593459,0.30823,5.0,4.142857,5.0,2.9e-05,0.715895
1,A0273990TGLE0LLF0H0B,B0058TUZTO,0.781554,0.528827,0.587506,5.0,4.211838,5.0,1.8e-05,0.828036
2,A0273990TGLE0LLF0H0B,106171327X,0.989149,0.55209,0.54151,5.0,4.646667,5.0,6e-06,0.847062
3,A0273990TGLE0LLF0H0B,B00PBOHAR6,0.929065,0.493864,0.514511,5.0,4.373518,5.0,4e-06,0.750526
4,A0273990TGLE0LLF0H0B,B007JPVYFO,0.941221,0.554446,0.594194,5.0,4.512,5.0,1e-06,0.869379



Final hybrid dataset description:


Unnamed: 0,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,xgb_pred_score,ncf_score
count,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0
mean,0.875271,0.611193,0.318574,4.319439,4.440217,4.652875,0.016804,0.747452
std,0.126294,0.108478,0.110678,0.615802,0.32705,0.842627,0.116196,0.125401
min,0.0,0.0,0.0,1.714286,2.591837,1.0,0.0,0.0
25%,0.828965,0.542987,0.242438,4.0,4.301573,5.0,3.1e-05,0.685314
50%,0.905879,0.613981,0.306977,4.414216,4.510373,5.0,0.000173,0.77595
75%,0.965973,0.669548,0.384436,4.8,4.646667,5.0,0.00082,0.835378
max,1.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0


# Previous Version w/o Annotation

### Step 1: Load the Merged Candidate Table + Normalize Scores

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/processed/10_xgb_features_data.csv')

In [None]:
df.head()

Unnamed: 0,user_id,asin,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall
0,AAP7PPBU72QFM,B00005AXIV,5.0,0.611018,0.26415,5.0,4.312139,5.0
1,AAP7PPBU72QFM,B00017IX10,4.923768,0.775869,0.297912,5.0,4.071429,5.0
2,AAP7PPBU72QFM,B00006B7TL,4.819247,0.583688,0.221613,5.0,4.125,5.0
3,AAP7PPBU72QFM,B000RZ8WHG,4.786166,0.506416,0.289472,5.0,4.410526,5.0
4,AAP7PPBU72QFM,0151004714,4.730593,0.94424,0.173447,5.0,4.2,5.0


In [None]:
df.shape

(4832, 8)

In [None]:
df.isnull().sum()

Unnamed: 0,0
user_id,0
asin,0
svd_rating,0
sentiment_score,0
bert_similarity,0
user_ave_rating,0
product_ave_rating,0
target_overall,0


In [None]:
# Keep only the identifier columns and the three score columns that get normalized.
cols_needed = ['user_id', 'asin', 'svd_rating', 'bert_similarity', 'sentiment_score']
# Take an explicit copy so later column assignments write to an independent
# frame rather than to a slice of the loaded table (avoids SettingWithCopyWarning).
df = df[cols_needed].copy()

In [None]:
df.head()

Unnamed: 0,user_id,asin,svd_rating,bert_similarity,sentiment_score
0,AAP7PPBU72QFM,B00005AXIV,5.0,0.26415,0.611018
1,AAP7PPBU72QFM,B00017IX10,4.923768,0.297912,0.775869
2,AAP7PPBU72QFM,B00006B7TL,4.819247,0.221613,0.583688
3,AAP7PPBU72QFM,B000RZ8WHG,4.786166,0.289472,0.506416
4,AAP7PPBU72QFM,0151004714,4.730593,0.173447,0.94424


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each of the three score columns independently to the [0, 1] range.
# The scaler is fitted on this same table, so the min/max come from these rows.
scaler = MinMaxScaler()
df[['svd_rating', 'bert_similarity', 'sentiment_score']] = scaler.fit_transform(
    df[['svd_rating', 'bert_similarity', 'sentiment_score']])

# Sanity check: after scaling, every column should report min 0 and max 1.
print(df[['svd_rating', 'bert_similarity', 'sentiment_score']].describe())


        svd_rating  bert_similarity  sentiment_score
count  4832.000000      4832.000000      4832.000000
mean      0.875273         0.318292         0.611150
std       0.126320         0.109829         0.108479
min       0.000000         0.000000         0.000000
25%       0.828900         0.242392         0.542844
50%       0.905942         0.306924         0.613958
75%       0.965994         0.384398         0.669511
max       1.000000         1.000000         1.000000


In [None]:
df.head()

Unnamed: 0,user_id,asin,svd_rating,bert_similarity,sentiment_score
0,AAP7PPBU72QFM,B00005AXIV,1.0,0.240111,0.681328
1,AAP7PPBU72QFM,B00017IX10,0.979018,0.28073,0.825856
2,AAP7PPBU72QFM,B00006B7TL,0.95025,0.188936,0.657367
3,AAP7PPBU72QFM,B000RZ8WHG,0.941145,0.270576,0.589621
4,AAP7PPBU72QFM,0151004714,0.92585,0.13099,0.97347


### Step 2: Load XGBoost Model, Retrain a Classifier, and Select Top-K

In [None]:

import pickle
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# === Step 1: Load Merged Dataset with All Features (From Day 10 Output) === #
# Rebinds `df`, replacing the column-subset, normalized frame built in the earlier cells.
df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/processed/10_xgb_features_data.csv')

# === Step 2: Load the Trained XGBoost Model === #
# NOTE(review): `xgb_model` is loaded here but never used below; all scores in
# this cell come from the freshly trained `model`. Confirm which one is intended.
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
with open('/content/drive/MyDrive/Amazon_Recommender/models/10_xgb_ranker.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

# Binary target: a rating of 4 or higher counts as a "like".
df['like'] = (df['target_overall'] >= 4).astype(int)

# Optional: Check balance
print(df['like'].value_counts())

# === Step 3: Define Features === #
feature_cols = [
    'svd_rating',
    'sentiment_score',
    'bert_similarity',
    'user_ave_rating',
    'product_ave_rating'
]
X = df[feature_cols]
y = df['like']

# === Step 4: Split for Evaluation (Optional) === #
# Stratified 80/20 split with a fixed seed so the class ratio is kept in both parts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === Step 5: Handle Imbalance + Train Classifier === #
# Ratio of negatives to positives in the training split, used to re-weight the positive class.
# This raises ZeroDivisionError if the training split contains no positive rows.
scale = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

model = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale,
    random_state=42
)

model.fit(X_train, y_train)

# === Step 6: Evaluate (Optional) === #
# AUC is computed on the held-out validation split only.
val_preds = model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, val_preds)
print(f"Validation AUC: {auc:.4f}")

# === Step 7: Predict on Full Dataset === #
# NOTE(review): X contains the rows the model was just trained on (X_train),
# so these scores are partly in-sample.
df['xgb_pred_score'] = model.predict_proba(X)[:, 1]

# === Step 8: Get Top-K Recommendations Per User === #
# Sort each user's rows by descending score and keep the first `top_k` per user.
top_k = 5
top_recs = (
    df.sort_values(['user_id', 'xgb_pred_score'], ascending=[True, False])
      .groupby('user_id')
      .head(top_k)
      .reset_index(drop=True)
)

# === Step 9: Add Source Tag === #
# Constant label recording which pipeline produced these rows.
top_recs['source'] = 'svd + bert + sentiment + xgboost (classifier)'


like
1    4462
0     370
Name: count, dtype: int64
Validation AUC: 0.8882


In [None]:
top_recs.head()

Unnamed: 0,user_id,asin,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,like,xgb_pred_score,source
0,A0273990TGLE0LLF0H0B,B00RKNN322,4.564091,0.510794,0.32077,5.0,4.142857,5.0,1,0.999352,svd + bert + sentiment + xgboost (classifier)
1,A0273990TGLE0LLF0H0B,B00PBOHAR6,4.742276,0.397193,0.492232,5.0,4.373518,5.0,1,0.998917,svd + bert + sentiment + xgboost (classifier)
2,A0273990TGLE0LLF0H0B,B007JPVYFO,4.786439,0.466295,0.558465,5.0,4.512,5.0,1,0.998792,svd + bert + sentiment + xgboost (classifier)
3,A0273990TGLE0LLF0H0B,106171327X,4.960576,0.463607,0.514674,5.0,4.646667,5.0,1,0.997239,svd + bert + sentiment + xgboost (classifier)
4,A0273990TGLE0LLF0H0B,B0058TUZTO,4.20633,0.437074,0.552906,5.0,4.211838,5.0,1,0.993815,svd + bert + sentiment + xgboost (classifier)


In [None]:
top_recs.describe()

Unnamed: 0,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,like,xgb_pred_score
count,4832.0,4832.0,4832.0,4832.0,4832.0,4832.0,4832.0,4832.0
mean,4.546836,0.530972,0.329134,4.319342,4.440089,4.652939,0.923427,0.882246
std,0.458954,0.123733,0.091291,0.615911,0.327057,0.842734,0.26594,0.268362
min,1.366741,-0.166114,0.064567,1.714286,2.591837,1.0,0.0,0.000601
25%,4.37835,0.453062,0.266045,4.0,4.301311,5.0,1.0,0.945734
50%,4.658262,0.534175,0.319684,4.411765,4.510373,5.0,1.0,0.992878
75%,4.876447,0.59754,0.384081,4.8,4.646667,5.0,1.0,0.999038
max,5.0,0.9745,0.895774,5.0,5.0,5.0,1.0,0.999979


In [None]:
top_recs.shape

(4832, 11)

In [None]:
top_recs['source'].value_counts()

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
svd + bert + sentiment + xgboost (classifier),4832


### PyTorch Code: Generate ncf_score for Your Candidates

In [None]:
import torch
import pandas as pd
import pickle
import torch.nn as nn

# === Step 1: Load Candidate Pairs === #
# Rebinds `df` to a fresh copy of the feature table (earlier added columns are discarded).
df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/processed/10_xgb_features_data.csv')

# === Step 2: Load user/item mappings === #
# The pickle is unpacked as a (user mapping, item mapping) pair.
# NOTE(review): this file name differs from NCF_MAPPINGS_FILE ('11_ncf_mappings.pkl')
# defined at the top of the notebook — confirm both refer to the same mappings.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/MyDrive/Amazon_Recommender/models/user_item_mappings.pkl', 'rb') as f:
    user2idx, item2idx = pickle.load(f)

# Map user_id and asin to indices (IDs absent from the mappings become NaN)
df['user_idx'] = df['user_id'].map(user2idx)
df['item_idx'] = df['asin'].map(item2idx)

# Drop rows with unmapped IDs, then cast the index columns to int
df = df.dropna(subset=['user_idx', 'item_idx'])
df['user_idx'] = df['user_idx'].astype(int)
df['item_idx'] = df['item_idx'].astype(int)

# === Step 3: Define the NCF Model === #
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class NCF(nn.Module):
    """Neural Collaborative Filtering (NCF) model.

    Looks up one embedding per user index and one per item index, concatenates
    them along the last axis, and feeds the result through a three-layer MLP
    that ends in a single output unit.

    The attribute names and the layout of ``fc_layers`` define the
    ``state_dict`` keys, so they must stay as they are for saved weights to load.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector (default 64).
    """
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user_idx, item_idx):
        """Score user/item index pairs.

        Args:
            user_idx: Long tensor of user indices.
            item_idx: Long tensor of item indices, same shape as ``user_idx``.

        Returns:
            Tensor of scores with the same shape as the index tensors.
        """
        user_emb = self.user_embedding(user_idx)
        item_emb = self.item_embedding(item_idx)
        x = torch.cat([user_emb, item_emb], dim=-1)
        # Drop only the trailing size-1 output axis. A bare squeeze() would also
        # collapse the batch axis when a single pair is scored, returning a 0-d tensor.
        return self.fc_layers(x).squeeze(-1)

# Instantiate the model with embedding tables sized from the loaded mappings
# NOTE(review): presumably these sizes match the ones used in training — confirm;
# load_state_dict fails on a shape mismatch.
ncf_model = NCF(num_users=len(user2idx), num_items=len(item2idx))
# NOTE(review): this weights file name differs from NCF_MODEL_FILE ('11_ncf_model.pt')
# defined at the top of the notebook — confirm which checkpoint is intended.
ncf_model.load_state_dict(torch.load(
    '/content/drive/MyDrive/Amazon_Recommender/models/ncf_model.pt', map_location=device))
ncf_model.to(device)
# Evaluation mode for inference.
ncf_model.eval()

# === Step 4: Run Predictions === #
# Build long tensors of indices on the same device as the model.
user_tensor = torch.tensor(df['user_idx'].values, dtype=torch.long).to(device)
item_tensor = torch.tensor(df['item_idx'].values, dtype=torch.long).to(device)

# Inference only: gradients are disabled, and the result is moved to CPU as a NumPy array.
with torch.no_grad():
    scores = ncf_model(user_tensor, item_tensor).cpu().numpy()

df['ncf_score'] = scores

# === Step 5: Save NCF Scores === #
# Only the identifiers and the NCF score are written; the index is omitted.
df[['user_id', 'asin', 'ncf_score']].to_csv(
    '/content/drive/MyDrive/Amazon_Recommender/outputs/ncf_scores.csv', index=False)



✅ Saved NCF scores to ncf_scores.csv


In [None]:
df.head()

Unnamed: 0,user_id,asin,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,user_idx,item_idx,ncf_score
0,AAP7PPBU72QFM,B00005AXIV,5.0,0.611018,0.26415,5.0,4.312139,5.0,0,1102,3.755545
1,AAP7PPBU72QFM,B00017IX10,4.923768,0.775869,0.297912,5.0,4.071429,5.0,0,4694,2.434489
2,AAP7PPBU72QFM,B00006B7TL,4.819247,0.583688,0.221613,5.0,4.125,5.0,0,2003,3.179707
3,AAP7PPBU72QFM,B000RZ8WHG,4.786166,0.506416,0.289472,5.0,4.410526,5.0,0,15853,3.193462
4,AAP7PPBU72QFM,0151004714,4.730593,0.94424,0.173447,5.0,4.2,5.0,0,0,2.210014


In [None]:
# Attach the classifier outputs to the NCF-scored candidates.
# The right-hand side is reduced to one row per (user_id, asin): if a pair is
# repeated on both sides, a plain merge multiplies those rows and `merged` ends
# up with more rows than `df`. `top_recs` is sorted by descending score within
# each user, so keeping the first occurrence keeps the highest-scored row.
# `validate` makes pandas raise if the right-hand keys are ever non-unique.
merged = df.merge(
    top_recs[['user_id', 'asin', 'like', 'xgb_pred_score']]
        .drop_duplicates(subset=['user_id', 'asin'], keep='first'),
    on=['asin', 'user_id'],
    how='left',
    validate='many_to_one',
)

In [None]:
merged.head()

Unnamed: 0,user_id,asin,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,user_idx,item_idx,ncf_score,like,xgb_pred_score
0,AAP7PPBU72QFM,B00005AXIV,5.0,0.611018,0.26415,5.0,4.312139,5.0,0,1102,3.755545,1,0.999263
1,AAP7PPBU72QFM,B00017IX10,4.923768,0.775869,0.297912,5.0,4.071429,5.0,0,4694,2.434489,1,0.999804
2,AAP7PPBU72QFM,B00006B7TL,4.819247,0.583688,0.221613,5.0,4.125,5.0,0,2003,3.179707,1,0.999843
3,AAP7PPBU72QFM,B000RZ8WHG,4.786166,0.506416,0.289472,5.0,4.410526,5.0,0,15853,3.193462,1,0.999774
4,AAP7PPBU72QFM,0151004714,4.730593,0.94424,0.173447,5.0,4.2,5.0,0,0,2.210014,1,0.999252


In [None]:
merged.isnull().sum()

Unnamed: 0,0
user_id,0
asin,0
svd_rating,0
sentiment_score,0
bert_similarity,0
user_ave_rating,0
product_ave_rating,0
target_overall,0
user_idx,0
item_idx,0


In [None]:
merged.describe()

Unnamed: 0,svd_rating,sentiment_score,bert_similarity,user_ave_rating,product_ave_rating,target_overall,user_idx,item_idx,ncf_score,like,xgb_pred_score
count,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0,4834.0
mean,4.546826,0.531021,0.329368,4.319439,4.440217,4.652875,498.527513,53944.798511,4.202153,0.923459,0.882294
std,0.458859,0.123731,0.091996,0.615802,0.32705,0.842627,289.613599,42288.095321,0.759817,0.26589,0.268317
min,1.366741,-0.166114,0.064567,1.714286,2.591837,1.0,0.0,0.0,0.541093,0.0,0.000601
25%,4.378585,0.453224,0.266083,4.0,4.301573,5.0,247.0,16626.75,3.827774,1.0,0.94575
50%,4.658033,0.534202,0.319729,4.414216,4.510373,5.0,497.0,48595.0,4.413924,1.0,0.992914
75%,4.87637,0.597582,0.384113,4.8,4.646667,5.0,749.75,86954.75,4.719401,1.0,0.999037
max,5.0,0.9745,0.895774,5.0,5.0,5.0,999.0,159557.0,5.706903,1.0,0.999979


In [None]:
merged.to_csv('/content/drive/MyDrive/Amazon_Recommender/outputs/12_hybrid_combined_classifier_bert_xgboost_ncf.csv', index=False)

