In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
import operator
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Load Ratings Data
# Only the first three tab-separated columns of u.data are read.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, usecols=range(3))

# Movie Properties
# Per-movie rating count ('size') and mean rating ('mean'). String aliases are
# used instead of np.size / np.mean: recent pandas versions warn when NumPy
# callables are passed to agg() and recommend the string form to keep the
# current behaviour. The resulting column labels are unchanged.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
# Min-max scale the rating counts so the least-rated movie maps to 0 and the
# most-rated movie maps to 1.
movieNormalizedNumRatings = movieNumRatings.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# Load Movie Data
# Pull the two per-movie columns out once so the loop below does a single
# label lookup per value instead of re-slicing the DataFrames on every line.
normalizedNumRatings = movieNormalizedNumRatings['size']
meanRatings = movieProperties['rating']['mean']

# movieDict maps movie_id -> (title, genre flag array, normalized rating
# count, mean rating).
movieDict = {}
with open(r'ml-100k/u.item', encoding="ISO-8859-1") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on int('').
            continue
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Fields from index 5 onward are parsed as integer genre flags.
        genres = list(map(int, fields[5:25]))
        # .loc raises KeyError for a movie that has no ratings at all.
        movieDict[movieID] = (name, np.array(genres), normalizedNumRatings.loc[movieID], meanRatings.loc[movieID])

# Define Distance Function for KNN
def ComputeDistance(a, b):
    """Return the distance between two movie records.

    The distance is the cosine distance between the genre vectors plus the
    absolute difference of the popularity values.

    Args:
        a: Movie record; a[1] is the genre vector and a[2] the popularity.
        b: Movie record with the same layout as ``a``.

    Returns:
        Genre distance plus popularity distance, as a number.
    """
    genresA = a[1]
    genresB = b[1]
    hasGenresA = any(genresA)
    hasGenresB = any(genresB)
    if hasGenresA and hasGenresB:
        genreDistance = spatial.distance.cosine(genresA, genresB)
    elif hasGenresA or hasGenresB:
        # Cosine distance is undefined (NaN) when one vector is all zeros,
        # and NaN keys make any later sort by distance unreliable. Treat a
        # movie without genres as sharing nothing with one that has genres.
        genreDistance = 1.0
    else:
        # Neither movie has a genre flag set: identical genre profiles.
        genreDistance = 0.0
    popularityDistance = abs(a[2] - b[2])
    return genreDistance + popularityDistance

# Get Neighbors Function
def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to ``movieID``.

    Every other entry of the module-level ``movieDict`` is scored against
    the target with ``ComputeDistance`` and ranked by ascending distance.

    Args:
        movieID: Key in ``movieDict`` identifying the target movie.
        K: Maximum number of neighbors to return.

    Returns:
        A list of movie IDs, nearest first. It contains fewer than K entries
        when ``movieDict`` holds fewer than K other movies, and is empty
        when K is zero or negative.

    Raises:
        KeyError: If ``movieID`` is not a key of ``movieDict``.
    """
    # Look the target up once rather than on every loop iteration.
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, properties))
        for movie, properties in movieDict.items()
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing, unlike indexing range(K), does not raise when K exceeds the
    # number of candidates. max() keeps a negative K from slicing from the
    # end of the list.
    return [movie for movie, _ in distances[:max(K, 0)]]

# KNN Based Recommendation
# Print the K nearest neighbors of movie 1 with their mean ratings and
# accumulate those ratings into avgRating.
K = 10
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    avgRating += movieDict[neighbor][3]
    print(movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))

# Average over the neighbors actually returned, not the requested K.
if neighbors:
    avgRating /= len(neighbors)

# Prepare Data for XGBoost
# One row per movie: the genre flags followed by the normalized popularity.
# The regression target is the movie's mean rating.
movieFeatures = []
movieRatings = []
movieIDs = []

for movieID, values in movieDict.items():
    # Unpack into meanRating, not avgRating, so the KNN average computed
    # above is not overwritten by the last movie's rating.
    name, genres, popularity, meanRating = values
    features = list(genres) + [popularity]
    movieFeatures.append(features)
    movieRatings.append(meanRating)
    movieIDs.append(movieID)

# Convert to numpy array
X = np.array(movieFeatures)
y = np.array(movieRatings)
movieIDs = np.array(movieIDs)

# Split the data into training and testing sets
# movieIDs is split alongside X and y so each test row can be mapped back to
# its movie.
X_train, X_test, y_train, y_test, movieIDs_train, movieIDs_test = train_test_split(X, y, movieIDs, test_size=0.2, random_state=42)

# Train XGBoost model
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xg_reg.fit(X_train, y_train)

# Predict with XGBoost
y_pred = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

# Compare XGBoost Predictions with KNN Recommendations
test_movie_id = 1
knn_neighbors = getNeighbors(test_movie_id, K)
# Reuse the predictions computed above instead of predicting a second time.
xgboost_predictions = y_pred

# Rank the test movies by predicted rating, highest first, so the XGBoost
# column reflects the model's output rather than the arbitrary order of the
# test split. A stable sort keeps tied predictions in test-set order.
xgb_ranking = np.argsort(-xgboost_predictions, kind="stable")

# Example Comparison: top K movie recommendations from each approach
print("\nComparing KNN and XGBoost Recommendations:")
# zip stops at the shorter sequence, so a short neighbor list or a small test
# set cannot cause an IndexError.
for knn_movie_id, test_index in zip(knn_neighbors, xgb_ranking[:K]):
    knn_movie = movieDict[knn_movie_id][0]
    xgb_movie_id = movieIDs_test[test_index]
    xgb_movie = movieDict[xgb_movie_id][0]
    print(f"KNN: {knn_movie} | XGBoost: {xgb_movie}")



FileNotFoundError: [Errno 2] No such file or directory: 'ml-100k/u.data'