## Downloading dataset


In [1]:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile


def download_and_unzip(url, extract_to='.'):
    """Download the zip archive at ``url`` and extract it into ``extract_to``.

    The archive is read fully into memory before extraction, so this is
    intended for small downloads.

    Args:
        url: Address of the zip archive; anything ``urlopen`` accepts.
        extract_to: Directory the archive members are extracted into.
    """
    # Context managers make sure the HTTP response and the archive handle are
    # closed even if reading or extraction raises.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())
    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)


# Fetch the MovieLens 100K archive and unpack it into the working directory;
# later cells read the extracted files from ./ml-100k/.
download_and_unzip("https://files.grouplens.org/datasets/movielens/ml-100k.zip",
                   extract_to='.')

Next, let's import all of the modules that we'll use in this notebook.

In [2]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn import ensemble as ens
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

## Building training and testing datasets

I will use the pre-split train and test datasets that ship with the archive: ua.base and ua.test.

In [3]:
def make_dfs(data_dir="./ml-100k"):
  """Load the predefined ua.base / ua.test rating split.

  Args:
    data_dir: Directory containing the tab-separated ``ua.base`` and
      ``ua.test`` files. Defaults to the folder the notebook extracts to.

  Returns:
    A ``(train_df, test_df)`` tuple of DataFrames with the columns
    ``user_id``, ``item_id``, ``rating`` and ``timestamp``.
  """
  columns_name = ['user_id', 'item_id', 'rating', 'timestamp']
  # The files have no header row, so the column names are supplied explicitly.
  train_df = pd.read_csv(f"{data_dir}/ua.base", sep="\t", names=columns_name)
  test_df = pd.read_csv(f"{data_dir}/ua.test", sep="\t", names=columns_name)

  return train_df, test_df

In [4]:
def preproc(train_df, test_df, data_dir="./ml-100k"):
    """Join movie and user features onto the ratings and build model matrices.

    Args:
        train_df: Training ratings with ``user_id``, ``item_id``, ``rating``
            and ``timestamp`` columns. Not modified.
        test_df: Test ratings with the same columns. Not modified.
        data_dir: Directory containing ``u.item``, ``u.user`` and
            ``u.occupation``.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as NumPy arrays. Features are
        the genre flags, age divided by the maximum user age, label-encoded
        sex and one-hot encoded occupation; targets are ``rating - 1``.
    """
    film_columns = ["item_id", "movie title", "release date", "video release date",
                    "IMDb URL", "unknown", "Action", "Adventure", "Animation",
                    "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
                    "Thriller", "War", "Western"]
    unused_film_columns = ["movie title", "release date", "IMDb URL", "unknown",
                           "video release date"]
    id_columns = ["item_id", "user_id", "timestamp"]

    # Reading and preprocessing data about movies: only the genre flags are kept
    films_df = pd.read_csv(f"{data_dir}/u.item", sep="|", names=film_columns, encoding='latin-1')
    films_df = films_df.drop(columns=unused_film_columns)

    # Reading and preprocessing data about users
    user_columns = ["user_id", "age", "sex", "occupation", "zip_code"]
    user_df = pd.read_csv(f"{data_dir}/u.user", sep="|", names=user_columns, encoding='latin-1')
    user_df["sex"] = pp.LabelEncoder().fit_transform(user_df["sex"])
    occup_df = pd.read_csv(f"{data_dir}/u.occupation", sep="\t", names=["jobs"])
    le = pp.LabelEncoder()
    le.fit(occup_df["jobs"])
    user_df["occupation"] = le.transform(user_df["occupation"])
    user_df = user_df.drop(columns=["zip_code"])

    # Computed once and shared so both splits are scaled identically
    max_age = user_df["age"].abs().max()

    def _join_features(ratings_df):
        """Attach movie and user columns, shift ratings to start at 0, scale age."""
        merged = pd.merge(ratings_df, films_df, how='left', on='item_id')
        merged = pd.merge(merged, user_df, how='left', on='user_id')
        merged["rating"] = merged["rating"] - 1
        merged["age"] = merged["age"] / max_age
        return merged

    train_df = _join_features(train_df)
    test_df = _join_features(test_df)

    # Sparsing occupation info. The encoder is fitted on the training split only;
    # handle_unknown="ignore" encodes an occupation unseen in training as all
    # zeros instead of raising during transform.
    encoder = pp.OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoder.fit(train_df[["occupation"]])

    def _one_hot_occupation(df):
        """Replace the integer occupation column with its one-hot columns."""
        encoded = pd.DataFrame(
            encoder.transform(df[["occupation"]]),
            columns=encoder.get_feature_names_out(["occupation"]),
            index=df.index,  # keep rows aligned with df when concatenating
        )
        return pd.concat([df.drop(columns="occupation"), encoded], axis=1)

    # Getting X, y division: identifiers and timestamp are not used as features
    train_df = _one_hot_occupation(train_df).drop(columns=id_columns)
    test_df = _one_hot_occupation(test_df).drop(columns=id_columns)

    train_y = train_df["rating"].values
    train_x = train_df.drop(columns="rating").values
    test_y = test_df["rating"].values
    test_x = test_df.drop(columns="rating").values

    # DataFrame.info() prints the summary itself and returns None, so it must not
    # be wrapped in print().
    test_df.info()
    return train_x, train_y, test_x, test_y


In [5]:
# Load the predefined split and turn it into model-ready arrays.
# train_df / test_df themselves keep their user_id and item_id columns,
# which the per-user lookup built in the next cell relies on.
train_df, test_df = make_dfs()
train_x, train_y, test_x, test_y = preproc(train_df, test_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9430 entries, 0 to 9429
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rating         9430 non-null   int64  
 1   Action         9430 non-null   int64  
 2   Adventure      9430 non-null   int64  
 3   Animation      9430 non-null   int64  
 4   Children's     9430 non-null   int64  
 5   Comedy         9430 non-null   int64  
 6   Crime          9430 non-null   int64  
 7   Documentary    9430 non-null   int64  
 8   Drama          9430 non-null   int64  
 9   Fantasy        9430 non-null   int64  
 10  Film-Noir      9430 non-null   int64  
 11  Horror         9430 non-null   int64  
 12  Musical        9430 non-null   int64  
 13  Mystery        9430 non-null   int64  
 14  Romance        9430 non-null   int64  
 15  Sci-Fi         9430 non-null   int64  
 16  Thriller       9430 non-null   int64  
 17  War            9430 non-null   int64  
 18  Western 

In [6]:
# Getting useful data for recommendation:
#   test_items[k]            -> item_id of the k-th test row
#   test_user_lines[user_id] -> positions of that user's rows in the test set
# Positions (not index labels) are stored because they are later used to index
# the NumPy arrays test_x / test_y. Columns are converted with tolist() so that
# keys and items are plain Python ints, and no per-row Series has to be built
# as iterrows() would do.
test_items = test_df["item_id"].tolist()
test_user_lines = {}
for row_pos, user_id in enumerate(test_df["user_id"].tolist()):
  test_user_lines.setdefault(user_id, []).append(row_pos)

## Using the best pipeline I got from TPOT

In [7]:
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
# XGBClassifier only takes keyword hyperparameters. The MultinomialNB instance
# that used to be passed positionally was never stacked in front of the booster:
# depending on the xgboost version it is either rejected or misread as the
# `objective` argument. It is therefore left out; the hyperparameters below are
# the ones from the TPOT result.
clf = XGBClassifier(
    learning_rate=0.01,
    max_depth=9,
    min_child_weight=13,
    n_estimators=100,
    n_jobs=1,
    subsample=0.45,
    verbosity=0,
)
clf.fit(train_x, train_y)
clf.score(test_x, test_y) # Accuracy may not be high, but our main concern is not accuracy



0.3637327677624602

## Evaluating model
As the main metric for evaluating my system I decided to use the
NDCG score. I chose it because it supports non-binary (graded) notions of relevance, which in our case are the ratings.

In [8]:
# Predictions for every test row, obtained with a single batched predict() call
# instead of one call per row; kept as a list of the predicted class labels.
preds = list(clf.predict(test_x))

In [9]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score
from sklearn.metrics import mean_squared_error
# test_y and preds are both on the shifted (rating - 1) scale, so the error is
# expressed in rating points.
print("Mean absolute error: ",  mean_absolute_error(test_y, preds))

Mean absolute error:  0.8632025450689289


In [10]:
def find_ndcg(user_id):
  """Return the NDCG of the predicted ratings for one user's test rows.

  Args:
    user_id: A key of ``test_user_lines``.

  Returns:
    The ``ndcg_score`` of the predicted ratings against the true ratings,
    computed over the rows listed in ``test_user_lines[user_id]``.
  """
  rows = test_user_lines[user_id]
  # All of the user's rows are predicted in one batched call; ndcg_score expects
  # 2-D inputs, hence the single-query wrapping list.
  predictions = [clf.predict(test_x[rows])]
  real_rating = [test_y[rows]]

  return ndcg_score(real_rating, predictions)

In [11]:
def evaluate():
  """Return the mean of ``find_ndcg`` over every user in ``test_user_lines``."""
  running_total = 0
  for user_id in test_user_lines:
    running_total += find_ndcg(user_id)
  return running_total / len(test_user_lines)

In [12]:
print("Mean ndcg score: ", evaluate()) #  Averaged over all test users; as you can see, the result is not bad

Mean ndcg score:  0.8975866311141109


## Recommendation example:

In [13]:
def recommend_10(user_id):
  """Print a user's test movies ranked by predicted rating and by true rating.

  Since ua.test is used for testing, where each user has exactly 10 ratings,
  the user is recommended those 10 movies ordered by the predicted rating.
  The same movies ordered by their real rating are printed as the ideal list.

  Args:
    user_id: A key of ``test_user_lines``.
  """
  rows = test_user_lines[user_id]
  items = [test_items[i] for i in rows]
  # One batched predict() call for all of the user's rows.
  predictions = clf.predict(test_x[rows])
  real_rating = test_y[rows]

  # sorted() is stable, so movies with equal ratings keep their test-set order.
  recommendations = sorted(zip(items, predictions), key=lambda pair: pair[1], reverse=True)
  ideal_recommendations = sorted(zip(items, real_rating), key=lambda pair: pair[1], reverse=True)

  print("My recommendations: ", [item for item, _ in recommendations])
  print("Ideal recommendations: ", [item for item, _ in ideal_recommendations])



In [14]:
recommend_10(8) # Rank the 10 test movies of user #8 by predicted rating

My recommendations:  [22, 50, 79, 89, 182, 294, 338, 385, 457, 550]
Ideal recommendations:  [22, 50, 182, 79, 89, 338, 294, 550, 385, 457]


In [15]:
# As you can see, my recommendations are not far from the ideal ones on the test data.