In [34]:
import math
import re
from collections import Counter, defaultdict
from math import sqrt

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [35]:
def createToken(my_string):
    """Split a string into lowercase tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore) and hyphens, so ``"Action|Sci-Fi"`` becomes
    ``['action', 'sci-fi']``.

    Parameters
    ----------
    my_string : str
        Text to tokenize. Any non-string value (for example the NaN that
        pandas produces for an empty CSV field) yields an empty list.

    Returns
    -------
    list of str
        Lowercased tokens in order of appearance.
    """
    if not isinstance(my_string, str):
        # Missing values have no tokens; avoid AttributeError on .lower().
        return []
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.findall(r'[\w\-]+', my_string.lower())


def tokenize_movies(movies):
    """Add a ``tokens`` column holding the tokenized ``genres`` of each row.

    The frame is modified in place (a ``tokens`` column is written to it)
    and the same frame object is returned.
    """
    genre_tokens = list(map(createToken, movies['genres']))
    movies['tokens'] = genre_tokens
    return movies


def CreateFeature(movies):
    """Attach a TF-IDF feature vector to every row of ``movies``.

    For a token ``t`` in movie ``d`` the weight is::

        (tf(t, d) / max_k tf(k, d)) * log10(N / df(t))

    where ``N`` is the number of rows in ``movies`` and ``df(t)`` is the
    number of rows whose token list contains ``t``.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must have a ``tokens`` column whose entries are lists of strings.
        The frame is modified in place: a ``features`` column is added.

    Returns
    -------
    movies : pandas.DataFrame
        The same frame, with ``features`` holding one
        ``scipy.sparse.csr_matrix`` of shape ``(1, len(vocabulary))`` per row.
        Rows with no tokens get an all-zero matrix.
    vocabulary : dict
        Maps each distinct token to its column index; tokens are numbered
        in sorted order.
    """
    token_lists = list(movies.tokens)
    n_movies = len(movies)

    # Sorted, de-duplicated token set -> column index.
    distinct_tokens = sorted({token for tokens in token_lists for token in tokens})
    vocabulary = {token: index for index, token in enumerate(distinct_tokens)}

    # Document frequency: each token is counted at most once per movie.
    df = Counter()
    for tokens in token_lists:
        df.update(set(tokens))

    all_csr = []
    for tokens in token_lists:
        tf = Counter(tokens)
        colmn, data = [], []
        if tf:
            # Guarded: an empty token list has no maximum term frequency.
            max_k = max(tf.values())
            for genre, freq in tf.items():
                colmn.append(vocabulary[genre])
                data.append((freq / max_k) * math.log10(n_movies / df[genre]))  # tf-idf

        # Build the 1 x |vocabulary| sparse row once per movie.
        X = csr_matrix(
            (np.asarray(data, dtype=float),
             (np.zeros(len(data), dtype=int), np.asarray(colmn, dtype=int))),
            shape=(1, len(vocabulary)),
        )
        all_csr.append(X)

    movies['features'] = all_csr

    return movies, vocabulary

def Find_CosineSim(a, b):
    """Cosine similarity between two sparse row vectors.

    Parameters
    ----------
    a, b : sparse matrices exposing ``toarray()``
        Each is densified with ``toarray()``; the dot product is taken as
        ``a @ b.T``, so for two ``(1, n)`` inputs the result is ``(1, 1)``.

    Returns
    -------
    numpy.ndarray
        ``a . b / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is defined as 0 instead of the NaN that 0/0 would give.
    """
    a = a.toarray()
    b = b.toarray()
    dot = np.dot(a, b.T)
    norm = np.sqrt(np.sum(np.square(a))) * np.sqrt(np.sum(np.square(b)))
    if norm == 0:
        # A zero vector shares nothing with any other vector.
        return np.zeros_like(dot, dtype=float)
    return dot / norm



def make_predictions(movies, ratings_train, ratings_test):
    """Predict a rating for every (userId, movieId) pair in ``ratings_test``.

    Each prediction is the similarity-weighted average of the ratings the
    same user gave in ``ratings_train``, where the weight of a rated movie is
    its cosine similarity (``Find_CosineSim``) to the target movie's
    ``features``.

    Fallbacks, in order:

    * if the weights sum to zero (no similar rated movie, or the target or
      rated movies are absent from ``movies``), the user's mean training
      rating is used;
    * if the user has no training ratings at all, the mean of all training
      ratings is used.

    Parameters
    ----------
    movies : pandas.DataFrame
        Needs ``movieId`` and ``features`` columns.
    ratings_train : pandas.DataFrame
        Needs ``userId``, ``movieId`` and ``rating`` columns.
    ratings_test : pandas.DataFrame
        Needs ``userId`` and ``movieId`` columns.

    Returns
    -------
    numpy.ndarray
        One predicted rating per row of ``ratings_test``, in row order.
    """
    # movieId -> feature vector, built once. setdefault keeps the first
    # occurrence of a duplicated movieId.
    features_by_movie = {}
    for movie_id, features in zip(movies.movieId, movies.features):
        features_by_movie.setdefault(int(movie_id), features)

    # Per-user rating history and mean, built once instead of re-filtering
    # the training frame for every test row.
    train_by_user = ratings_train.groupby('userId')
    history_by_user = {
        user_id: list(zip(group.movieId, group.rating))
        for user_id, group in train_by_user
    }
    mean_by_user = train_by_user['rating'].mean().to_dict()
    global_mean = ratings_train['rating'].mean()

    predictions = []
    for test_userid, test_movieid in zip(ratings_test.userId, ratings_test.movieId):
        weight_ratings = []
        weights = []
        # The target movie's features do not change inside the inner loop.
        target_features = features_by_movie.get(int(test_movieid))

        if target_features is not None:
            for train_movieid, rating in history_by_user.get(test_userid, []):
                rated_features = features_by_movie.get(int(train_movieid))
                if rated_features is None:
                    continue
                # Collapse the similarity to a plain float whatever its shape.
                cos_sim_weight = float(np.sum(Find_CosineSim(rated_features, target_features)))
                if not np.isfinite(cos_sim_weight):
                    # NaN/inf (e.g. from a zero-norm vector) carries no information.
                    continue
                weight_ratings.append(rating * cos_sim_weight)
                weights.append(cos_sim_weight)

        total_weight = np.sum(weights) if weights else 0.0
        if total_weight != 0:
            predictions.append(np.sum(weight_ratings) / total_weight)
        elif test_userid in mean_by_user:
            predictions.append(mean_by_user[test_userid])
        else:
            # Cold-start user: no training ratings to average.
            predictions.append(global_mean)

    return np.asarray(predictions)

In [37]:
# Load the train/test rating splits and the movie catalogue.
ratings_train = pd.read_csv('train.csv')
ratings_test = pd.read_csv('test.csv')
movies = pd.read_csv("movies.csv")

# Tokenize the genres, then attach a TF-IDF feature vector to every movie.
movies, vocabulary = CreateFeature(tokenize_movies(movies))

# Predict a rating for every test row and store it next to the true rating.
predictions = make_predictions(movies, ratings_train, ratings_test)
predicted_ratings = pd.Series(predictions)
ratings_test['predicted_ratings'] = predicted_ratings.to_numpy(copy=True)

In [38]:
# Mean absolute error between the true ratings and the predicted ratings in ratings_test.
print(mean_absolute_error(ratings_test['rating'], ratings_test['predicted_ratings']))

0.6580660778302934


In [43]:
# Root-mean-squared error: square root of the mean squared error between the
# true ratings and the predicted ratings in ratings_test.
rmse = sqrt(mean_squared_error(ratings_test['rating'], ratings_test['predicted_ratings']))
# Bare last expression so the notebook displays the value.
rmse

0.8514172186463351