In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [131]:
# Column names for the tab-separated rating files (the files carry no header row).
train_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# The test frame declares two extra columns that calc_mae fills in later.
# NOTE(review): presumably the file itself only has the four rating columns,
# so these two load as NaN - confirm against the data.
test_cols = train_cols + ['predict_rating', 'difference']

data_train = pd.read_csv("ml-100k/u1.base", sep='\t', names=train_cols)
data_test = pd.read_csv("ml-100k/u1.test", sep='\t', names=test_cols)

# Mean training rating per movie, and the same values truncated to whole
# numbers (astype(int) drops the fractional part, it does not round).
average_movie_train = data_train.groupby('movie_id')['rating'].mean()
average_movie_train_int = average_movie_train.astype(int)
# Mean of the truncated per-movie ratings, itself truncated to an integer.
avg = average_movie_train_int.values.mean().astype(int)

# Movies ranked from highest to lowest mean rating, re-indexed 0..n-1 so that
# positional order equals rank.
average_movie_train_df = (
    average_movie_train
    .rename('rating_avg')
    .reset_index()
    .sort_values(by='rating_avg', ascending=False)
    .reset_index(drop=True)
)

# One row per user holding the list of movie ids that user rated in each split.
user_movies_train_df = (
    data_train.groupby('user_id')['movie_id']
    .apply(list)
    .rename('movies_list')
    .reset_index()
)
user_movies_test_df = (
    data_test.groupby('user_id')['movie_id']
    .apply(list)
    .rename('movies_list')
    .reset_index()
)

In [134]:
def user_recommendations(user_movies_train_df, average_movie_train_df, n=20, random_state=None):
    """Build two recommendation lists of up to ``n`` movies for every user.

    Parameters
    ----------
    user_movies_train_df : DataFrame
        One row per user with columns ``user_id`` and ``movies_list`` (the
        movie ids the user has already rated).
    average_movie_train_df : DataFrame
        Catalogue with a ``movie_id`` column, already ordered best-first; the
        row order is used as the ranking.
    n : int, default 20
        Maximum number of recommendations per user.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for the random baseline. ``None`` keeps the
        previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    (dict, dict)
        ``user_id -> list of movie ids`` for the "best average rating"
        strategy and for the random strategy, in that order.

    Raises
    ------
    ValueError
        If ``n`` is negative.

    Notes
    -----
    Lists are shorter than ``n`` when the catalogue does not contain enough
    (unseen) movies; previously that case raised ``IndexError`` from ``.iloc``
    or ``ValueError`` from ``.sample``.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # Pull the ranking out once instead of doing a row lookup per candidate.
    ranked_ids = average_movie_train_df['movie_id'].astype(int).tolist()
    sample_size = min(n, len(ranked_ids))

    # A single generator shared across users, so a seed gives reproducible but
    # still different samples per user.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    user_recommendations_avg = {}
    user_recommendations_random = {}
    user_rows = zip(user_movies_train_df['user_id'], user_movies_train_df['movies_list'])
    for user_id, movies in user_rows:
        seen = set(movies)  # O(1) membership instead of scanning the list

        # Average: walk the ranking and keep the first n movies not yet rated.
        rec_list_avg = []
        for movie_id in ranked_ids:
            if len(rec_list_avg) >= n:
                break
            if movie_id not in seen:
                rec_list_avg.append(movie_id)
        user_recommendations_avg[user_id] = rec_list_avg

        # Random: uniform draw without replacement from the whole catalogue.
        # NOTE(review): unlike the average strategy this may include movies
        # the user already rated; kept as in the original baseline.
        rec_list_random = (
            average_movie_train_df['movie_id']
            .sample(n=sample_size, random_state=rng)
            .tolist()
        )
        user_recommendations_random[user_id] = rec_list_random
    return user_recommendations_avg, user_recommendations_random

In [135]:
# Recommendation lists for every training user: the best-average-rating
# strategy and the random baseline, each keyed by user id.
(all_user_recommendations_avg,
 all_user_recommendations_random) = user_recommendations(
    user_movies_train_df,
    average_movie_train_df,
)

In [136]:
def avg_movie(movie_id, average_movie_train_int):
    """Look up the stored average rating for ``movie_id``.

    ``average_movie_train_int`` is any container supporting ``in`` and
    ``[]`` keyed by movie id (the notebook passes a Series indexed by movie
    id). Returns the stored value, or ``None`` when the movie is not present.
    """
    is_known = movie_id in average_movie_train_int
    return average_movie_train_int[movie_id] if is_known else None


def calc_mae(data_test, average_movie_train_int, default_rating=None):
    """Mean absolute error of the "per-movie average" rating predictor.

    Each test row is predicted as the stored average of its movie; movies
    that have no stored average fall back to ``default_rating``.

    Parameters
    ----------
    data_test : DataFrame
        Needs ``movie_id`` and ``rating`` columns. The columns
        ``predict_rating`` and ``difference`` are (over)written in place, as
        before. Pass an independent frame (e.g. a ``.copy()`` of a filtered
        slice) to avoid pandas' chained-assignment warning.
    average_movie_train_int : Series
        Average rating per movie, indexed by movie id.
    default_rating : number, optional
        Prediction for movies missing from ``average_movie_train_int``. When
        omitted it is derived from the averages that were passed in (their
        mean, truncated to an integer) instead of from a notebook-level
        global, so a subset run uses its own fallback rather than the
        all-users one.

    Returns
    -------
    float
        The MAE, or ``nan`` when ``data_test`` has no rows.

    Raises
    ------
    ValueError
        If ``default_rating`` is omitted and no averages are available to
        derive it from.
    """
    if default_rating is None:
        if len(average_movie_train_int) == 0:
            raise ValueError(
                "default_rating is required when average_movie_train_int is empty"
            )
        default_rating = int(average_movie_train_int.mean())

    # Vectorised lookup: movies without a stored average come back as NaN and
    # are replaced by the fallback.
    predicted = (
        data_test['movie_id']
        .map(average_movie_train_int)
        .fillna(default_rating)
        .astype(float)
    )
    # Series.abs() replaces pd.np.absolute; the pd.np alias was removed in pandas 2.0.
    difference = (data_test['rating'] - predicted).abs()

    # Keep the side effect callers may rely on: per-row results stored on the frame.
    data_test['predict_rating'] = predicted
    data_test['difference'] = difference

    if len(data_test) == 0:
        return float('nan')
    return difference.sum() / len(data_test)

In [137]:
# MAE of the per-movie average predictor on the full test split.
all_mae = calc_mae(
    data_test=data_test,
    average_movie_train_int=average_movie_train_int,
)
print(all_mae)

0.94175


In [138]:
def recall_precision(user_movies_test_df, user_recommendations_avg, user_recommendations_random):
    """Average per-user recall and precision of two recommendation strategies.

    For every row of ``user_movies_test_df`` (columns ``user_id`` and
    ``movies_list``) the user's test movies are compared with the lists stored
    under that user id in ``user_recommendations_avg`` and
    ``user_recommendations_random``:

    * recall    = shared movies / number of test movies
    * precision = shared movies / number of recommended movies

    Returns the means over users as
    ``(recall_avg, recall_random, precision_avg, precision_random)``.
    """
    recalls_avg, recalls_random = [], []
    precisions_avg, precisions_random = [], []

    for _, record in user_movies_test_df.iterrows():
        uid = record['user_id']
        held_out = record['movies_list']
        recs_avg = user_recommendations_avg[uid]
        recs_random = user_recommendations_random[uid]

        relevant = set(held_out)
        per_strategy = (
            (recs_avg, recalls_avg, precisions_avg),
            (recs_random, recalls_random, precisions_random),
        )
        for recs, recall_bucket, precision_bucket in per_strategy:
            # Movies that are both recommended and present in the user's test list.
            hits = len(relevant.intersection(recs))
            recall_bucket.append(hits / len(held_out))
            precision_bucket.append(hits / len(recs))

    return (
        np.mean(recalls_avg),
        np.mean(recalls_random),
        np.mean(precisions_avg),
        np.mean(precisions_random),
    )

In [143]:
# Recall / precision of both strategies over all test users.
(all_recall_for_avg,
 all_recall_for_random,
 all_precision_for_avg,
 all_precision_for_random) = recall_precision(
    user_movies_test_df,
    all_user_recommendations_avg,
    all_user_recommendations_random,
)
# One value per line: recall (avg, random), then precision (avg, random).
print(
    all_recall_for_avg,
    all_recall_for_random,
    all_precision_for_avg,
    all_precision_for_random,
    sep='\n',
)

0.02233534107339741
0.013040424753171236
0.04586056644880174
0.02821350762527233


In [144]:
# User table: pipe-separated, no header row.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols)

# Attach the user attributes to every rating row of both splits; the inner
# join keeps only ratings whose user_id appears in the user table.
users_ratings_train = pd.merge(data_train, data_user, on='user_id', how='inner')
users_ratings_test = pd.merge(data_test, data_user, on='user_id', how='inner')

In [145]:
# Ratings given by male users. Explicit copies are taken because calc_mae
# later writes the predict_rating / difference columns into the test frame it
# receives; an independent frame keeps those writes from depending on pandas'
# view-versus-copy behaviour for filtered slices.
data_train_males = users_ratings_train.loc[users_ratings_train['gender'] == 'M'].copy()
data_test_males = users_ratings_test.loc[users_ratings_test['gender'] == 'M'].copy()

# Per-movie mean rating among male users, and the same values truncated to
# whole numbers (astype(int) drops the fractional part).
average_movie_train_males = data_train_males.groupby('movie_id')['rating'].mean()
average_movie_train_males_int = average_movie_train_males.astype(int)
# Mean of the truncated per-movie ratings, itself truncated to an integer.
avg_males = average_movie_train_males_int.values.mean().astype(int)

# Movies ranked best-first by male mean rating; reuses the mean computed above
# instead of running the same groupby a second time.
average_movie_train_males_df = (
    average_movie_train_males
    .rename('rating_avg')
    .reset_index()
    .sort_values(by='rating_avg', ascending=False)
    .reset_index(drop=True)
)

# One row per male user with the list of movie ids rated in each split.
user_movies_train_males_df = (
    data_train_males.groupby('user_id')['movie_id']
    .apply(list)
    .rename('movies_list')
    .reset_index()
)
user_movies_test_males_df = (
    data_test_males.groupby('user_id')['movie_id']
    .apply(list)
    .rename('movies_list')
    .reset_index()
)



In [146]:
# Recommendation lists for male users, ranked by the male-only averages.
(males_recommendations_avg,
 males_recommendations_random) = user_recommendations(
    user_movies_train_males_df,
    average_movie_train_males_df,
)

In [147]:
# MAE of the male-only per-movie average predictor on male test ratings.
males_mae = calc_mae(
    data_test=data_test_males,
    average_movie_train_int=average_movie_train_males_int,
)
print(males_mae)

0.9192984769565504


In [148]:
# Recall / precision of both strategies over male test users.
(males_recall_for_avg,
 males_recall_for_random,
 males_precision_for_avg,
 males_precision_for_random) = recall_precision(
    user_movies_test_males_df,
    males_recommendations_avg,
    males_recommendations_random,
)
# One value per line: recall (avg, random), then precision (avg, random).
print(
    males_recall_for_avg,
    males_recall_for_random,
    males_precision_for_avg,
    males_precision_for_random,
    sep='\n',
)

0.00239061447012157
0.017470085029308986
0.005945121951219513
0.032164634146341464


In [150]:
# Ratings given by female users. Explicit copies are taken because calc_mae
# later writes the predict_rating / difference columns into the test frame it
# receives; an independent frame keeps those writes from depending on pandas'
# view-versus-copy behaviour for filtered slices.
data_train_females = users_ratings_train.loc[users_ratings_train['gender'] == 'F'].copy()
data_test_females = users_ratings_test.loc[users_ratings_test['gender'] == 'F'].copy()

# Per-movie mean rating among female users, and the same values truncated to
# whole numbers (astype(int) drops the fractional part).
average_movie_train_females = data_train_females.groupby('movie_id')['rating'].mean()
average_movie_train_females_int = average_movie_train_females.astype(int)
# Mean of the truncated per-movie ratings, itself truncated to an integer.
avg_females = average_movie_train_females_int.values.mean().astype(int)

# Movies ranked best-first by female mean rating; reuses the mean computed
# above instead of running the same groupby a second time.
average_movie_train_females_df = (
    average_movie_train_females
    .rename('rating_avg')
    .reset_index()
    .sort_values(by='rating_avg', ascending=False)
    .reset_index(drop=True)
)

# One row per female user with the list of movie ids rated in each split.
user_movies_train_females_df = (
    data_train_females.groupby('user_id')['movie_id']
    .apply(list)
    .rename('movies_list')
    .reset_index()
)
user_movies_test_females_df = (
    data_test_females.groupby('user_id')['movie_id']
    .apply(list)
    .rename('movies_list')
    .reset_index()
)



In [151]:
# Recommendation lists for female users, ranked by the female-only averages.
(females_recommendations_avg,
 females_recommendations_random) = user_recommendations(
    user_movies_train_females_df,
    average_movie_train_females_df,
)

In [152]:
# MAE of the female-only per-movie average predictor on female test ratings.
females_mae = calc_mae(
    data_test=data_test_females,
    average_movie_train_int=average_movie_train_females_int,
)
print(females_mae)

1.010552451893234


In [153]:
# Recall / precision of both strategies over female test users.
(females_recall_for_avg,
 females_recall_for_random,
 females_precision_for_avg,
 females_precision_for_random) = recall_precision(
    user_movies_test_females_df,
    females_recommendations_avg,
    females_recommendations_random,
)
# One value per line: recall (avg, random), then precision (avg, random).
print(
    females_recall_for_avg,
    females_recall_for_random,
    females_precision_for_avg,
    females_precision_for_random,
    sep='\n',
)

0.007551077332771078
0.014308444786193602
0.01908396946564886
0.022900763358778626
