In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [53]:
train_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
data_train = pd.read_csv("ml-100k/u1.base", delimiter='\t', names=train_cols)
test_cols = ['user_id', 'movie_id', 'rating', 'timestamp', 'predict_rating', 'difference']
data_test = pd.read_csv("ml-100k/u1.test", delimiter='\t', names=test_cols)
average_movie_train = data_train.groupby('movie_id')['rating'].mean()
average_movie_train_int = average_movie_train.astype(int)
avg = average_movie_train_int.values.mean()
avg = avg.astype(int)

average_movie_train_df = pd.DataFrame({'rating_avg': data_train.groupby('movie_id')['rating'].mean()}).reset_index()
average_movie_train_df.sort_values(by='rating_avg', ascending=False, inplace=True)
average_movie_train_df = average_movie_train_df.reset_index(drop=True)
user_movies_train_df = pd.DataFrame({'movies_list': data_train.groupby('user_id')['movie_id'].apply(list)}).reset_index()
user_movies_test_df = pd.DataFrame({'movies_list': data_test.groupby('user_id')['movie_id'].apply(list)}).reset_index()

i_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
          'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
data_i = pd.read_csv('ml-100k/u.item', delimiter='|', names=i_cols, encoding='latin-1')
average_movie_train_df_new = data_i.merge(average_movie_train_df, on='movie_id', how='inner')


In [54]:
def avg_movie(movie_id, average_movie_train_int):
    if movie_id in average_movie_train_int:
        return average_movie_train_int[movie_id]
    else:
        return None


def calc_mae(data_test, average_movie_train_int, avg):
    for index, row in data_test.iterrows():
        predict_value = avg_movie(row['movie_id'], average_movie_train_int)
        if predict_value is None:
            predict_value = avg
        data_test.at[index, 'predict_rating'] = predict_value    
        real_value = row['rating']
        difference = pd.np.absolute(real_value - predict_value)
        data_test.at[index, 'difference'] = difference 
    numerator = data_test['difference'].sum()
    denominator = data_test.shape[0]
    mae = numerator/denominator    
    return mae

In [55]:
all_mae = calc_mae(data_test, average_movie_train_int, avg)
print("The average MAE of the predicted rating compared with the actual rating for the test group: ")
print(all_mae)

The average MAE of the predicted rating compared with the actual rating for the test group: 
0.94175


In [56]:
def user_recommendations(user_movies_train_df, average_movie_train_df):
    user_recommendations_avg = {}
    user_recommendations_random = {}
    user_recommendations_avg_title = {}
    user_recommendations_random_title = {}
    for index, row in user_movies_train_df.iterrows():
        id = row['user_id']
        movies = row['movies_list']
        rec_list_avg_title = []
        rec_list_avg = []
        i = 0
        # Average
        while len(rec_list_avg_title) < 20:
            must_pop_avg = average_movie_train_df.iloc[i]['movie_id'].astype(int) 
            must_pop_avg_title = average_movie_train_df.iloc[i]['movie_title'] 
            if must_pop_avg not in movies:
                rec_list_avg_title.append(must_pop_avg_title)
                rec_list_avg.append(must_pop_avg)
            i = i + 1
        user_recommendations_avg[id] = rec_list_avg
        user_recommendations_avg_title[id] = rec_list_avg_title
        # Random
        list_random = average_movie_train_df.sample(n=20)
        rec_list_random = list(list_random['movie_id'])
        rec_list_random_title = list(list_random['movie_title'])
        user_recommendations_random[id] = rec_list_random
        user_recommendations_random_title[id] = rec_list_random_title
    return user_recommendations_avg_title, user_recommendations_random_title, user_recommendations_avg, user_recommendations_random

In [57]:
all_user_recommendations_avg_title, all_user_recommendations_random_title, all_user_recommendations_avg, all_user_recommendations_random = user_recommendations(user_movies_train_df, average_movie_train_df_new)
print("Recommendations for users - average: ")
for i, rec in all_user_recommendations_avg_title.items():
    print("For the " + str(i) + " user, the recommended movies are:")
    print(rec)
    print("")
print("")
print("Recommendations for users - random: ")
for i, rec in all_user_recommendations_random_title.items():
    print("For the " + str(i) + " user, the recommended movies are:")
    print(rec)
    print("")

Recommendations for users - average: 
For the 1 user, the recommended movies are:
['Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Richard III (1995)', 'Usual Suspects, The (1995)', 'Postino, Il (1994)', 'From Dusk Till Dawn (1996)', 'Angels and Insects (1995)', 'Taxi Driver (1976)', 'Rumble in the Bronx (1995)', 'Bad Boys (1995)', 'Crimson Tide (1995)', 'Desperado (1995)', 'Mad Love (1995)', 'Strange Days (1995)', 'Dolores Claiborne (1994)', 'Ed Wood (1994)', 'I.Q. (1994)', 'Legends of the Fall (1994)', 'Natural Born Killers (1994)', 'Outbreak (1995)', 'Pulp Fiction (1994)']

For the 2 user, the recommended movies are:
['GoldenEye (1995)', 'Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', "Mr. Holland's Opus (1995)", 'French Twist (Gazon maudit) (1995)', 'From Dusk 

For the 757 user, the recommended movies are:
['Four Rooms (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', 'Postino, Il (1994)', "Mr. Holland's Opus (1995)", 'French Twist (Gazon maudit) (1995)', 'White Balloon, The (1995)', "Antonia's Line (1995)", 'Angels and Insects (1995)', 'Muppet Treasure Island (1996)', 'Taxi Driver (1976)', 'Birdcage, The (1996)', 'Brothers McMullen, The (1995)', 'Belle de jour (1967)', 'Crumb (1994)']

For the 758 user, the recommended movies are:
['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', 'Copycat (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', "Mr. Holland's Opus (1995)", 'French Twist (Gazon maudit) (1995)', 'From Dusk Till Dawn (1996)', 'White Balloon, The (1995)', "Antonia's Line (1995)", 'Muppet Treasure Island (1996)', 'Braveheart (1995)', 'Bad Boys (1995)', 'Belle de jou


For the 480 user, the recommended movies are:
['S.F.W. (1994)', 'Fierce Creatures (1997)', 'Desert Winds (1995)', 'Heavyweights (1994)', 'Mrs. Doubtfire (1993)', 'Cronos (1992)', 'Men of Means (1998)', 'Chairman of the Board (1998)', 'Deep Rising (1998)', 'Timecop (1994)', 'Reckless (1995)', 'Fallen (1998)', 'Devil in a Blue Dress (1995)', 'Alphaville (1965)', 'Seven Years in Tibet (1997)', 'Malice (1993)', 'Another Stakeout (1993)', 'Secret Adventures of Tom Thumb, The (1993)', 'Twisted (1996)', 'Search for One-eye Jimmy, The (1996)']

For the 481 user, the recommended movies are:
['Wings of Desire (1987)', 'Miami Rhapsody (1995)', 'Vermont Is For Lovers (1992)', 'Ruling Class, The (1972)', 'Metro (1997)', 'Funeral, The (1996)', 'Short Cuts (1993)', 'Getting Even with Dad (1994)', 'Father of the Bride (1950)', 'Double Team (1997)', 'Feeling Minnesota (1996)', 'Terminator, The (1984)', 'Boys (1996)', 'Mad City (1997)', 'Cabin Boy (1994)', 'Twilight (1998)', 'Manhattan (1979)', 'Little

In [58]:
def recall_precision(user_movies_test_df, user_recommendations_avg, user_recommendations_random):
    user_recall_avg = []
    user_recall_random = []
    user_precision_avg = []
    user_precision_random = []
    
    for index, row in user_movies_test_df.iterrows():
        id = row['user_id']
        true = row['movies_list']
        pred_avg = user_recommendations_avg[id]
        pred_random = user_recommendations_random[id]
        
        size_true = len(true)
        size_pred_avg = len(pred_avg)    
        size_pred_random = len(pred_random)
        
        common_avg = list(set(true).intersection(pred_avg))
        common_random = list(set(true).intersection(pred_random))
        
        number_common_avg = len(common_avg)
        number_common_random = len(common_random)
        
        recall_avg = number_common_avg / size_true
        user_recall_avg.append(recall_avg)
        recall_random = number_common_random / size_true
        user_recall_random.append(recall_random)
        
        precision_avg = number_common_avg / size_pred_avg
        user_precision_avg.append(precision_avg)
        precision_random = number_common_random / size_pred_random
        user_precision_random.append(precision_random)
    
    recall_for_avg = np.mean(user_recall_avg)
    recall_for_random = np.mean(user_recall_random)
    precision_for_avg = np.mean(user_precision_avg)
    precision_for_random = np.mean(user_precision_random)
    return recall_for_avg, recall_for_random, precision_for_avg, precision_for_random

In [59]:
all_recall_for_avg, all_recall_for_random, all_precision_for_avg, all_precision_for_random = recall_precision(user_movies_test_df, all_user_recommendations_avg, all_user_recommendations_random)
print(all_recall_for_avg)
print(all_recall_for_random)
print(all_precision_for_avg)
print(all_precision_for_random)

0.04307390214168824
0.010938641238346772
0.09324618736383442
0.025490196078431372


In [60]:
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('ml-100k/u.user', delimiter='|', names=user_cols)
users_ratings_train = data_train.merge(data_user, on='user_id', how='inner')
users_ratings_test = data_test.merge(data_user, on='user_id', how='inner')

In [61]:
data_train_males = users_ratings_train.loc[users_ratings_train['gender'] == 'M']
data_test_males = users_ratings_test.loc[users_ratings_test['gender'] == 'M']
average_movie_train_males = data_train_males.groupby('movie_id')['rating'].mean()
average_movie_train_males_int = average_movie_train_males.astype(int)
avg_males = average_movie_train_males_int.values.mean()
avg_males = avg_males.astype(int)

average_movie_train_males_df = pd.DataFrame({'rating_avg': data_train_males.groupby('movie_id')['rating'].mean()}).reset_index()
average_movie_train_males_df.sort_values(by='rating_avg', ascending=False, inplace=True)
average_movie_train_males_df = average_movie_train_males_df.reset_index(drop=True)
user_movies_train_males_df = pd.DataFrame({'movies_list': data_train_males.groupby('user_id')['movie_id'].apply(list)}).reset_index()
user_movies_test_males_df = pd.DataFrame({'movies_list': data_test_males.groupby('user_id')['movie_id'].apply(list)}).reset_index()


average_movie_train_males_df_new = data_i.merge(average_movie_train_males_df, on='movie_id', how='inner')

In [62]:
males_recommendations_avg_title, males_recommendations_random_title, males_recommendations_avg, males_recommendations_random = user_recommendations(user_movies_train_males_df, average_movie_train_males_df_new)
print("Recommendations for males - average: ")
for i, rec in males_recommendations_avg_title.items():
    print("For the " + str(i) + " user, the recommended movies are:")
    print(rec)
    print("")
print("")
print("Recommendations for males - random: ")
for i, rec in males_recommendations_random_title.items():
    print("For the " + str(i) + " user, the recommended movies are:")
    print(rec)
    print("")

Recommendations for males - average: 
For the 1 user, the recommended movies are:
['Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Richard III (1995)', 'Usual Suspects, The (1995)', 'Postino, Il (1994)', 'From Dusk Till Dawn (1996)', 'Angels and Insects (1995)', 'Taxi Driver (1976)', 'Rumble in the Bronx (1995)', 'Bad Boys (1995)', 'Crimson Tide (1995)', 'Desperado (1995)', 'Mad Love (1995)', 'Strange Days (1995)', 'Dolores Claiborne (1994)', 'Ed Wood (1994)', 'I.Q. (1994)', 'Legends of the Fall (1994)', 'Natural Born Killers (1994)', 'Outbreak (1995)', 'Pulp Fiction (1994)']

For the 3 user, the recommended movies are:
['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', 'Postino, Il (1994)', "Mr. Holland's



For the 33 user, the recommended movies are:
['Shine (1996)', "What's Love Got to Do with It (1993)", 'Absolute Power (1997)', 'Big Squeeze, The (1996)', 'Entertaining Angels: The Dorothy Day Story (1996)', 'Stranger in the House (1997)', 'Bogus (1996)', 'Dangerous Beauty (1998)', 'Mother Night (1996)', 'Burnt Offerings (1976)', 'True Lies (1994)', 'From Dusk Till Dawn (1996)', 'Glimmer Man, The (1996)', 'Talking About Sex (1994)', 'Fresh (1994)', 'Promesse, La (1996)', 'Dances with Wolves (1990)', '2 Days in the Valley (1996)', 'Casper (1995)', 'Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)']

For the 37 user, the recommended movies are:
['Absolute Power (1997)', 'Children of the Corn: The Gathering (1996)', 'Flubber (1997)', 'Letter From Death Row, A (1998)', 'Twisted (1996)', 'Supercop (1992)', 'Crumb (1994)', 'Richard III (1995)', 'Waiting for Guffman (1996)', 'Walk in the Clouds, A (1995)', 'Foxfire (1996)', 'Shallow Grave (1994)', 'Jungle Boo

['My Crazy Life (Mi vida loca) (1993)', 'Beauty and the Beast (1991)', 'Love in the Afternoon (1957)', 'First Kid (1996)', 'Gabbeh (1996)', 'Dingo (1992)', "I Can't Sleep (J'ai pas sommeil) (1994)", 'Exotica (1994)', "Romy and Michele's High School Reunion (1997)", 'Brother Minister: The Assassination of Malcolm X (1994)', 'King of the Hill (1993)', 'Crumb (1994)', 'Basic Instinct (1992)', 'It Happened One Night (1934)', 'Picture Bride (1995)', 'Year of the Horse (1997)', 'Just Cause (1995)', 'Death in Brunswick (1991)', 'Nowhere (1997)', 'Fish Called Wanda, A (1988)']

For the 737 user, the recommended movies are:
['That Old Feeling (1997)', 'From Dusk Till Dawn (1996)', 'Trigger Effect, The (1996)', 'To Live (Huozhe) (1994)', 'Michael (1996)', 'Father of the Bride Part II (1995)', 'Notorious (1946)', 'Frankie Starlight (1995)', 'Sense and Sensibility (1995)', 'Dumb & Dumber (1994)', 'Heaven & Earth (1993)', 'American President, The (1995)', 'Sleepless in Seattle (1993)', 'Beauty and 

In [63]:
males_mae = calc_mae(data_test_males, average_movie_train_males_int, avg_males)
print(males_mae)

0.9192984769565504


In [64]:
males_recall_for_avg, males_recall_for_random, males_precision_for_avg, males_precision_for_random = recall_precision(user_movies_test_males_df, males_recommendations_avg, males_recommendations_random)
print(males_recall_for_avg)
print(males_recall_for_random)
print(males_precision_for_avg)
print(males_precision_for_random)

0.04132639074500518
0.015828286030274006
0.0992378048780488
0.03323170731707317


In [65]:
data_train_females = users_ratings_train.loc[users_ratings_train['gender'] == 'F']
data_test_females = users_ratings_test.loc[users_ratings_test['gender'] == 'F']
average_movie_train_females = data_train_females.groupby('movie_id')['rating'].mean()
average_movie_train_females_int = average_movie_train_females.astype(int)
avg_females = average_movie_train_females_int.values.mean()
avg_females = avg_females.astype(int)

average_movie_train_females_df = pd.DataFrame({'rating_avg': data_train_females.groupby('movie_id')['rating'].mean()}).reset_index()
average_movie_train_females_df.sort_values(by='rating_avg', ascending=False, inplace=True)
average_movie_train_females_df = average_movie_train_females_df.reset_index(drop=True)
user_movies_train_females_df = pd.DataFrame({'movies_list': data_train_females.groupby('user_id')['movie_id'].apply(list)}).reset_index()
user_movies_test_females_df = pd.DataFrame({'movies_list': data_test_females.groupby('user_id')['movie_id'].apply(list)}).reset_index()

average_movie_train_females_df_new = data_i.merge(average_movie_train_females_df, on='movie_id', how='inner')

In [66]:
females_recommendations_avg_title, females_recommendations_random_title, females_recommendations_avg, females_recommendations_random = user_recommendations(user_movies_train_females_df, average_movie_train_females_df_new)
print("Recommendations for females - average: ")
for i, rec in females_recommendations_avg_title.items():
    print("For the " + str(i) + " user, the recommended movies are:")
    print(rec)
    print("")
print("")
print("Recommendations for females - random: ")
for i, rec in females_recommendations_random_title.items():
    print("For the " + str(i) + " user, the recommended movies are:")
    print(rec)
    print("")

Recommendations for females - average: 
For the 2 user, the recommended movies are:
['GoldenEye (1995)', 'Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', "Mr. Holland's Opus (1995)", 'French Twist (Gazon maudit) (1995)', 'From Dusk Till Dawn (1996)', 'White Balloon, The (1995)', "Antonia's Line (1995)", 'Angels and Insects (1995)', 'Muppet Treasure Island (1996)', 'Braveheart (1995)', 'Taxi Driver (1976)']

For the 5 user, the recommended movies are:
['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', 'Po

In [67]:
females_mae = calc_mae(data_test_females, average_movie_train_females_int, avg_females)
print(females_mae)

1.010552451893234


In [68]:
females_recall_for_avg, females_recall_for_random, females_precision_for_avg, females_precision_for_random = recall_precision(user_movies_test_females_df, females_recommendations_avg, females_recommendations_random)
print(females_recall_for_avg)
print(females_recall_for_random)
print(females_precision_for_avg)
print(females_precision_for_random)

0.04744935052422294
0.015958850204801443
0.07824427480916031
0.02748091603053435
