In [1]:
import pandas as pd
from algorithms import *
import numpy as np

In [2]:
info = pd.read_csv('ml-100k/u.info', sep=' ', header=None)
info.columns = ['Count', 'Type']
info

Unnamed: 0,Count,Type
0,943,users
1,1682,items
2,100000,ratings


In [3]:
occupation = pd.read_csv('ml-100k/u.occupation', header=None)
occupation.columns = ['Occupation']
occupation

Unnamed: 0,Occupation
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


In [4]:
items = pd.read_csv('ml-100k/u.item' , header = None , sep = "|" , encoding='latin-1')

items.columns = ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              'Childrens' , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film_Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci_Fi' ,
              'Thriller' , 'War' , 'Western']

items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
data = pd.read_csv('ml-100k/u.data', header= None , sep = '\t')
data.columns = ['user id' , 'movie id' , 'rating' , 'timestamp']
data.head()

Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
user = pd.read_csv('ml-100k/u.user', header= None , sep = '|')
user.columns = ['user id' , 'age' , 'gender' , 'occupation' , 'zip code']
user.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
genre = pd.read_csv('ml-100k/u.genre', header= None , sep = '|' )
genre.columns = ['Genre' , 'genre_id']
genre

Unnamed: 0,Genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


### Item Attribute Matrix Generation

In [8]:
items_old_new = pd.DataFrame()
items_old_new['movie id'] = items['movie id']
items_old_new['timestamp'] = items['release date']
items_old_new.head()

Unnamed: 0,movie id,timestamp
0,1,01-Jan-1995
1,2,01-Jan-1995
2,3,01-Jan-1995
3,4,01-Jan-1995
4,5,01-Jan-1995


In [9]:
movie_attribute_dict = {}
attributes = items[5:]

for _, row in items.iterrows():
    movie_id = row['movie id']
    attribute_names = [col for col in attributes if row[str(col)] == 1]
    movie_attribute_dict[movie_id] = attribute_names


In [10]:
movie_attribute_dict[1].remove('movie id')
movie_attribute_dict

{1: ['Animation', 'Childrens', 'Comedy'],
 2: ['Action', 'Adventure', 'Thriller'],
 3: ['Thriller'],
 4: ['Action', 'Comedy', 'Drama'],
 5: ['Crime', 'Drama', 'Thriller'],
 6: ['Drama'],
 7: ['Drama', 'Sci_Fi'],
 8: ['Childrens', 'Comedy', 'Drama'],
 9: ['Drama'],
 10: ['Drama', 'War'],
 11: ['Crime', 'Thriller'],
 12: ['Crime', 'Thriller'],
 13: ['Comedy'],
 14: ['Drama', 'Romance'],
 15: ['Drama'],
 16: ['Comedy', 'Romance'],
 17: ['Action', 'Comedy', 'Crime', 'Horror', 'Thriller'],
 18: ['Drama'],
 19: ['Drama'],
 20: ['Drama', 'Romance'],
 21: ['Action', 'Adventure', 'Comedy', 'Musical', 'Thriller'],
 22: ['Action', 'Drama', 'War'],
 23: ['Drama', 'Thriller'],
 24: ['Action', 'Adventure', 'Crime'],
 25: ['Comedy'],
 26: ['Comedy'],
 27: ['Action'],
 28: ['Action', 'Drama', 'Thriller'],
 29: ['Action', 'Adventure', 'Comedy', 'Crime'],
 30: ['Drama'],
 31: ['Drama', 'Thriller', 'War'],
 32: ['Documentary'],
 33: ['Action', 'Romance', 'Thriller'],
 34: ['Comedy', 'Drama'],
 35: ['Adve

In [11]:
items_old_new['timestamp'] = pd.to_datetime(items_old_new['timestamp'], format='%d-%b-%Y')

items_old_new = items_old_new.sort_values(by='timestamp', ascending=True)

split_index = int(0.8 * len(items_old_new))

old_items = items_old_new.iloc[:split_index]['movie id'].values.tolist()
new_items = items_old_new.iloc[split_index:]['movie id'].values.tolist()

In [12]:
A_set = ('unknown' , 'Action' , 'Adventure' , 'Animation' ,
        'Childrens' , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
        'Film_Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci_Fi' ,
        'Thriller' , 'War' , 'Western')

In [13]:
IA_matrix = item_attribute_generation_matrix(old_items, new_items, movie_attribute_dict, A_set)
IA_matrix

{'old_items': {675: (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),
  1542: (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  617: (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  656: (0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0),
  1124: (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0),
  430: (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),
  1580: (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
  835: (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0),
  1397: (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  604: (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  493: (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
  615: (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
  1203: (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0),
  671: (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),
  1461: (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
len(IA_matrix['old_items']), len(IA_matrix['new_items'])

(1345, 337)

In [15]:
print('Total number of Items:', len(IA_matrix['old_items']) + len(IA_matrix['new_items']))

Total number of Items: 1682


In [16]:
ratings_based_similarity(data, 651, 700)

-0.041947253450875785

In [17]:
item1_ratings = data[data['movie id'] == 656]
item2_ratings = data[data['movie id'] == 453]

common_users = np.intersect1d(item1_ratings['user id'], item2_ratings['user id'])
if len(common_users) < 2:
    print(0)

item1_ratings = item1_ratings.loc[item1_ratings['user id'].isin(common_users)]
item2_ratings = item2_ratings.loc[item2_ratings['user id'].isin(common_users)]

item1_ratings = item1_ratings['rating'].to_numpy()
item2_ratings = item2_ratings['rating'].to_numpy()

item1_ratings_mean = np.mean(item1_ratings)
item2_ratings_mean = np.mean(item2_ratings)

numerator = np.sum((item1_ratings - item1_ratings_mean) * (item2_ratings - item1_ratings_mean))
denominator_x = np.sqrt(np.sum((item1_ratings - item1_ratings_mean) ** 2))
denominator_y = np.sqrt(np.sum((item2_ratings - item2_ratings_mean) ** 2))

denominator = denominator_x * denominator_y

if denominator != 0:
    pcc = numerator / denominator
    print(pcc)
else:
    print(0)

0.21566554640687682


In [18]:
old_item_attributes = np.array(IA_matrix['old_items'][657])
new_item_attributes = np.array(IA_matrix['new_items'][900])

old_item_attributes_mean = np.mean(old_item_attributes)
new_item_attributes_mean = np.mean(new_item_attributes)

denom1 = np.sum((old_item_attributes - old_item_attributes_mean) ** 2)
denom2 = np.sum((new_item_attributes - new_item_attributes_mean) ** 2)

denominator = np.sqrt(denom1) * np.sqrt(denom2)

numerator = np.sum((old_item_attributes - old_item_attributes_mean) * (new_item_attributes - new_item_attributes_mean))

if denominator != 0:
    similarity = numerator / denominator
    print(similarity)
else:
    print(0)

-0.08084520834544433


In [19]:
data.loc[(data['movie id'] == 242) & (data['user id'] == 196), 'rating']

0    3
Name: rating, dtype: int64

In [20]:
predicted_ratings = attribute_based_knn(IA_matrix, data, 2, list(user['user id'])[:5])

In [25]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [26]:
predicted_ratings

Unnamed: 0,user id,movie id,predicted rating
0,1,1420,3.610294
1,2,1420,3.709677
2,3,1420,2.796296
3,4,1420,4.333333
4,5,1420,2.874286
5,1,297,3.610294
6,2,297,3.709677
7,3,297,2.796296
8,4,297,4.333333
9,5,297,2.874286
