![](images/dataset-cover.jpeg)

# MovieLens20M dataset

In [1]:
import pandas as pd
import numpy as np
import pickle

from collections import Counter

## 1.1 Read relevant files from the data folder

In [2]:
# Load the raw MovieLens tables. index_col=False stops pandas from using the
# first CSV column as the row index.
csv_options = {'index_col': False}

df_genome_scores = pd.read_csv('data/genome_scores.csv', **csv_options)
df_genome_tags = pd.read_csv('data/genome_tags.csv', **csv_options)
df_link = pd.read_csv('data/link.csv', **csv_options)
df_movie = pd.read_csv('data/movie.csv', **csv_options)
df_tags = pd.read_csv('data/tag.csv', **csv_options)

# Ratings: the timestamp column is dropped right after loading.
df_rating = pd.read_csv('data/rating.csv', **csv_options)
df_rating = df_rating.drop(columns=['timestamp'])

## 1.2 Preprocessing the rating.csv dataset

The following have been implemented as part of preliminary pre-processing:
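* User IDs run from 1 to 138493 without any gaps. They have been shifted to start from 0, so they now run from 0 to 138492.
* There are fewer unique movieIds than the maximum movieId value (the IDs have gaps), so the movies have been re-indexed to a contiguous range starting at 0.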

The re-indexed movie IDs are stored in a new `movieIndex` column. The resulting DataFrame is saved to `data/edited_rating.csv`.

In [3]:
# Shift user ids so that they start at 0. Subtracting the current minimum
# (expected to be 1 in the raw file) instead of a hard-coded 1 keeps this cell
# safe to re-run: a second execution subtracts 0 instead of pushing ids to -1.
df_rating['userId'] -= df_rating['userId'].min()

# movieId values have gaps, so assign every distinct movie a dense 0-based index.
unique_movie_ids = set(df_rating.movieId.values)
movieIdToIndex = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}
count = len(movieIdToIndex)  # number of distinct movies

# Series.map performs the dictionary lookup on the whole column at once; the
# row-wise DataFrame.apply(axis=1) it replaces built one Series per rating row.
df_rating['movieIndex'] = df_rating['movieId'].map(movieIdToIndex)

# to_csv is called with its defaults, so the row index is written as the
# first CSV column.
df_rating.to_csv("data/edited_rating.csv")

In [4]:
# Inspect the re-indexed ratings; pandas abbreviates the frame to its first and last rows.
df_rating

Unnamed: 0,userId,movieId,rating,movieIndex
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50
...,...,...,...,...
20000258,138492,68954,4.5,13821
20000259,138492,69526,4.5,13929
20000260,138492,69644,3.0,13942
20000261,138492,70286,5.0,14060


## 1.3 A reduced (shrunken) pre-processed dataset

In [5]:
# Reload the pre-processed ratings; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv("data/edited_rating.csv", index_col=0)

In [6]:
# Display the reloaded ratings table.
df

Unnamed: 0,userId,movieId,rating,movieIndex
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50
...,...,...,...,...
20000258,138492,68954,4.5,13821
20000259,138492,69526,4.5,13929
20000260,138492,69644,3.0,13942
20000261,138492,70286,5.0,14060


In [24]:
# Subset sizes: keep the n users and the m movies with the most ratings.
n = 10000
m = 2000

# Number of ratings per user id and per movie id in the full table.
movie_ids_count = Counter(df['movieId'])
user_ids_count = Counter(df['userId'])

# Largest user id and largest movie index present in the full table.
N = df['userId'].max()
M = df['movieIndex'].max()

In [25]:
# Ids of the n users and the m movies with the highest rating counts,
# ordered from most to fewest ratings.
top_user_ids = [user_id for user_id, _ in user_ids_count.most_common(n)]
top_movie_ids = [movie_id for movie_id, _ in movie_ids_count.most_common(m)]

# Keep only the ratings whose user AND movie both belong to the top lists.
keep_user = df.userId.isin(top_user_ids)
keep_movie = df.movieId.isin(top_movie_ids)
df_shrinked = df[keep_user & keep_movie].copy()


The userIds and movieIds need to be re-indexed, since many users and movies have been removed and the remaining IDs are no longer contiguous.

In [26]:
# Map every retained user id to its position in top_user_ids.
new_user_id_map = {old: new for new, old in enumerate(top_user_ids)}
i = len(top_user_ids)
print("i:", i)

# Map every retained movie id to its position in top_movie_ids.
new_movie_id_map = {old: new for new, old in enumerate(top_movie_ids)}
j = len(top_movie_ids)
print("j:", j)

i: 10000
j: 2000


In [27]:
# Replace the original ids with the compact ids from the two lookup tables.
# Series.map does the dictionary lookup column-wise, replacing a row-wise
# DataFrame.apply(axis=1) that built one Series per rating row.
# The source ids are read from `df` (df_shrinked is a filtered copy of df and
# shares its row index), so re-running this cell cannot remap ids that were
# already remapped.
df_shrinked['userId'] = df.loc[df_shrinked.index, 'userId'].map(new_user_id_map)
df_shrinked['movieId'] = df.loc[df_shrinked.index, 'movieId'].map(new_movie_id_map)
# NOTE(review): the movieIndex column is not updated here and still holds the
# index computed for the full data set - confirm whether it should be dropped.

In [28]:
# Sanity check: largest re-indexed user id and movie id.
(df_shrinked['userId'].max(), df_shrinked['movieId'].max())

(9999, 1999)

In [29]:
# Save the reduced ratings table; with to_csv defaults the row index is written as the first column.
df_shrinked.to_csv("data/shrinked_rating.csv")