In [1]:
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip

--2023-06-04 00:42:30--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2023-06-04 00:42:31 (9.09 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [2]:
!unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [1]:
import pandas as pd

In [2]:
# Load the movie catalogue. The file has no header row and uses "::" between
# fields; a multi-character separator is handled by the Python parsing engine.
movies_df = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    engine="python",
    encoding="latin-1",
    header=None,
    names=["movieId", "title", "genres"],
)

In [3]:
# Preview the parsed movie table (columns: movieId, title, genres).
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
# Load the ratings log: one row per (user, movie) rating with its timestamp.
# Same "::"-delimited, header-less layout as the other .dat files.
ratings_df = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    encoding="latin-1",
    header=None,
    names=["UserID", "movieId", "rating", "timestamp"],
)

In [5]:
# Preview the parsed ratings table (columns: UserID, movieId, rating, timestamp).
ratings_df

Unnamed: 0,UserID,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# Load the user table from the "::"-delimited, header-less users.dat file.
# Zip-code is pinned to str: values such as 02460 carry leading zeros that would
# be lost if the column were ever inferred as an integer.
user_df = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    dtype={"Zip-code": str},
    encoding="latin-1",
    engine="python",
)

In [7]:
# Preview the parsed user table (columns: UserID, Gender, Age, Occupation, Zip-code).
user_df

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [8]:
# Attach each rater's Gender/Age/Occupation/Zip-code to every rating row
# (inner join on UserID). validate="many_to_one" makes pandas raise a MergeError
# if UserID is duplicated in user_df, instead of silently multiplying rating rows.
rating_user_df = ratings_df.merge(
    user_df,
    on="UserID",
    how="inner",
    validate="many_to_one",
)

In [9]:
# Preview the ratings table after the user columns have been joined on.
rating_user_df

Unnamed: 0,UserID,movieId,rating,timestamp,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067
...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106
1000205,6040,1094,5,956704887,M,25,6,11106
1000206,6040,562,5,956704746,M,25,6,11106
1000207,6040,1096,4,956715648,M,25,6,11106


In [10]:
# Add title and genres to every rating row (inner join on movieId).
# validate="many_to_one" makes pandas raise a MergeError if movieId is duplicated
# in movies_df, instead of silently multiplying rating rows.
user_rating_movie_df = rating_user_df.merge(
    movies_df,
    on="movieId",
    how="inner",
    validate="many_to_one",
)

In [11]:
# Preview the fully joined table: rating, user, and movie columns in one frame.
user_rating_movie_df

Unnamed: 0,UserID,movieId,rating,timestamp,Gender,Age,Occupation,Zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


In [13]:
# Maximum number of "|" splits applied to each genres string when it is unrolled
# into columns (passed as n= to str.rsplit in the next step).
max_genres = 10

In [14]:
# Unroll the pipe-delimited genres string into one column per genre; rows with
# fewer genres are padded with missing values. At most max_genres splits are
# made, working from the right-hand end of the string.
unrolled_genres = user_rating_movie_df["genres"].str.rsplit(
    pat="|",
    n=max_genres,
    expand=True,
)

In [15]:
# Preview the unrolled genres; at this point the columns are labelled 0, 1, 2, ...
unrolled_genres

Unnamed: 0,0,1,2,3,4,5
0,Drama,,,,,
1,Drama,,,,,
2,Drama,,,,,
3,Drama,,,,,
4,Drama,,,,,
...,...,...,...,...,...,...
1000204,Documentary,,,,,
1000205,Drama,,,,,
1000206,Drama,,,,,
1000207,Comedy,Drama,Western,,,


In [16]:
# Number of genre columns produced by the split (the frame's column count).
len_df = unrolled_genres.shape[1]

In [17]:
# Copy every integer-labelled column into a new column named genres_<i>.
# The integer-labelled originals are kept here and removed in a later step.
unrolled_genres[[f"genres_{i}" for i in range(len_df)]] = unrolled_genres[
    list(range(len_df))
]

In [18]:
# Remove the integer-labelled columns, leaving only their genres_<i> copies.
# The result is rebound instead of mutated with inplace=True, and
# errors="ignore" lets the cell be re-run after the columns are already gone
# (the original raised KeyError on a second run).
unrolled_genres = unrolled_genres.drop(
    columns=list(range(len_df)),
    errors="ignore",
)

In [19]:
# Preview the genre columns after relabelling to genres_0, genres_1, ...
unrolled_genres

Unnamed: 0,genres_0,genres_1,genres_2,genres_3,genres_4,genres_5
0,Drama,,,,,
1,Drama,,,,,
2,Drama,,,,,
3,Drama,,,,,
4,Drama,,,,,
...,...,...,...,...,...,...
1000204,Documentary,,,,,
1000205,Drama,,,,,
1000206,Drama,,,,,
1000207,Comedy,Drama,Western,,,


In [20]:
# Put the per-genre columns alongside the joined rating table by matching row
# index on both sides, then discard the pipe-delimited genres string they replace.
user_rating_movie_genre_df = (
    user_rating_movie_df
    .merge(unrolled_genres, left_index=True, right_index=True)
    .drop(columns=["genres"])
)

In [21]:
# Preview the final table: rating, user, and title columns plus genres_<i> columns.
user_rating_movie_genre_df

Unnamed: 0,UserID,movieId,rating,timestamp,Gender,Age,Occupation,Zip-code,title,genres_0,genres_1,genres_2,genres_3,genres_4,genres_5
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,,,,,
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,,,,,
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,,,,,
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,,,,,
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,,,,,
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,,,,,
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,,,,,
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy,Drama,Western,,,


In [None]:
# Optional CSV export of the merged data, currently disabled.
# NOTE(review): the file name says "genre_merged", but the frame written here is
# user_rating_movie_df (which still has the single pipe-delimited genres column),
# not user_rating_movie_genre_df — confirm which frame is intended before enabling.
# user_rating_movie_df.to_csv("user_rating_movie_genre_merged.csv")