In [258]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [259]:
movies = pd.read_csv(Path("archive/movies.csv"))

In [260]:
print(movies.shape)

(58098, 3)


In [261]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [262]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [263]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [264]:
movies["year_of_release"] = 0

In [265]:
movies.head()

Unnamed: 0,movieId,title,genres,year_of_release
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0
4,5,Father of the Bride Part II (1995),Comedy,0


In [266]:
# seperating year of release from title
for idx in movies.index:
    title = movies.loc[idx,"title"]
    year_of_release = re.findall(r"\([0-9]\)", title)
    for year in year_of_release:
        if year_of_release is None:
            movies.loc[idx,"year_of_release"] = np.NaN
        else:
            movies.loc[idx,"year_of_release"] = year


In [267]:
movies.isnull().sum()

movieId            0
title              0
genres             0
year_of_release    0
dtype: int64

In [268]:
movies.loc[movies["year_of_release"].isnull()]

Unnamed: 0,movieId,title,genres,year_of_release


In [269]:
movies.loc[movies.title == "Trails (Veredas) (1978)"]

Unnamed: 0,movieId,title,genres,year_of_release
17343,87061,Trails (Veredas) (1978),(no genres listed),0


In [270]:
movies_new = pd.DataFrame(columns=["movieId","title","genre","year_of_release"])
movies_new.head()

Unnamed: 0,movieId,title,genre,year_of_release


In [271]:
for idx in movies.index:
    genre_list = movies.loc[idx,"genres"].split("|")
    for genre in genre_list:
        temp_df = pd.DataFrame(
            [[movies.loc[idx,"movieId"], movies.loc[idx,"title"],genre,movies.loc[idx,"year_of_release"]]],
            columns=["movieId","title","genre","year_of_release"]
            )
        movies_new = pd.concat([movies_new, temp_df], ignore_index= True)

In [272]:
movies_new.head()

Unnamed: 0,movieId,title,genre,year_of_release
0,1,Toy Story (1995),Adventure,0
1,1,Toy Story (1995),Animation,0
2,1,Toy Story (1995),Children,0
3,1,Toy Story (1995),Comedy,0
4,1,Toy Story (1995),Fantasy,0


In [273]:
movies_new.head()

Unnamed: 0,movieId,title,genre,year_of_release
0,1,Toy Story (1995),Adventure,0
1,1,Toy Story (1995),Animation,0
2,1,Toy Story (1995),Children,0
3,1,Toy Story (1995),Comedy,0
4,1,Toy Story (1995),Fantasy,0


In [274]:
genre_set = set(movies_new["genre"].values)

In [275]:
genres = pd.DataFrame(columns=["genreId", "genre"])
i = 1
for genre in genre_set:
    temp_df = pd.DataFrame([[i, genre]], columns=["genreId", "genre"])
    genres = pd.concat([genres, temp_df],ignore_index = True)
    i += 1

In [276]:
genres 

Unnamed: 0,genreId,genre
0,1,Adventure
1,2,Action
2,3,IMAX
3,4,War
4,5,Film-Noir
5,6,Western
6,7,Documentary
7,8,Animation
8,9,Drama
9,10,Horror


In [277]:
movie_genre_join_table = pd.DataFrame(columns=["movieId","genreId"])
movie_genre_join_table

Unnamed: 0,movieId,genreId


In [278]:
dict_genre = {'Adventure':1,'Action':2,	'IMAX':3, 'War':4,'Film-Noir':5,'Western':6,'Documentary':7,'Animation':8,'Drama':9, 'Horror':10,'Mystery':11,
'Comedy':12,'Children':13,'Musical':14,'Romance':15,'Fantasy':16,'Sci-Fi':17,'Thriller':18,'(no genres listed)':19,'Crime':20}

In [279]:
for idx in movies_new.index:
    tag_id = dict_genre[movies_new.loc[idx,'genre']]
    movie_id = movies_new.loc[idx,"movieId"]
    temp = pd.DataFrame([[movie_id, tag_id]], columns=["movieId","genreId"])
    movie_genre_join_table = pd.concat([movie_genre_join_table, temp])

In [280]:
movie_genre_join_table.head()

Unnamed: 0,movieId,genreId
0,1,1
0,1,8
0,1,13
0,1,12
0,1,16


In [281]:
movies_new.drop(columns=["genre"],inplace=True)

In [282]:
movies_new.head()

Unnamed: 0,movieId,title,year_of_release
0,1,Toy Story (1995),0
1,1,Toy Story (1995),0
2,1,Toy Story (1995),0
3,1,Toy Story (1995),0
4,1,Toy Story (1995),0


In [283]:
movies_new = movies_new.drop_duplicates()
movies_new.head()

Unnamed: 0,movieId,title,year_of_release
0,1,Toy Story (1995),0
5,2,Jumanji (1995),0
8,3,Grumpier Old Men (1995),0
10,4,Waiting to Exhale (1995),0
13,5,Father of the Bride Part II (1995),0


In [284]:
movies_new = movies_new.reset_index()

In [285]:
movies_new.drop(columns=["index"],inplace = True)

In [286]:
movies_new.head()

Unnamed: 0,movieId,title,year_of_release
0,1,Toy Story (1995),0
1,2,Jumanji (1995),0
2,3,Grumpier Old Men (1995),0
3,4,Waiting to Exhale (1995),0
4,5,Father of the Bride Part II (1995),0


In [287]:
movie_genre_join_table = movie_genre_join_table.reset_index()

In [288]:
movie_genre_join_table.drop(columns=["index"],inplace = True)

In [289]:
movie_genre_join_table.head()

Unnamed: 0,movieId,genreId
0,1,1
1,1,8
2,1,13
3,1,12
4,1,16


In [290]:
genres.head()

Unnamed: 0,genreId,genre
0,1,Adventure
1,2,Action
2,3,IMAX
3,4,War
4,5,Film-Noir


In [291]:
movies_new.head()

Unnamed: 0,movieId,title,year_of_release
0,1,Toy Story (1995),0
1,2,Jumanji (1995),0
2,3,Grumpier Old Men (1995),0
3,4,Waiting to Exhale (1995),0
4,5,Father of the Bride Part II (1995),0


In [292]:
tags = pd.read_csv(Path("archive/tags.csv"))

In [293]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


In [294]:
empty_genre_list = movie_genre_join_table.loc[movie_genre_join_table["genreId"] == 19]

In [295]:
empty_genre_list.head()

Unnamed: 0,movieId,genreId
34699,83773,19
34728,83829,19
35071,84768,19
35672,86493,19
35973,87061,19


for idx in empty_genre_list.index:
    movie_id = empty_genre_list.loc[idx,"movieId"]
    relevant_tag_list = tags.loc[tags["movieId"] == movie_id,"tag"]
    genre_list = np.empty(shape=(1,1))
    for tag in relevant_tag_list:
        mvi_list = tags.loc[tags["tag"] == tag, "movieId"]
        for mv in mvi_list:
            genre_list = np.append(genre_list, movie_genre_join_table.loc[movie_genre_join_table["movieId"] == mv, "genreId"].values)

    vals, counts = np.unique(genre_list, return_counts=True)
    mode_value = np.argwhere(counts == np.max(counts))

    # print(mode_value[0,0])v
    if mode_value[0,0] != 0 and mode_value[0,0] != 19:
        print(mode_value[0,0])
        movie_genre_join_table.loc[movie_genre_join_table["movieId"] == movie_id,"genreId"] = mode_value[0,0]