In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from imdb import Cinemagoer

In [10]:
movies = pd.read_csv(Path('archive\movies.csv'))

In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
# seperating year of release from title
for idx in movies.index:
    title = movies.loc[idx,"title"]
    year_of_release = re.findall(r"\(([0-9]+-?\s?)\)", title)
    if year_of_release == []:
        movies.loc[idx,"year_of_release"] = np.NaN
    else:
        movies.loc[idx,"year_of_release"] = year_of_release[-1]

In [13]:
movies.isna().sum()

movieId              0
title                0
genres               0
year_of_release    327
dtype: int64

In [14]:
movies.loc[movies["year_of_release"].isna()]

Unnamed: 0,movieId,title,genres,year_of_release
10023,32930,Category 6: Day of Destruction,Action|Drama,
10613,40697,Babylon 5,Sci-Fi,
15719,79607,"Millions Game, The (Das Millionenspiel)",Action|Drama|Sci-Fi|Thriller,
17444,87442,"Bicycle, Spoon, Apple (Bicicleta, cullera, poma)",Documentary,
22651,107434,Diplomatic Immunity (2009– ),Comedy,
...,...,...,...,...
57504,192339,The Sign of Three,(no genres listed),
57674,192829,Parwaaz Hai Junoon,Adventure|Children|Romance,
57789,193149,¿Qué te juegas?,(no genres listed),
57903,193443,Wall,(no genres listed),


In [15]:
movies_new = pd.DataFrame(columns=["movieId","title","genres","year_of_release"])
movies_new.head()

Unnamed: 0,movieId,title,genres,year_of_release


In [16]:
for idx in movies.index:
    genre_list = movies.loc[idx,"genres"].split("|")
    for genre in genre_list:
        temp_df = pd.DataFrame(
            [[movies.loc[idx,"movieId"], movies.loc[idx,"title"],genre,movies.loc[idx,"year_of_release"]]],
            columns=["movieId","title","genres","year_of_release"]
            )
        movies_new = pd.concat([movies_new, temp_df], ignore_index= True)

In [17]:
movies_new.head()

Unnamed: 0,movieId,title,genres,year_of_release
0,1,Toy Story (1995),Adventure,1995
1,1,Toy Story (1995),Animation,1995
2,1,Toy Story (1995),Children,1995
3,1,Toy Story (1995),Comedy,1995
4,1,Toy Story (1995),Fantasy,1995


In [18]:
dict_genre = {'Adventure':0,'Animation':1,'Children':2, 'Comedy':3,'Fantasy':4,'Romance':5,'Drama':6,'Action':7,'Crime':8, 'Thriller':9,'Horror':10,
'Mystery':11,'Sci-Fi':12,'IMAX':13,'Documentary':14,'War':15,'Musical':16,'Western':17,'Film-Noir':18,'(no genres listed)':19}

In [19]:
movie_genre_join_table = pd.DataFrame(columns=["movieId","genreId"])
movie_genre_join_table

Unnamed: 0,movieId,genreId


In [20]:
for idx in movies_new.index:
    tag_id = dict_genre[movies_new.loc[idx,'genres']]
    movie_id = movies_new.loc[idx,"movieId"]
    temp = pd.DataFrame([[movie_id, tag_id]], columns=["movieId","genreId"])
    movie_genre_join_table = pd.concat([movie_genre_join_table, temp])

In [21]:
movie_genre_join_table.head()

Unnamed: 0,movieId,genreId
0,1,0
0,1,1
0,1,2
0,1,3
0,1,4


In [22]:
genre_set = set(movies_new["genres"].values)
genre_set

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [23]:
dict_genre[np.NAN] = dict_genre["(no genres listed)"]

In [24]:
genres = pd.DataFrame(columns=["genreId", "genre"])
for genre in genre_set:
    temp_df = pd.DataFrame([[dict_genre[genre], genre]], columns=["genreId", "genre"])
    genres = pd.concat([genres, temp_df],ignore_index = True)

In [25]:
genres = genres.sort_values(by ='genreId', ascending = 1)

In [26]:
genres = genres.reset_index()

In [27]:
genres.drop(columns=["index"],inplace = True)

In [28]:
genres.loc[genres["genre"] == "(no genres listed)","genre"] = np.NAN

In [29]:
genres

Unnamed: 0,genreId,genre
0,0,Adventure
1,1,Animation
2,2,Children
3,3,Comedy
4,4,Fantasy
5,5,Romance
6,6,Drama
7,7,Action
8,8,Crime
9,9,Thriller


In [30]:
movies_new.drop(columns = ["genres"],inplace = True)

In [31]:
movies_new = movies_new.drop_duplicates()
movies_new = movies_new.reset_index().drop(columns = ["index"])

In [32]:
movies_new.loc[movies_new["year_of_release"].isna()]

Unnamed: 0,movieId,title,year_of_release
10023,32930,Category 6: Day of Destruction,
10613,40697,Babylon 5,
15719,79607,"Millions Game, The (Das Millionenspiel)",
17444,87442,"Bicycle, Spoon, Apple (Bicicleta, cullera, poma)",
22651,107434,Diplomatic Immunity (2009– ),
...,...,...,...
57504,192339,The Sign of Three,
57674,192829,Parwaaz Hai Junoon,
57789,193149,¿Qué te juegas?,
57903,193443,Wall,


In [33]:
movies_new.loc[22651,"year_of_release"] = 2009

In [34]:
def munge_title(title):
    i = title.rfind(' (')
    if i != -1:
        title = title[:i]
    for suff_word in ['The', 'A', 'An']:
        suffix = ', {}'.format(suff_word)
        if title.endswith(suffix):
            title = suff_word + ' ' + title[:-len(suffix)]
    return title

In [35]:
for idx in movies_new.index:
    movies_new.loc[idx,"title"] = munge_title(movies_new.loc[idx,"title"])

In [36]:
movies_new.head()

Unnamed: 0,movieId,title,year_of_release
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [37]:
movies_new.loc[movies_new["year_of_release"].isna()]

Unnamed: 0,movieId,title,year_of_release
10023,32930,Category 6: Day of Destruction,
10613,40697,Babylon 5,
15719,79607,The Millions Game,
17444,87442,"Bicycle, Spoon, Apple",
24089,112406,Brazil: In the Shadow of the Stadiums,
...,...,...,...
57504,192339,The Sign of Three,
57674,192829,Parwaaz Hai Junoon,
57789,193149,¿Qué te juegas?,
57903,193443,Wall,


In [51]:
links = pd.read_csv(Path("archive/links.csv"))
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [41]:
## Get movie year from IMDB
imdb_year_1 = []
count=0
# create an instance of the Cinemagoer class
ia = Cinemagoer()

for x,m in zip(movies_new.loc[movies_new["year_of_release"].isna(),'movieId'],movies_new.loc[movies_new["year_of_release"].isna(),'title']):
    y = links[links['movieId']==x]['imdbId'].values[0]
     #################### DO NOT TOUCH THIS CODE #####################
    # get a movie
    movie = ia.get_movie(y)
    try:
        movies_new.loc[movies_new["movieId"] == x, 'year_of_release'] = movie['year']
    except:
        print("Id:",x,"Movie:",m,"ImdbId:",y)
        imdb_year_1.append([x,m,0])
    if (count%10==0):
        print(count,"------- Done ----------")
        print(m,movie['year'])
    count+=1
    
    #########

0 ------- Done ----------
Tatort: Im Schmerz geboren 2014
10 ------- Done ----------
La vendetta dei barbari 1960
20 ------- Done ----------
One Night Only 1984
30 ------- Done ----------
Vaastupurush 2002
40 ------- Done ----------
Chinese Boxes 1984
50 ------- Done ----------
Disaster Playground 2015
60 ------- Done ----------
Roger la Honte 1966
70 ------- Done ----------
Alone With People 2014
80 ------- Done ----------
Nocturnal Animals 2016
90 ------- Done ----------
Beauty and the Breast 2012
100 ------- Done ----------
Vergeef me 2001
110 ------- Done ----------
Pawn's Move 2011
120 ------- Done ----------
Citizen King 2004
130 ------- Done ----------
After Eden 2015
140 ------- Done ----------
Tibetana 1970
150 ------- Done ----------
Bad Dad Rehab 2016
160 ------- Done ----------
The Adventures of Cinderella's Daughter 2000
170 ------- Done ----------
Dolpo Tulku - Heimkehr in den Himalaya 2010
180 ------- Done ----------
Third Guest 2016
190 ------- Done ----------
Jedi Juni

In [42]:
movies_new.loc[movies_new["year_of_release"].isna()]

Unnamed: 0,movieId,title,year_of_release
50874,177265,Checkmate,
56064,188661,Untitled Star Trek Sequel,


In [43]:
movie_genre_join_table = movie_genre_join_table.reset_index().drop(columns=["index"])

In [44]:
movie_genre_join_table = movie_genre_join_table.drop_duplicates()

In [45]:
movie_genre_join_table.loc[movie_genre_join_table["genreId"] == 19]

Unnamed: 0,movieId,genreId
34699,83773,19
34728,83829,19
35071,84768,19
35672,86493,19
35973,87061,19
...,...,...
106048,193815,19
106074,193849,19
106080,193855,19
106094,193870,19


In [46]:
movies_new.to_csv(Path("preproc_datasets/movies.csv"),index=False)

In [47]:
movie_genre_join_table.to_csv(Path("preproc_datasets/movie_genres.csv"),index = False)

In [48]:
genres.to_csv(Path("preproc_datasets/genres.csv"))

In [52]:
movie_genre_join_table =pd.read_csv(Path("preproc_datasets/movie_genres.csv"))

In [53]:
for movieid in movie_genre_join_table.loc[movie_genre_join_table["genreId"] == 19, "movieId"].values:
    imdb_id = links.loc[links["movieId"] == movieid,"imdbId"].values[0]
    try:
        data = ia.get_movie(imdb_id)
        new_genres = data["genres"]
        movie_genre_join_table.drop(movie_genre_join_table.loc[movie_genre_join_table["movieId"] == movieid].index, inplace = True)
        counter = 20
        for _ in new_genres:
            if dict_genre.get(_) is None:
                dict_genre[_] = counter
                counter += 1
                print(_)
            new_row = pd.DataFrame([[movieid,dict_genre.get(_)]],columns=["movieId","genreId"])
            movie_genre_join_table = pd.concat([new_row,movie_genre_join_table])
        print(movieid)
    except:
        print("no genre avail",movieid)

83773
Short
Music
83829
84768
History
86493
87061
91246
92435
92641
94431
94657
95541
95750
96479
Biography
96651
113472
113545
114335
114587
114723
114725
114877
115004
115006
no genre avail 115419
115441
115529
115602
115893
Family
116046
116054
116096
116098
116100
116126
116237
116287
116347
116425
116479
116531
116704
Adult
116917
116923
116949
116955
117316
117722
117845
117847
117865
118292
118340
118448
118692
Sport
118728
118878
119878
120299
120612
121581
121590
121867
122001
122482
122557
122571
122573
122589
122605
122637
122675
122679
122687
122711
122811
122825
122888
122896
122944
122988
123010
123038
123044
123052
123061
123071
123095
123117
123135
123139
123250
123254
123274
123278
123290
123333
123339
123345
123371
123397
123415
123421
123439
123441
123518
123526
123559
123602
123607
123609
123617
123619
123625
123629
123693
123709
123715
123721
123729
123755
123775
123798
123800
123808
123810
123830
123838
123850
123860
123866
123870
123874
123902
123913
123925
12393

2022-11-27 14:13:48,223 CRITICAL [imdbpy] d:\IITPkd\Data Eng\Project\myenv\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt3416042/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 404: ''>},); kwds: {}
Traceback (most recent call last):
  File "d:\IITPkd\Data Eng\Project\myenv\lib\site-packages\imdb\parser\http\__init__.py", line 221, in retrieve_unicode
    response = uopener.open(url)
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 525, in open
    response = meth(req, response)
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 634, in http_response
    response = self.parent.error(
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 563, in error
    return self._call_chain(*args)
  File "C:\Users\nukeb\AppData\

no genre avail 135539
135561
135573
135575
135581
135583
135611
135641
135643
135645
135663
135667
135691
135721
135731
135733
135737
135771
135773
135775
135783
135799
135819
135821
135891
135895
135917
135921
135935
136056
136074
136082
136090
136102
136104
136108
136148
136206
136241
136275
136277
136283
136285
136311
136371
136375
136381
136395
136401
136411
136415
136421
136453
136481
136524
136526
136560
136566
136570
136590
136592
136606
136610
136630
136706
136710
136736
136742
136746
136772
136774
136824
136832
136836
136842
136844
136866
136872
136874
136876
136878
136880
136886
136966
136970
136992
136994
137004
137030
137034
137056
137082
137084
137104
137120
137148
137162
137166
137180
137188
137240
137242
137270
137276
137284
137292
137296
137303
137333
137335
137361
137369
137375
137419
137437
137451
137453
137482
137488
137496
137572
137574
137576
137579
137583
137585
137587
137589
137591
137624
137644
137648
137660
137662
137666
137678
137741
137771
137803
137829
13783

2022-11-27 14:33:33,675 CRITICAL [imdbpy] d:\IITPkd\Data Eng\Project\myenv\lib\site-packages\imdb\__init__.py:833: caught an exception retrieving or parsing "main" info set for mopID "0015224" (accessSystem: http)
Traceback (most recent call last):
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 565, in _get_chunk_left
    chunk_left = self._read_next_chunk_size()
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 532, in _read_next_chunk_size
    return int(line, 16)
ValueError: invalid literal for int() with base 16: b''

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 582, in _read_chunked
    chunk_left = self._get_chunk_left()
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 567, in _get_chunk_left
    raise IncompleteRea

no genre avail 141301
no genre avail 141305
no genre avail 141323
no genre avail 141343
no genre avail 141355
no genre avail 141377


2022-11-27 14:33:33,891 CRITICAL [imdbpy] d:\IITPkd\Data Eng\Project\myenv\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2364953/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': URLError(gaierror(11001, 'getaddrinfo failed'))},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 1348, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1282, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1328, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\Users\nukeb\AppData\Local\Programs\Python\Python310\lib\ht

no genre avail 141379
no genre avail 141381
no genre avail 141387
no genre avail 141389
no genre avail 141406
no genre avail 141436
no genre avail 141442
141444
141458
141466
141470
141472
141489
141507
141532
141558
141580
141598
141612
141638
141654
141664
141674
141682
141696
141745
141747
141765
141779
141858
141866
141872
141882
141894
141900
141936
141940
141948
141990
142000
142002
142034
142038
142042
142046
142098
142128
142142
142144
142214
142308
142378
142394
142400
142432
142434
142456
142460
142464
142490
142494
142496
142517
142544
142548
142556
142572
no genre avail 142610
142612
142614
142616
142620
142687
142714
142723
142748
142795
142801
142825
142837
142901
142911
142913
142937
142959
142977
142979
142987
142995
143007
143013
