In [1]:
# process original IMDb Dataset

In [2]:
import pandas as pd
import numpy as np
from utilities import convert_str_to_bitvalue, convert_tconst_to_indexnum, convert_enum_to_number

In [3]:
# Load the tab-separated IMDb title table.
movies_path = "imdb-dataset/movies.tsv"
df = pd.read_csv(movies_path, sep='\t')

In [4]:
# Correct the runtimeMinutes / isAdult columns: they contain faulty values.
# A runtime is overwritten with the \N placeholder when its text form has a
# non-digit character and is longer than three characters.
runtime_text = df['runtimeMinutes'].astype(str)
has_non_digit = runtime_text.str.contains('[^0-9]+')
is_longer_than_three = runtime_text.str.len() > 3
df.loc[has_non_digit & is_longer_than_three, 'runtimeMinutes'] = '\\N'

# Any isAdult value outside this list is overwritten with the \N placeholder.
accepted_adult_values = [0, 1, '0', '1', '\\N']
unexpected_adult = ~df['isAdult'].isin(accepted_adult_values)
df.loc[unexpected_adult, 'isAdult'] = '\\N'

KeyError: 'runtimeMinutes'

In [5]:
# Show the loaded frame.
# NOTE(review): the saved output lists only tconst / averageRating / numVotes and no
# runtimeMinutes column, which matches the KeyError recorded for the cleanup cell —
# confirm which file was actually loaded into df.
df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,263
2,tt0000003,6.5,1807
3,tt0000004,5.6,178
4,tt0000005,6.2,2604
...,...,...,...
1301737,tt9916730,8.3,10
1301738,tt9916766,7.0,21
1301739,tt9916778,7.2,36
1301740,tt9916840,8.8,6


In [6]:
# Run every titleType value through the convert_enum_to_number helper from utilities.
df['titleType'] = df['titleType'].map(convert_enum_to_number)

In [7]:
# Run every tconst value through the convert_tconst_to_indexnum helper from utilities.
df['tconst'] = df['tconst'].map(convert_tconst_to_indexnum)

In [8]:
# Run every genres value through the convert_str_to_bitvalue helper from utilities.
# This calculation takes about 3 minutes.
df['genres'] = df['genres'].map(convert_str_to_bitvalue)

In [9]:
# Key the title table by its converted tconst so it can be joined on the index.
df = df.set_index('tconst')

In [10]:
# Load the tab-separated ratings table.
ratings_path = "imdb-dataset/ratings.tsv"
df_ratings = pd.read_csv(ratings_path, sep='\t')

In [11]:
# Apply the same tconst conversion as for the title table so both share one key format.
df_ratings['tconst'] = df_ratings['tconst'].map(convert_tconst_to_indexnum)

In [12]:
# Key the ratings table by its converted tconst for the index join.
df_ratings = df_ratings.set_index('tconst')

In [13]:
# Left-join the ratings onto the titles by index; titles without a rating row get NaN.
result_df = df.join(df_ratings, how='left')

In [14]:
# Rows left without a rating by the join get the \N placeholder.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
result_df['averageRating'] = result_df['averageRating'].replace(np.nan, "\\N")

In [15]:
# Rows left without a vote count by the join get the \N placeholder.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
result_df['numVotes'] = result_df['numVotes'].replace(np.nan, "\\N")

In [16]:
# Coerce the encoded genres to numbers; values that cannot be parsed become NaN,
# which is then replaced by the \N placeholder.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
result_df['genres'] = pd.to_numeric(result_df['genres'], errors='coerce')
result_df['genres'] = result_df['genres'].replace(np.nan, "\\N")

In [17]:
# Label the index 'id'.
result_df.index.name = 'id'

In [18]:
# Original column name -> snake_case column name used in the exported table.
mapping = dict(
    titleType="movie_type",
    primaryTitle="primary_title",
    originalTitle="original_title",
    isAdult="adult",
    startYear="start_year",
    endYear="end_year",
    runtimeMinutes="runtime_minutes",
    genres="movie_genre",
    averageRating="imdb_rating",
    numVotes="imdb_rating_count",
)

In [19]:
# Apply the column renames defined in `mapping`.
result_df = result_df.rename(mapping, axis='columns')

In [3]:
# Merge the description / url token dataset into this one.
# Start from the previously exported table, keyed by its id column.
result_df = pd.read_csv("processed_imdb_movies.csv").set_index('id')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Load the table holding the image url tokens and descriptions.
image_description_path = "imdb-dataset/movies_image_url_description.csv"
df_image_description = pd.read_csv(image_description_path)

In [6]:
# Remove the columns that are not carried over into the merged table.
unused_columns = ['poster_path', 'primaryTitle', 'imdbId']
df_image_description = df_image_description.drop(columns=unused_columns)

In [7]:
# Give the two kept columns the names used in the merged table.
description_renames = {'overview': 'description', 'url_token': 'image_url_token'}
df_image_description = df_image_description.rename(columns=description_renames)

In [8]:
# Key the description table by id so it joins on the same index as result_df.
df_image_description = df_image_description.set_index('id')

In [9]:
# Keep only the rows up to and including the cutoff index label
# (label-based .loc slices include the end label).
# A cutoff of 9700940 was used previously.
# NOTE(review): the reason for choosing 9018776 is not recorded here — confirm.
last_id = 9018776
cut_df = df_image_description.loc[:last_id]

In [10]:
# Left-join descriptions / image tokens onto the movie table by id; rows without a match get NaN.
end_df = result_df.join(cut_df, how='left')

In [11]:
# Fill missing description / image token values with the \N placeholder.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place form is not guaranteed to write through to
# end_df and stops working under pandas Copy-on-Write.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
end_df['description'] = end_df['description'].replace(np.nan, "\\N")
end_df['image_url_token'] = end_df['image_url_token'].replace(np.nan, "\\N")

In [12]:
# Show the merged frame before it is exported.
end_df

Unnamed: 0_level_0,movie_type,primary_title,original_title,adult,start_year,end_year,runtime_minutes,movie_genre,imdb_rating,imdb_rating_count,description,image_url_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,Carmencita,Carmencita,0,1894,\N,1,270532609.0,5.7,1892.0,\N,\N
2,0,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,268437505.0,5.9,252.0,\N,\N
3,0,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,35841.0,6.5,1685.0,\N,\N
4,0,Un bon bock,Un bon bock,0,1892,\N,12,268437505.0,5.7,165.0,\N,\N
5,0,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,268436481.0,6.2,2499.0,\N,\N
...,...,...,...,...,...,...,...,...,...,...,...,...
9916848,4,Episode #3.17,Episode #3.17,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916850,4,Episode #3.19,Episode #3.19,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916852,4,Episode #3.20,Episode #3.20,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916856,0,The Wind,The Wind,0,2015,\N,27,268435457.0,\N,\N,\N,\N


In [13]:
# Persist the merged frame; the id index is written as the first column.
end_csv_path = 'processed_imdb_movies_end.csv'
end_df.to_csv(end_csv_path)

In [32]:
# Show end_df again; this saved output comes from a later run (execution count 32).
end_df

Unnamed: 0_level_0,movie_type,primary_title,original_title,adult,start_year,end_year,runtime_minutes,movie_genre,imdb_rating,imdb_rating_count,description,image_url_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,Carmencita,Carmencita,0,1894,\N,1,270532609.0,5.7,1959.0,\N,\N
2,0,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,268437505.0,5.8,263.0,\N,\N
3,0,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,35841.0,6.5,1799.0,\N,\N
4,0,Un bon bock,Un bon bock,0,1892,\N,12,268437505.0,5.6,179.0,\N,\N
5,0,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,268436481.0,6.2,2596.0,\N,\N
...,...,...,...,...,...,...,...,...,...,...,...,...
9916848,4,Episode #3.17,Episode #3.17,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916850,4,Episode #3.19,Episode #3.19,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916852,4,Episode #3.20,Episode #3.20,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916856,0,The Wind,The Wind,0,2015,\N,27,268435457.0,\N,\N,\N,\N


id
1          270532609.0
2          268437505.0
3              35841.0
4          268437505.0
5          268436481.0
              ...     
9916848        17025.0
9916850        17025.0
9916852        17025.0
9916856    268435457.0
9916880         3329.0
Name: movie_genre, Length: 9700941, dtype: float64

In [44]:
# Sanity check: collect the rows whose movie_genre, once cast to str, is not the
# literal \N placeholder and is at most 2 characters long.
# NOTE(review): the cast is written back into end_df, so later cells see movie_genre
# as strings — confirm this is intended.
end_df['movie_genre'] = end_df['movie_genre'].astype(str)
genre_text = end_df['movie_genre']
is_placeholder = genre_text.str.contains(r'^\\N$')
is_short = genre_text.str.len() <= 2
df_filtered = end_df[~is_placeholder & is_short]


In [45]:
# Show the rows flagged by the movie_genre check; the saved output is an empty frame.
df_filtered

Unnamed: 0_level_0,movie_type,primary_title,original_title,adult,start_year,end_year,runtime_minutes,movie_genre,imdb_rating,imdb_rating_count,description,image_url_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [39]:
# Copy of end_df without the description / image token columns.
merged_only_columns = ['description', 'image_url_token']
new_df = end_df.drop(columns=merged_only_columns)

In [46]:
# Show end_df once more.
# NOTE(review): presumably run after movie_genre was cast to str (execution count 46) — confirm.
end_df

Unnamed: 0_level_0,movie_type,primary_title,original_title,adult,start_year,end_year,runtime_minutes,movie_genre,imdb_rating,imdb_rating_count,description,image_url_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,Carmencita,Carmencita,0,1894,\N,1,270532609.0,5.7,1959.0,\N,\N
2,0,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,268437505.0,5.8,263.0,\N,\N
3,0,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,35841.0,6.5,1799.0,\N,\N
4,0,Un bon bock,Un bon bock,0,1892,\N,12,268437505.0,5.6,179.0,\N,\N
5,0,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,268436481.0,6.2,2596.0,\N,\N
...,...,...,...,...,...,...,...,...,...,...,...,...
9916848,4,Episode #3.17,Episode #3.17,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916850,4,Episode #3.19,Episode #3.19,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916852,4,Episode #3.20,Episode #3.20,0,2010,\N,\N,17025.0,\N,\N,\N,\N
9916856,0,The Wind,The Wind,0,2015,\N,27,268435457.0,\N,\N,\N,\N


In [48]:
# mini_df is not assigned in any visible cell, so displaying it alone raises
# NameError on a fresh kernel. The saved output shows the first five rows of end_df,
# so it is rebuilt that way here.
# NOTE(review): presumably the original was a 5-row head of end_df — confirm.
mini_df = end_df.head()
mini_df

Unnamed: 0_level_0,movie_type,primary_title,original_title,adult,start_year,end_year,runtime_minutes,movie_genre,imdb_rating,imdb_rating_count,description,image_url_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,Carmencita,Carmencita,0,1894,\N,1,270532609.0,5.7,1959.0,\N,\N
2,0,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,268437505.0,5.8,263.0,\N,\N
3,0,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,35841.0,6.5,1799.0,\N,\N
4,0,Un bon bock,Un bon bock,0,1892,\N,12,268437505.0,5.6,179.0,\N,\N
5,0,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,268436481.0,6.2,2596.0,\N,\N


In [41]:
# Write the frame without the description / image token columns.
# NOTE(review): this is the same path that result_df is read back from in another
# cell — confirm the overwrite is intended.
processed_csv_path = 'processed_imdb_movies.csv'
new_df.to_csv(processed_csv_path)

In [4]:
# Show result_df; the saved output has no description / image_url_token columns.
result_df

Unnamed: 0_level_0,movie_type,primary_title,original_title,adult,start_year,end_year,runtime_minutes,movie_genre,imdb_rating,imdb_rating_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,Carmencita,Carmencita,0,1894,\N,1,270532609.0,5.7,1892.0
2,0,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,268437505.0,5.9,252.0
3,0,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,35841.0,6.5,1685.0
4,0,Un bon bock,Un bon bock,0,1892,\N,12,268437505.0,5.7,165.0
5,0,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,268436481.0,6.2,2499.0
...,...,...,...,...,...,...,...,...,...,...
9916848,4,Episode #3.17,Episode #3.17,0,2010,\N,\N,17025.0,\N,\N
9916850,4,Episode #3.19,Episode #3.19,0,2010,\N,\N,17025.0,\N,\N
9916852,4,Episode #3.20,Episode #3.20,0,2010,\N,\N,17025.0,\N,\N
9916856,0,The Wind,The Wind,0,2015,\N,27,268435457.0,\N,\N


In [None]:
# What was successful: using the old dataset and adding the image tokens and descriptions to it; about 5k movies have a URL.