In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from ast import literal_eval

In [None]:
# Importing data
# Two comma-separated files and three tab-separated files are read from
# absolute paths on the local machine.
# NOTE(review): the paths are machine-specific; the files must exist there.
top1000 = pd.read_csv(r'C:/Users/gizli/Desktop/imdb_top_1000.csv')
movies = pd.read_csv(r'C:/Users/gizli/Desktop/movies_metadata.csv')
# read_csv with an explicit tab separator is the same parser call as read_table.
name = pd.read_csv('C:/Users/gizli/Desktop/name.tsv', sep='\t')
crew = pd.read_csv('C:/Users/gizli/Desktop/crew.tsv', sep='\t')
ratings = pd.read_csv('C:/Users/gizli/Desktop/ratings.tsv', sep='\t')

In [None]:
# Cleaning and preparing data
def _parse_genre_names(raw):
    """Return the list of genre names encoded in `raw`, or pd.NA if there are none.

    `raw` is the string repr of a list of dicts, each carrying a 'name' key.
    Anything that does not evaluate to a non-empty list is mapped to pd.NA so
    that a later dropna(subset=['genres']) removes the row.
    """
    parsed = literal_eval(raw)
    if not isinstance(parsed, list):
        return pd.NA
    names = [entry['name'] for entry in parsed]
    return names if names else pd.NA


def _is_zero_rating(value):
    """Return True only when `value` is numerically zero.

    The previous substring test ('0.0' in str(x)) also matched 10.0 and
    therefore discarded perfect scores.
    """
    try:
        return float(value) == 0.0
    except (TypeError, ValueError):
        return False


movies['genres'] = movies['genres'].fillna('[]').apply(_parse_genre_names)

# Unparseable or missing release dates become NaT and therefore a NaN year,
# which a later dropna(subset=['year']) removes before the cast to int.
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').dt.year

# A rating of exactly zero is treated as "no rating" and marked missing.
movies['vote_average'] = movies['vote_average'].apply(
    lambda rating: pd.NA if _is_zero_rating(rating) else rating
)


In [None]:
# Discard every row that is missing at least one of the fields used later on.
# Dropping on all of them at once with how='any' removes the same rows as
# dropping on each column in turn.
movies.dropna(
    subset=['genres', 'imdb_id', 'title', 'overview', 'year', 'runtime', 'vote_average'],
    how='any',
    inplace=True,
)

# Remove the metadata columns that are not used in the rest of the notebook.
movies.drop(
    columns=[
        'adult', 'belongs_to_collection', 'budget', 'homepage', 'id',
        'original_language', 'original_title', 'popularity', 'poster_path',
        'production_companies', 'production_countries', 'revenue',
        'spoken_languages', 'status', 'tagline', 'video', 'vote_count',
        'release_date',
    ],
    inplace=True,
)

# Flatten each list of genre names into a single comma-separated string.
movies['genres'] = movies['genres'].map(",".join)

# Year and runtime contain no missing values any more, so they can be integers.
movies['year'] = movies['year'].astype(int)
movies['runtime'] = movies['runtime'].astype(int)

In [None]:
# Updating data and sorting movies based on IMDB rating
# The ratings column is renamed to 'vote_average' on purpose: the merge then
# produces 'vote_average_x' (from movies) and 'vote_average_y' (from ratings),
# and only the ratings value is kept, under the name 'average_rating'.
ratings.rename(columns={'tconst': 'imdb_id', 'averageRating': 'vote_average'}, inplace=True)
movies = movies.merge(ratings, on='imdb_id', how="left")

movies.rename(columns={'vote_average_y': 'average_rating'}, inplace=True)
movies.drop(columns=['vote_average_x', 'numVotes'], inplace=True)
movies = movies.sort_values('average_rating', ascending=False)

# Placeholder overviews are treated as missing so the dropna below removes them.
movies['overview'] = movies['overview'].apply(lambda x: pd.NA if 'No overview' in str(x) else x)

# Drop rows with any missing value (this includes movies with no rating match)
# and renumber the rows. reset_index(drop=True) replaces the former
# reset_index() + drop('index', 1): passing the axis positionally to drop()
# raises TypeError on pandas 2.x.
movies = movies.dropna().reset_index(drop=True)

In [None]:
# Limiting data to the first 35000 movies (movies is sorted by average_rating,
# highest first). The slice is copied so the filters below work on an
# independent frame. top_movies must be created before it is filtered:
# previously the overview filter ran first, on a not-yet-defined (or stale)
# top_movies, and its effect was lost when top_movies was reassigned.
top_movies = movies.iloc[:35000].copy()

# Deletion of movies whose overview is shorter than 20 characters
top_movies = top_movies[top_movies['overview'].map(len) >= 20]

# Deletion of movies released before 1969
top_movies = top_movies[top_movies['year'] >= 1969]

top_movies.to_csv('top_movies.csv', index=False)

In [None]:
# Preparing director and writer data
# Align the crew table's key with the movies table, attach the crew columns to
# every selected movie (left join keeps movies without a crew match), then
# strip the movie-only columns so just imdb_id and the crew fields remain.
crew.rename(columns={'tconst': 'imdb_id'}, inplace=True)
crew = (
    top_movies
    .merge(crew, how="left", on='imdb_id')
    .drop(columns=['genres', 'overview', 'runtime', 'title', 'year', 'average_rating'])
)
crew.to_csv('crews.csv', index=False)

In [None]:
top_movies = pd.read_csv(r'C:/Users/gizli/Desktop/top_movies.csv')
crews = pd.read_csv(r'C:/Users/gizli/Desktop/crews.csv')

# Collect every distinct person id that appears in the comma-separated
# 'directors' and 'writers' columns.
crew_list = []
directors = crews['directors']
writers = crews['writers']
for id_column in (directors, writers):
    # Rows without a crew match are NaN after the CSV round trip; skip them
    # instead of failing on float.split.
    for joined_ids in id_column.dropna():
        # Split first, then add each id. The writers loop used to iterate the
        # raw string and so appended single characters.
        crew_list.extend(joined_ids.split(","))

# Deduplicate; sorting makes the resulting file reproducible across runs.
crew_list = sorted(set(crew_list))

In [None]:
# One row per collected person id, enriched with the matching row of the name
# table (left join: ids without a match are kept), minus the columns that are
# not written out.
person_data = {'nconst': crew_list}
persons = (
    pd.DataFrame(person_data)
    .merge(name, how="left", on='nconst')
    .drop(columns=['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'])
)
persons.to_csv('persons.csv', index=False)

In [None]:
# Extracting keywords from overview sentences and adding them to the data
top_movies = pd.read_csv(r'C:/Users/gizli/Desktop/top_movies.csv')
data = top_movies.dropna(subset=['overview'])
data = data.reset_index(drop=True)

cv = CountVectorizer(stop_words='english')
lda = LatentDirichletAllocation(n_components=3,random_state=101)

# keywords[i] is the keyword list for row i of `data`.
keywords = []

for overview in data['overview']:
    # The vectorizer and the topic model are refitted on each overview alone.
    try:
        dtm = cv.fit_transform([overview])
    except ValueError:
        # Raised when the overview yields an empty vocabulary (e.g. it contains
        # only stop words); record "no keywords" and keep the rows aligned.
        keywords.append([])
        continue
    lda_fit = lda.fit(dtm)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases. Looked up once per
    # overview instead of once per keyword.
    if hasattr(cv, 'get_feature_names_out'):
        vocabulary = cv.get_feature_names_out()
    else:
        vocabulary = cv.get_feature_names()

    keyword = []
    for topic_weights in lda_fit.components_:
        # Indices of the five highest-weighted terms of this topic.
        for index in topic_weights.argsort()[-5:]:
            keyword.append(str(vocabulary[index]))
    # Remove duplicates while keeping a deterministic order (a plain set would
    # order the strings differently from run to run).
    keywords.append(list(dict.fromkeys(keyword)))

In [None]:
# Persist the raw per-movie keyword lists as a single-column table.
keywords_data_new = dict(keywords=keywords)
keywords_data = pd.DataFrame(data=keywords_data_new)
keywords_data.to_csv('keywords_data.csv', index=False)

In [None]:
# Pair every keyword list with its movie id. The ids are taken from `data`,
# the frame the keywords were extracted from, so both sequences always have
# the same length even if rows without an overview were dropped.
keyword_data = {'imdb_id': data['imdb_id'], 'keywords': keywords}

# Compact form: one row per movie, keywords as one comma-separated string.
keywords_new = pd.DataFrame(keyword_data)
keywords_new['keywords'] = keywords_new['keywords'].apply(",".join)
keywords_new.to_csv('keywords_new.csv', index=False)

# Wide form: imdb_id followed by exactly ten columns keyword_1 .. keyword_10.
# A movie can have fewer or more than ten keywords (up to 3 topics x 5 terms),
# so the split result is reindexed to ten columns: extra keywords are cut off
# and missing columns are filled with NaN, instead of assuming that exactly
# eleven columns exist.
keyword_columns = (
    keywords_new['keywords']
    .str.split(',', expand=True)
    .reindex(columns=range(10))
)
keyword_columns.columns = [f'keyword_{position + 1}' for position in range(10)]
# NOTE: this rebinds `keywords` from the list of keyword lists to a DataFrame;
# keyword_data['keywords'] still refers to the original list.
keywords = pd.concat([keywords_new['imdb_id'], keyword_columns], axis=1)

In [None]:
# Map every imdb_id to its keyword list. If an id occurs more than once the
# last list wins, as with the previous element-by-element fill.
dicti = dict(zip(keyword_data['imdb_id'], keyword_data['keywords']))

# Long format: one [imdb_id, keyword] pair per keyword of every movie.
key_list = [
    [movie_id, word]
    for movie_id, words in dicti.items()
    for word in words
]

# Build the frame in one step; appending row by row with df.loc[len(df)] is
# quadratic and becomes very slow for hundreds of thousands of pairs.
df = pd.DataFrame(key_list, columns=['imdb_id', 'keyword'])
df.to_csv('keywords.csv', index=False)

In [None]:
# Separating genres from movies to another csv file
# Turn each comma-separated genre string into a list and store the result back
# on top_movies explicitly. The previous element-wise assignment through
# `top_movies.genres` only changed top_movies when pandas handed out a view,
# and the genre-list cell further down iterates top_movies.genres as lists.
# Values that are already lists (cell re-run) are left untouched.
top_movies['genres'] = top_movies['genres'].apply(
    lambda value: value.split(",") if isinstance(value, str) else value
)
genres = top_movies['genres']
imdb_id = top_movies['imdb_id']

# imdb_id -> list of genre names (for duplicate ids the last row wins).
genre_dict = dict(zip(imdb_id, genres))

In [None]:
# Long format: one [imdb_id, genre] pair per genre of every movie.
key_list = [
    [movie_id, genre_name]
    for movie_id, genre_names in genre_dict.items()
    for genre_name in genre_names
]

# Build the frame in one step instead of appending one row at a time with
# genre.loc[len(genre)], which is quadratic in the number of pairs.
genre = pd.DataFrame(key_list, columns=['imdb_id', 'genre'])
genre.to_csv('genres.csv', index=False)

In [None]:
# Collect the distinct genre names across all movies.
genres = top_movies.genres
genre_list = []
for movie_genres in genres:
    # The column may hold lists (if an earlier cell already split it) or the
    # original comma-separated strings; iterating a string directly would add
    # single characters, so split it first.
    if isinstance(movie_genres, str):
        movie_genres = movie_genres.split(",")
    genre_list.extend(movie_genres)

# Deduplicate; sorting makes the resulting file reproducible across runs.
genre_list = sorted(set(genre_list))
genres_data = {'genre': genre_list}
genres_df = pd.DataFrame(genres_data)
genres_df.to_csv('genre_data.csv', index=False)

In [None]:
# Creating director and writer csv file
def _split_ids(value):
    """Return the person ids contained in one 'directors'/'writers' cell.

    A comma-separated string is split into a list, an existing list is
    returned unchanged, and anything else (e.g. NaN for a movie without a crew
    match) yields an empty list instead of failing on `.split`.
    """
    if isinstance(value, str):
        return value.split(",")
    if isinstance(value, list):
        return value
    return []


# New Series are derived from the columns; `crews` itself is no longer
# modified element by element, so its columns keep their original strings.
directors = crews['directors'].apply(_split_ids)
writers = crews['writers'].apply(_split_ids)
imdb_id = crews['imdb_id']

# imdb_id -> list of person ids (for duplicate ids the last row wins).
director_dict = dict(zip(imdb_id, directors))
writer_dict = dict(zip(imdb_id, writers))

In [None]:
# Long format: one [imdb_id, director id] pair per director of every movie.
key_list = [
    [movie_id, person_id]
    for movie_id, person_ids in director_dict.items()
    for person_id in person_ids
]

# Build the frame in one step instead of appending one row at a time with
# director.loc[len(director)], which is quadratic in the number of pairs.
director = pd.DataFrame(key_list, columns=['imdb_id', 'director'])

# Any id whose text contains an upper-case 'N' is blanked out.
# NOTE(review): presumably this targets a '\N' "no value" marker in the crew
# data - confirm against the source files.
director['director'] = director['director'].apply(lambda x: None if 'N' in str(x) else x)
director.to_csv('director.csv', index=False)

In [None]:
# Long format: one [imdb_id, writer id] pair per writer of every movie.
key_list = [
    [movie_id, person_id]
    for movie_id, person_ids in writer_dict.items()
    for person_id in person_ids
]

# Build the frame in one step instead of appending one row at a time with
# writer.loc[len(writer)], which is quadratic in the number of pairs.
writer = pd.DataFrame(key_list, columns=['imdb_id', 'writer'])

# Any id whose text contains an upper-case 'N' is blanked out.
# NOTE(review): presumably this targets a '\N' "no value" marker in the crew
# data - confirm against the source files.
writer['writer'] = writer['writer'].apply(lambda x: None if 'N' in str(x) else x)
writer.to_csv('writer.csv', index=False)

In [None]:
# Creating keyword csv file
keywords_data = pd.read_csv(r'keywords.csv')

# Distinct keywords as a sorted list. Missing values are skipped, and values
# are normalised to str so that keywords parsed back as numbers still sort.
# A list is required here: passing a set to the DataFrame constructor raises
# "Set type is unordered".
keyword_list = sorted(set(keywords_data['keyword'].dropna().astype(str)))
keyword_data_df = pd.DataFrame(data=keyword_list, columns=['keywords'])
keyword_data_df.to_csv('keyword_data.csv', index=False)