In [1]:
# import required packages

import pandas as pd
import numpy as np
import nltk
import os.path
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.tokenize import word_tokenize

In [2]:
# read movies.csv into a dataframe, then preview its dimensions and first rows

movies_csv_path = 'movies.csv'
movies = pd.read_csv(movies_csv_path)
display(movies.shape, movies.head(5))

(62423, 3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# find out how many unique genres there are
#
# Each 'genres' value is a '|'-separated string: split it into one list per movie and flatten
# the lists with explode(). This replaces a loop that looked rows up by label with a positional
# counter (series_genres[i]), which raises KeyError as soon as the index has gaps, e.g. when this
# cell is re-run after rows have been dropped.

series_genres = movies['genres'].str.split('|')
all_genres = series_genres.explode().dropna().tolist()  # dropna: skip rows with a missing genre

# sort so the printed list is identical on every run (iteration order of a set of strings is not)
list_unique_genres = sorted(set(all_genres))
print(list_unique_genres)
print('number of genres =', len(list_unique_genres))

['Thriller', 'Animation', 'Fantasy', 'Crime', 'Horror', 'Action', 'Mystery', 'Adventure', 'Western', '(no genres listed)', 'Comedy', 'Musical', 'Sci-Fi', 'War', 'Drama', 'Documentary', 'Film-Noir', 'IMAX', 'Children', 'Romance']
number of genres = 20


In [4]:
# discard the movies whose genre field is the placeholder "(no genres listed)"

has_no_genre = movies['genres'] == '(no genres listed)'
movies = movies.drop(index=movies.loc[has_no_genre].index)
movies.shape

(57361, 3)

In [5]:
# separate multiple-genre movies in multiple rows with a single genre
#
# The split is applied to a derived frame through assign() instead of overwriting
# movies['genres']: overwriting turns the column into lists, so running this cell a second
# time would call .str.split on lists and silently replace every genre with NaN.

movies_single_genre = (
    movies
    .assign(genres=movies['genres'].str.split('|'))
    .explode('genres', ignore_index=True)
)
display(movies_single_genre.head(10))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
1,1,Toy Story (1995),Animation
2,1,Toy Story (1995),Children
3,1,Toy Story (1995),Comedy
4,1,Toy Story (1995),Fantasy
5,2,Jumanji (1995),Adventure
6,2,Jumanji (1995),Children
7,2,Jumanji (1995),Fantasy
8,3,Grumpier Old Men (1995),Comedy
9,3,Grumpier Old Men (1995),Romance


In [6]:
# gather, for every genre, the list of titles that belong to it (one row per genre)

grouped_movies = movies_single_genre.groupby("genres")
grouped_movies_genre = grouped_movies["title"].apply(list).reset_index()
display(grouped_movies_genre)

Unnamed: 0,genres,title
0,Action,"[Heat (1995), Sudden Death (1995), GoldenEye (..."
1,Adventure,"[Toy Story (1995), Jumanji (1995), Tom and Huc..."
2,Animation,"[Toy Story (1995), Balto (1995), Pocahontas (1..."
3,Children,"[Toy Story (1995), Jumanji (1995), Tom and Huc..."
4,Comedy,"[Toy Story (1995), Grumpier Old Men (1995), Wa..."
5,Crime,"[Heat (1995), Casino (1995), Money Train (1995..."
6,Documentary,"[Across the Sea of Time (1995), Nico Icon (199..."
7,Drama,"[Waiting to Exhale (1995), American President,..."
8,Fantasy,"[Toy Story (1995), Jumanji (1995), City of Los..."
9,Film-Noir,"[Devil in a Blue Dress (1995), Suture (1993), ..."


In [7]:
# create a dictionary with genres as keys and the titles of each genre as values, so that each key maps to a single
# string containing all of its corresponding titles

def str_per_genre(df, stop_words=None, vocabulary=None, tokenizer=None):
    """Build one string of filtered title words per genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Read positionally: column 0 holds the key (the genre) and column 1 holds an
        iterable of title strings for that key.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word list.
    vocabulary : collection of str, optional
        Words regarded as real words; a token is kept only when its lower-cased form is
        a member. Defaults to the NLTK ``words`` corpus.
    tokenizer : callable, optional
        Function turning a string into an iterable of tokens. Defaults to
        ``nltk.word_tokenize``.

    Returns
    -------
    dict
        Maps each key from column 0 to a space-joined string of the tokens that passed
        the filters, in their original order and original letter case.
    """
    # the NLTK resources are only loaded when the caller does not supply replacements
    if stop_words is None:
        stop_words = set(stopwords.words('english'))  # prepositions, auxiliary verbs, and the like
    if vocabulary is None:
        vocabulary = set(nltk.corpus.words.words())  # used to ignore nonsense words
    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    dic_titles_per_genre = dict()

    # iterate over whole rows instead of indexing cell-by-cell with iloc[i, j]
    for row in df.itertuples(index=False, name=None):
        genre, list_titles = row[0], row[1]
        kept_words = []
        for token in tokenizer(' '.join(list_titles)):
            lowered = token.lower()
            # keep purely alphabetic, known, non-stop words (this drops years such as "(1995)")
            if token.isalpha() and lowered in vocabulary and lowered not in stop_words:
                kept_words.append(token)
        dic_titles_per_genre[genre] = ' '.join(kept_words)

    return dic_titles_per_genre

# apply the word filter to the per-genre title lists; the resulting dict is written to text files further down
dict_titles_per_genre = str_per_genre(grouped_movies_genre)

In [11]:
# save the filtered titles of each genre as text files (one file per genre)

save_path = input("Please enter your directory:\n").strip()

# create the target directory when it is missing; an empty answer leaves the files
# in the current working directory, because os.path.join('', name) is just name
if save_path:
    os.makedirs(save_path, exist_ok=True)

for i, key in enumerate(dict_titles_per_genre):
    # files are numbered in the dictionary's iteration order: titles_0.txt, titles_1.txt, ...
    name_of_file = 'titles_' + str(i)
    completeName = os.path.join(save_path, name_of_file + ".txt")
    # the context manager closes the file even if write() raises; an explicit utf-8 encoding
    # keeps the output independent of the platform's default encoding
    with open(completeName, "w", encoding="utf-8") as file:
        file.write(dict_titles_per_genre[key])

Please enter your directory:
C:\Users\mahmo\PycharmProjects\pythonProject\MovieLens
