## Notebook to convert raw training data to clean format

---

In [1]:
import os
import pandas as pd
import csv
from tqdm import tqdm
import json

In [2]:
BASE_DIR = '../data/train/'

#----------------------------------------------

wiki_data = BASE_DIR+'wiki_movie_data.csv'

CMU_data = BASE_DIR+'CMU_movie_metadata.tsv'

kaggle_data = BASE_DIR+'kaggle_movie_data.csv'

CMU_plot_summaries = BASE_DIR+'CMU_plot_summaries.txt'



In [3]:
class ReadData():
    def load_file(self,file_path,file_format):
        if file_format not in ['tsv','csv']:
            raise TypeError("File format not supported in class ReadData. Please use manual loading.")
            
        else:
            if file_format == 'csv':
                return pd.read_csv(file_path)
            else:
                return pd.read_csv(file_path,sep='\t')

        

In [4]:
def convert_txt_to_csv(txt_filename):
    data = []
    with open(txt_filename, 'r') as f:
        reader = csv.reader(f, dialect='excel-tab') 
        for row in tqdm(reader):
            data.append(row)
    movie_id = []
    plot = []

    # extract movie Ids and plot summaries
    for i in tqdm(data):
        movie_id.append(i[0])
        plot.append(i[1])

    # create dataframe
    movies = pd.DataFrame({'ID': movie_id, 'Plot': plot})
    movies.to_csv(BASE_DIR+'CMU_plot.csv',index=False)

In [5]:
data_reader = ReadData()

In [6]:
wikipedia_data = data_reader.load_file(file_path=wiki_data,file_format='csv')

In [7]:
wikipedia_data.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [8]:
map_wiki_categories={'drama, horror':'horror',
                     'horror':'horror',
                      'horror, comedy':'horror',
                     'romantic drama':'romance',
                     'romance drama':'romance',
                     'romance/drama':'romance',
                     'drama, romance':'romance',
                     'romantic comedy/drama':'romance',
                     'romance/drama':'romance',
                     'romance/comedy':'romance',
                     'romance':'romance',
                     'romantic comedy':'romance',
                     
                     'sci-fi, horror':'science fiction',
                     'action, sci-fi':'science fiction',
                     'drama, science fiction':'science fiction',
                     'comedy, science fiction':'science fiction',
                     'sci-fi comedy':'science fiction',
                     'sci-fi, comedy':'science fiction',
                     'science fiction':'science fiction',
                     'science-fiction':'science fiction',
                     'horror, science fiction':'science fiction',
                     'sci-fi':'science fiction',
                     'horror, sci-fi':'science fiction',
                     'science fiction, horror':'science fiction',
                     'action, science fiction ':'science fiction',
                     'science fiction, thriller':'science fiction',
                     'tokusatsu, action, sci-fi':'science fiction',
                     'science fiction comedy':'science fiction',
                     'drama, science fiction':'science fiction',
                     
                     'action comedy':'action',
                     'action drama':'action',
                     'action, drama':'action',
                     'short action/crime western':'action',
                     'action adventure':'action',
                     'action thriller':'action',
                     'action masala':'action',
                     'action':'action',
                     
                     'suspense':'suspense',
                     'mystery, thriller':'suspense',
                     'mystery, thriller':'suspense',
                     'mystery, horror':'suspense',
                     'action, thriller':'suspense',
                     'mystery':'suspense',
                     'thriller':'suspense',
                     'drama, mystery':'suspense',
                     'mystery, thriller':'suspense',
                     'drama, mystery':'suspense',
                     'drama, thriller':'suspense',
                     'psychological thriller':'suspense',
                     'horror thriller':'suspense',
                     'crime/thriller':'suspense',
                     
                     'crime':'others',
                     'adventure':'others',
                     'drama, adventure':'others',
                    }

In [9]:
wikipedia_data['Genre']=wikipedia_data['Genre'].map(map_wiki_categories)

In [10]:
wiki_selected_data=wikipedia_data[wikipedia_data['Genre'].isin(['romance','action','suspense','horror','science fiction','others'])]

In [11]:
wiki_selected_data['Genre'].unique()

array(['action', 'others', 'horror', 'romance', 'suspense',
       'science fiction'], dtype=object)

In [12]:
wiki_selected_data=wiki_selected_data[['Release Year','Title','Cast','Plot','Genre']].reset_index(drop=True)

In [13]:
wiki_selected_data.head()

Unnamed: 0,Release Year,Title,Cast,Plot,Genre
0,1906,From Leadville to Aspen: A Hold-Up in the Rockies,,The film features a train traveling through th...,action
1,1908,The Call of the Wild,Charles Inslee,A white girl (Florence Lawrence) rejects a pro...,others
2,1912,Dr. Jekyll and Mr. Hyde,James Cruze,White-haired Dr. Jekyll has secretly locked hi...,horror
3,1913,Dr. Jekyll and Mr. Hyde,King Baggot,Dr. Henry Jekyll (King Baggot) sends a note to...,horror
4,1913,The Evidence of the Film,"William Garwood, Marie Eline",The Evidence of the Film tells the story of a ...,others


In [14]:
convert_txt_to_csv(CMU_plot_summaries)

42303it [00:01, 30363.67it/s]
100%|██████████| 42303/42303 [00:00<00:00, 1043249.15it/s]


In [17]:
cmu_plot_data=data_reader.load_file(BASE_DIR + 'CMU_plot.csv','csv')

In [19]:
cmu_movie_data=data_reader.load_file(CMU_data,'tsv')[['ID', 'Title', 'Genre','Release Year']]

In [20]:
cmu_data = cmu_movie_data.merge(cmu_plot_data,on = 'ID',how='left')

In [21]:
cmu_data=cmu_data[cmu_data['Plot'].notnull()]

In [22]:
cmu_selected_data=cmu_data[(~cmu_data.Title.isin(wikipedia_data.Title))]

In [23]:
cmu_selected_data.head()

Unnamed: 0,ID,Title,Genre,Release Year,Plot
4,261236,A Woman in Flames,"{""/m/07s9rl0"": ""Drama""}",1983.0,"Eva, an upper class housewife, becomes frustra..."
12,6631279,Little city,"{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...",1997.0,"Adam, a San Francisco-based artist who works a..."
14,18296435,Aaah Belinda,"{""/m/01z4y"": ""Comedy""}",1986.0,"Serap, a young actress with a strong, lively p..."
15,11250635,The Mechanical Monsters,"{""/m/06n90"": ""Science Fiction"", ""/m/03k9fj"": ""...",,The story starts as one of the robots flies i...
18,32456683,Die Fahne von Kriwoj Rog,{},1967.0,"Otto Brosowski, a communist miner, writes to t..."


In [24]:
genres = [] 
# extract genres
for i in cmu_selected_data['Genre']: 
    genres.append(list(json.loads(i).values())) 

# add to 'movies' dataframe  
cmu_selected_data['Genre'] = genres
cmu_selected_data['Genre'] = cmu_selected_data['Genre'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
cmu_mapping={"['Action']":"action",
             "['Thriller']":"suspense",
            "['Science Fiction']":"science fiction",
            "['Science Fiction', 'Horror']":"science fiction",
            "['Science Fiction', 'Action']":"science fiction",
            "['Thriller', 'Science Fiction', 'Horror']":"science fiction",
            "['Science Fiction', 'Drama']":"science fiction",
            "['Science Fiction', 'Comedy']":"science fiction",
            "['Thriller', 'Science Fiction', 'Action']":"science fiction",
            "['Science Fiction', 'Adventure']":"science fiction"}

In [26]:
cmu_selected_data['Genre']=cmu_selected_data['Genre'].map(cmu_mapping)
cmu_selected_data=cmu_selected_data[cmu_selected_data['Genre'].isin(['romance','action','suspense','horror','science fiction','others'])].reset_index(drop=True)
cmu_selected_data=cmu_selected_data.drop('ID',axis=1)
cmu_selected_data['Cast']=None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [27]:
cmu_selected_data.head()

Unnamed: 0,Title,Genre,Release Year,Plot,Cast
0,Vandanam,action,1989,"Professor Kurian Fernandez , a convict escapes...",
1,Alien Express,science fiction,2005-08-13,A new bullet train is speeding passengers to L...,
2,The Weapon,suspense,,"Lizabeth Scott plays Elsa Jenner, widowed moth...",
3,The Hanged Man,suspense,,Seven irrevocably damaged social misfits come ...,
4,Robot Taekwon V,action,1976-07-24,"Dr. Kaff , an evil scientist bent on world dom...",


In [28]:
final_data=wiki_selected_data.append(cmu_selected_data)

In [30]:
final_data['Genre'].value_counts()

romance            1681
suspense           1641
action             1446
horror             1194
others             1133
science fiction     884
Name: Genre, dtype: int64

In [31]:
final_data.shape

(7979, 5)