## Notebook to convert the raw training data into a clean format

---

In [227]:
import os
import pandas as pd

In [228]:
# Relative path of the directory that holds the raw training files.
BASE_DIR = '../data/train/'

#----------------------------------------------
# Paths of the individual raw datasets. os.path.join is used instead of string
# concatenation so the paths stay valid whether or not BASE_DIR ends with a
# path separator.

wiki_data = os.path.join(BASE_DIR, 'wiki_movie_data.csv')

CMU_data = os.path.join(BASE_DIR, 'CMU_movie_metadata.tsv')

kaggle_data = os.path.join(BASE_DIR, 'kaggle_movie_data.csv')

CMU_plot_summaries = os.path.join(BASE_DIR, 'CMU_plot_summaries.txt')



In [229]:
class ReadData():
    """Loader for delimited text files ('csv' or 'tsv') built on ``pd.read_csv``."""

    def load_file(self,file_path,file_format):
        """Read ``file_path`` into a DataFrame according to ``file_format``.

        Parameters
        ----------
        file_path
            Forwarded unchanged to ``pd.read_csv``.
        file_format
            'csv' reads with the default separator, 'tsv' reads with a tab
            separator.

        Returns
        -------
        The DataFrame produced by ``pd.read_csv``.

        Raises
        ------
        TypeError
            If ``file_format`` is neither 'csv' nor 'tsv'.
        """
        if file_format == 'csv':
            return pd.read_csv(file_path)
        if file_format == 'tsv':
            return pd.read_csv(file_path,sep='\t')
        # Anything else is rejected up front rather than guessed at.
        raise TypeError("File format not supported in class ReadData. Please use manual loading.")

        

In [230]:
# Single ReadData instance reused for every dataset loaded in this notebook.
data_reader = ReadData()

In [231]:
# Load the Wikipedia movie CSV into a DataFrame.
wikipedia_data = data_reader.load_file(
    file_path=wiki_data,
    file_format='csv',
)

In [232]:
# Preview the first rows of the Wikipedia frame.
wikipedia_data.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [233]:
# Load the tab-separated CMU movie metadata into a DataFrame.
cmu_data = data_reader.load_file(
    file_path=CMU_data,
    file_format='tsv',
)

In [234]:
# Preview the first rows of the CMU metadata frame.
cmu_data.head()

Unnamed: 0,ID,Code,Title,Release Date,5,6,Language,Country,Genre
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [256]:
# Maps raw Wikipedia genre strings (keys) to the coarse target labels (values):
# 'horror', 'romance', 'science fiction', 'action', 'suspense' and 'others'.
# Every key appears exactly once; a repeated key in a dict literal is silently
# overwritten by the later entry, so duplicates only obscure the mapping.
map_wiki_categories={
                     # --- horror ---
                     'drama, horror':'horror',
                     'horror':'horror',
                     'horror, comedy':'horror',

                     # --- romance ---
                     'romantic drama':'romance',
                     'romance drama':'romance',
                     'romance/drama':'romance',
                     'drama, romance':'romance',
                     'romantic comedy/drama':'romance',
                     'romance/comedy':'romance',
                     'romance':'romance',
                     'romantic comedy':'romance',

                     # --- science fiction ---
                     'sci-fi, horror':'science fiction',
                     'action, sci-fi':'science fiction',
                     'drama, science fiction':'science fiction',
                     'comedy, science fiction':'science fiction',
                     'sci-fi comedy':'science fiction',
                     'sci-fi, comedy':'science fiction',
                     'science fiction':'science fiction',
                     'science-fiction':'science fiction',
                     'horror, science fiction':'science fiction',
                     'sci-fi':'science fiction',
                     'horror, sci-fi':'science fiction',
                     'science fiction, horror':'science fiction',
                     # NOTE(review): this key ends with a trailing space; it is
                     # kept verbatim on the assumption that it mirrors a raw
                     # value in the data - confirm against the Genre column.
                     'action, science fiction ':'science fiction',
                     'science fiction, thriller':'science fiction',
                     'tokusatsu, action, sci-fi':'science fiction',
                     'science fiction comedy':'science fiction',

                     # --- action ---
                     'action comedy':'action',
                     'action drama':'action',
                     'action, drama':'action',
                     'short action/crime western':'action',
                     'action adventure':'action',
                     'action thriller':'action',
                     'action masala':'action',
                     'action':'action',

                     # --- suspense ---
                     'suspense':'suspense',
                     'mystery, thriller':'suspense',
                     'mystery, horror':'suspense',
                     'action, thriller':'suspense',
                     'mystery':'suspense',
                     'thriller':'suspense',
                     'drama, mystery':'suspense',
                     'drama, thriller':'suspense',
                     'psychological thriller':'suspense',
                     'horror thriller':'suspense',
                     'crime/thriller':'suspense',

                     # --- others ---
                     'crime':'others',
                     'adventure':'others',
                     'drama, adventure':'others',
                    }

In [257]:
# Collapse the raw Wikipedia genre strings into the coarse target labels.
# Series.map turns every value that is not a key of the dict into NaN, and the
# label 'others' is not itself a key, so running the mapping a second time on
# an already-mapped column would silently turn every 'others' row into NaN.
# The guard makes this cell safe to re-run: the mapping is skipped when every
# non-null value is already one of the target labels.
target_genre_labels = set(map_wiki_categories.values())
if not wikipedia_data['Genre'].dropna().isin(target_genre_labels).all():
    wikipedia_data['Genre']=wikipedia_data['Genre'].map(map_wiki_categories)

In [259]:
# Keep only the rows whose genre is one of the six target labels; any other
# value (including NaN) is filtered out.
target_genres = ['romance','action','suspense','horror','science fiction','others']
has_target_genre = wikipedia_data['Genre'].isin(target_genres)
selected_data = wikipedia_data[has_target_genre]

In [260]:
# Sanity check: list the distinct genre labels left after filtering.
selected_data['Genre'].unique()

array(['action', 'others', 'horror', 'romance', 'suspense',
       'science fiction'], dtype=object)

In [265]:
# Load the Kaggle movie CSV into a DataFrame.
kaggle_movie_dataset = data_reader.load_file(
    file_path=kaggle_data,
    file_format='csv',
)

In [267]:
# Preview the first rows of the Kaggle frame.
kaggle_movie_dataset.head()

Unnamed: 0,id,text,genre
0,0,"eady dead, maybe even wishing he was. INT. 2ND...",thriller
1,2,"t, summa cum laude and all. And I'm about to l...",comedy
2,3,"up Come, I have a surprise.... She takes him ...",drama
3,4,ded by the two detectives. INT. JEFF'S APARTME...,thriller
4,5,"nd dismounts, just as the other children reach...",drama
