In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("wiki_movie_plots_deduped-original_dataset.csv")

In [4]:
# Character length of every plot summary.  The vectorised `.str.len()`
# accessor replaces the per-row lambda and yields NaN (instead of raising
# TypeError) should a plot ever be missing.
plot_len = df["Plot"].str.len()
plot_len.describe()

count    34886.000000
mean      2165.034541
std       1817.325247
min         15.000000
25%        716.000000
50%       1656.000000
75%       3376.000000
max      36773.000000
Name: Plot, dtype: float64

In [5]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [6]:
c = df.groupby("Genre").count()
c.sort_values(by="Title", axis=0, ascending=False).head(20)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
unknown,6083,6083,6083,6083,5400,6083,6083
drama,5964,5964,5964,5964,5841,5964,5964
comedy,4379,4379,4379,4379,4347,4379,4379
horror,1167,1167,1167,1167,1124,1167,1167
action,1098,1098,1098,1098,1087,1098,1098
thriller,966,966,966,966,955,966,966
romance,923,923,923,923,918,923,923
western,865,865,865,865,864,865,865
crime,568,568,568,568,565,568,568
adventure,526,526,526,526,524,526,526


In [9]:
df.groupby("Origin/Ethnicity").count().sort_values(by="Title", axis=0, ascending=False)

Unnamed: 0_level_0,Release Year,Title,Director,Cast,Genre,Wiki Page,Plot
Origin/Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
American,17377,17377,17377,17060,17377,17377,17377
British,3670,3670,3670,3636,3670,3670,3670
Bollywood,2931,2931,2931,2849,2931,2931,2931
Tamil,2599,2599,2599,2559,2599,2599,2599
Telugu,1311,1311,1311,1278,1311,1311,1311
Japanese,1188,1188,1188,822,1188,1188,1188
Malayalam,1095,1095,1095,1065,1095,1095,1095
Hong Kong,791,791,791,648,791,791,791
Canadian,723,723,723,658,723,723,723
Australian,576,576,576,466,576,576,576


---------------------------------------------------

## Preprocessing

In [7]:
# Keep only the label (Genre) and the text (Plot).  `.copy()` detaches the
# working frame from `df`, so columns added later never touch the raw data.
df_proc = df[["Genre", "Plot"]].copy()
# Fixed seed so the three preview rows are the same on every run.
df_proc.sample(3, random_state=42)

Unnamed: 0,Genre,Plot
17998,action,"This is the story of Sohana, a dancer who is h..."
23753,romantic comedy,Fifty-two year old Leung Foon (Lawrence Cheng)...
28979,unknown,Vedhala Ulagam is a world of demons where mort...


In [8]:
df_proc["dummy"] = 1
len(df_proc['Genre'].unique())

2265

In [9]:
c = df_proc.groupby("Genre").count()
c.sort_values(by="dummy", axis=0, ascending=False).head(30)

Unnamed: 0_level_0,Plot,dummy
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
unknown,6083,6083
drama,5964,5964
comedy,4379,4379
horror,1167,1167
action,1098,1098
thriller,966,966
romance,923,923
western,865,865
crime,568,568
adventure,526,526


In [13]:
# Exact-match lookup table: raw genre label -> canonical genre label.
# Built once at cell execution instead of on every row processed.
# NOTE(review): keys containing backslashes (e.g. r'\|007') are compared
# literally by the dict lookup, so they only match genre strings that
# themselves contain the backslash -- confirm whether the unescaped
# spellings were intended.
_GENRE_FIX_MAP = {
    # biography
    'bio-pic': 'biography',
    'biopic': 'biography',
    'biographical': 'biography',
    'biodrama': 'biography',
    'bio-drama': 'biography',
    'biographic': 'biography',
    'biography about montreal canadiens star|maurice richard': 'biography',
    'biography fim': 'biography',
    'biography of pioneering american photographer eadweard muybridge': 'biography',

    # animation
    'animated': 'animation',
    'anime': 'animation',
    '3-d': 'animation',
    '3d': 'animation',
    'animationchildren': 'animation',
    'computer animation': 'animation',
    ' in animation': 'animation',

    # social
    "children's": 'social',
    'family': 'social',

    # comedy
    'comedey': 'comedy',
    'comedy 2-reeler': 'comedy',

    # history
    'historical': 'history',
    r'\|docudrama': 'history',
    'history dram': 'history',

    # romance
    'romantic': 'romance',
    'romantic comedy': 'romance',
    'romantic drama': 'romance',

    # science fiction
    'sci-fi': 'science_fiction',
    'sci fi': 'science_fiction',
    'science fiction': 'science_fiction',

    # thriller
    'ttriller': 'thriller',
    'psycho thriller': 'thriller',

    # adventure
    'adventures': 'adventure',

    # action
    'kung fu': 'action',
    'kung-fu': 'action',
    'martial arts': 'action',
    'bruceploitation': 'action',
    'gun fu': 'action',
    'martial art': 'action',
    'superhero': 'action',

    # war
    'world war ii': 'war',
    'world war i': 'war',
    'war-time': 'war',
    'wartime': 'war',
    'ww1': 'war',
    'wwii': 'war',
    'afghan war drama': 'war',
    'war drama': 'war',

    # drama
    'drama about child soldiers': 'drama',
    'drama loosely': 'drama',
    'drama film': 'drama',

    # spy
    'spy film': 'spy',
    r'\|007': 'spy',

    # avant-garde
    'avant-garde': 'avant_garde',

    # comedy-drama
    'comedy-drama adaptation of the mordecai richler novel': 'comedy-drama',
    'comedy, drama': 'comedy-drama',
    'comedy drama': 'comedy-drama',

    # musical
    'musical b': 'musical',
    'music': 'musical',
    'musical comedy': 'musical',

    # psycho
    'psychological': 'psycho',

    # romance (continued)
    'rom-coms': 'romance',

    # crime
    'true crime': 'crime',
    'crime drama': 'crime',

    # horror
    'j-horror': 'horror',

    # sports
    r' \(aquatics|swimming\)': 'sports',
    r' \(shogi|chess\)': 'sports',
    ' (road bicycle racing)': 'sports',
    'american football': 'sports',
    r'dev\|nusrat jahan': 'sports',
    r' \(road bicycle racing\)': 'sports',

    # series
    'tv miniseries': 'series',
    'television miniseries': 'series',
    'serial': 'series',

    # adult
    'ero': 'adult',
    'erotica': 'adult',
    'erotic': 'adult',
}


def genre_cleaning(row):
    """Return the canonical genre for one record.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"Genre"`` (a DataFrame row or a plain dict).

    Returns
    -------
    The cleaned genre label.  Resolution order:

    1. exact lookup of the raw label in ``_GENRE_FIX_MAP`` (unmapped labels
       are kept as they are);
    2. substring overrides, each later one winning over the earlier ones:
       ``"anim"`` -> ``"animation"``, ``"thriller"`` -> ``"thriller"``,
       ``"fantas"`` -> ``"science_fiction"`` when the label also contains
       ``"science"``, otherwise ``"fantasy"``.

    A non-string genre (e.g. NaN or None for a missing value) is returned
    unchanged instead of raising ``TypeError`` on the substring tests.
    """
    current = row["Genre"]
    if not isinstance(current, str):
        # Missing value: nothing to clean, and `in` would fail on a float.
        return current

    fixed = _GENRE_FIX_MAP.get(current, current)
    if "anim" in current:
        fixed = "animation"
    if "thriller" in current:
        fixed = "thriller"
    if "fantas" in current:
        fixed = "science_fiction" if "science" in current else "fantasy"
    return fixed

In [14]:
df_proc["genre_fixed"] = df_proc.apply(genre_cleaning, axis=1)

In [15]:
df_proc.head()

Unnamed: 0,Genre,Plot,dummy,genre_fixed
0,unknown,"A bartender is working at a saloon, serving dr...",1,unknown
1,unknown,"The moon, painted with a smiling face hangs ov...",1,unknown
2,unknown,"The film, just over a minute long, is composed...",1,unknown
3,unknown,Lasting just 61 seconds and consisting of two ...,1,unknown
4,unknown,The earliest known adaptation of the classic f...,1,unknown


In [16]:
c = df_proc.groupby("genre_fixed").count()
sort_genre = c.sort_values(by="dummy", axis=0, ascending=False)
# Hide the 'unknown' placeholder by label rather than by assuming it sits in
# the first row; this shows the same 14 genres as the positional slice
# whenever 'unknown' is the most frequent label.
sort_genre.drop(index="unknown", errors="ignore").head(14)

Unnamed: 0_level_0,Genre,Plot,dummy
genre_fixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
drama,5970,5970,5970
comedy,4381,4381,4381
thriller,1757,1757,1757
romance,1494,1494,1494
action,1202,1202,1202
horror,1169,1169,1169
crime,1035,1035,1035
animation,872,872,872
western,865,865,865
science_fiction,667,667,667


In [17]:
len(df_proc['genre_fixed'].unique())

1643

In [18]:
# The most frequent real genres, in descending order of frequency.
# 'unknown' is filtered out by label *before* slicing, so the cell no longer
# raises ValueError when 'unknown' is not among the first 15 rows, and the
# result never exceeds N_TOP_GENRES labels.
N_TOP_GENRES = 14
remaining_genre = [genre for genre in sort_genre.index if genre != 'unknown'][:N_TOP_GENRES]
remaining_genre

['drama',
 'comedy',
 'thriller',
 'romance',
 'action',
 'horror',
 'crime',
 'animation',
 'western',
 'science_fiction',
 'musical',
 'adventure',
 'fantasy',
 'comedy-drama']

## Process the plots of the remaining genres

In [20]:
df_proc_2 = pd.DataFrame(df_proc, copy=True, columns=['genre_fixed', 'Plot'])

In [21]:
# Keep only rows whose cleaned genre is one of the retained genres.
# `.isin` is the vectorised form of the per-row membership lambda; `.copy()`
# makes the filtered frame independent so that adding columns to it later
# cannot trigger SettingWithCopyWarning.
df_proc_2 = df_proc_2[df_proc_2['genre_fixed'].isin(remaining_genre)].copy()

In [22]:
#df_proc_2.to_csv('reduced_dataset', index_label='id')

In [23]:
df_proc_2.shape

(21561, 2)

In [24]:
df_proc_2.head()

Unnamed: 0,genre_fixed,Plot
6,western,The film opens with two bandits breaking into ...
7,comedy,The film is about a family who move to the sub...
14,comedy,Before heading out to a baseball game at a nea...
15,comedy,The plot is that of a black woman going to the...
16,drama,On a beautiful summer day a father and mother ...


In [25]:
# Reload the reduced dataset from disk (index column 'id').
# The only cell that writes this file is commented out, so on a fresh
# checkout it may not exist yet: in that case export the in-memory frame
# first, then read it back so both paths produce the same index name.
try:
    df_proc_2 = pd.read_csv('reduced_dataset', index_col='id')
except FileNotFoundError:
    df_proc_2.to_csv('reduced_dataset', index_label='id')
    df_proc_2 = pd.read_csv('reduced_dataset', index_col='id')

In [26]:
df_proc_2.head()

Unnamed: 0_level_0,genre_fixed,Plot
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,western,The film opens with two bandits breaking into ...
7,comedy,The film is about a family who move to the sub...
14,comedy,Before heading out to a baseball game at a nea...
15,comedy,The plot is that of a black woman going to the...
16,drama,On a beautiful summer day a father and mother ...


## Text processing

In [27]:
import nltk 
import sklearn
import re
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
# Ordered (pattern, replacement) rewrites applied to the lower-cased plot.
# Order matters: a specific contraction must come before the generic suffix
# rule that would otherwise consume part of it ("'scuse" before "'s",
# "can't"/"won't" before "n't").
_NORMALIZE_RULES = [
    (re.compile(r"what's"), "what is "),
    (re.compile(r"'scuse"), " excuse "),
    (re.compile(r"'s"), " "),
    (re.compile(r"'ve"), " have "),
    (re.compile(r"can't"), "can not "),
    (re.compile(r"won't"), "will not "),
    (re.compile(r"n't"), " not "),
    (re.compile(r"i'm"), "i am "),
    (re.compile(r"'re"), " are "),
    (re.compile(r"'ll"), " will "),
    # Wikipedia-style numeric citation markers such as "[12]".
    (re.compile(r"\[[0-9]+\]"), ""),
]


def normalize(plot):
    """Lower-case a plot summary and expand common English contractions.

    Parameters
    ----------
    plot : str
        Raw plot text.

    Returns
    -------
    str
        The text in lower case with contractions expanded, possessive
        ``'s`` replaced by a space, bracketed numeric references removed,
        and leading/trailing spaces stripped.  Replacements may leave
        double spaces inside the text; they are not collapsed.

    Notes
    -----
    Two fixes over the previous rule order: ``'scuse`` is now expanded to
    ``excuse`` (it used to be destroyed by the earlier ``'s`` rule), and
    ``won't`` becomes ``will not`` rather than ``wo not``.
    """
    plot = plot.lower()
    for pattern, replacement in _NORMALIZE_RULES:
        plot = pattern.sub(replacement, plot)
    return plot.strip(' ')

In [29]:
df_proc_2['plot_processed'] = df_proc_2['Plot'].apply(normalize)

In [30]:
df_proc_2.shape

(21561, 3)

In [31]:
vectorizer = CountVectorizer(stop_words='english')

In [32]:
corpus = df_proc_2.plot_processed.values
print(corpus[0])

the film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive  tender at the station  water tank. they then knock the operator out and tie him up. as the train stops it is boarded by the bandits‍—‌now four. two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. the bandits then force the passengers off the train and rifle them for their belongings. one passenger tries to escape but is instantly shot down. carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.
meanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. his daughter arrives bringing him his meal and cuts him free, and restores him to consciousness

## Subsample 300 plots from each genre

In [33]:
df_proc_2.head()

Unnamed: 0_level_0,genre_fixed,Plot,plot_processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,western,The film opens with two bandits breaking into ...,the film opens with two bandits breaking into ...
7,comedy,The film is about a family who move to the sub...,the film is about a family who move to the sub...
14,comedy,Before heading out to a baseball game at a nea...,before heading out to a baseball game at a nea...
15,comedy,The plot is that of a black woman going to the...,the plot is that of a black woman going to the...
16,drama,On a beautiful summer day a father and mother ...,on a beautiful summer day a father and mother ...


In [34]:
# Non-mutating, re-runnable replacement for `drop(..., inplace=True)`:
# `errors='ignore'` keeps a second execution of this cell from raising
# KeyError once 'Plot' has already been removed.
df_proc_2 = df_proc_2.drop(columns=['Plot'], errors='ignore')

In [35]:
# One 300-row random sample per genre listed in `remaining_genre`, drawn with
# a fixed seed (random_state=42) so reruns select the same rows.
# NOTE(review): `sample(300)` is called without replacement, so it presumably
# fails for any genre with fewer than 300 rows -- confirm every retained
# genre is large enough.
dfs = [df_proc_2[df_proc_2['genre_fixed'] == genre].sample(300, random_state=42) for genre in remaining_genre]

In [36]:
df_sample_300 = pd.concat(dfs)

In [37]:
df_sample_300.head(5)

Unnamed: 0_level_0,genre_fixed,plot_processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1
18126,drama,"in pre-world war i london, handsome young avia..."
2892,drama,a woman (carole landis) and a u.s. captain (he...
7034,drama,duke and freddie are two friends who steal car...
32405,drama,during a walk to the park raghuraman (prakash ...
22419,drama,werewolf centres on blaise (andrew gillis) and...


In [38]:
corpus = df_sample_300.plot_processed.values
print(corpus[0])

in pre-world war i london, handsome young aviator alex st. george (stuart) meets and falls in love with shopgirl kitty greenwood (brody). he asks her to marry him, to the horror of his snobbish, class-bound mother (dorothy cumming), who is appalled by the notion of her son marrying into a family who run a tobacconists shop. before the wedding can take place, war breaks out and alex is called up to serve as a pilot.
seeing her opportunity to sabotage the relationship, mrs. st. george sets about trying to poison alex  mind against kitty by feeding him via letter a string of malicious and false tales about kitty  behaviour, alleging that in his absence she is frequently to be seen around town flirting and behaving in an improper manner with other young men. alex becomes so unnerved and distraught about his mother  stories that his concentration is affected and he crashes his plane, suffering not only critical injuries which leave him in danger of paralysis, but also amnesia.
alex is r

In [39]:
vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
import pickle

In [41]:
# Persist the fitted vectorizer.  The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [42]:
sparse_bow = vectorizer.transform(corpus)

In [43]:
# Densify the sparse bag-of-words matrix so it can be wrapped in a DataFrame.
# NOTE(review): with the shape displayed for `bow` (4200 x 50005) and the
# vectorizer's int64 dtype, the dense array takes roughly 1.7 GB of RAM --
# consider keeping the matrix sparse if memory is tight.
bow = sparse_bow.toarray()

In [44]:
bow.shape

(4200, 50005)

In [46]:
# One column per vocabulary term, one row per sampled film.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_bow = pd.DataFrame(bow, index=df_sample_300.index, columns=feature_names)

In [47]:
df_bow['genre_of_the_film'] = df_sample_300['genre_fixed']

In [48]:
%%time
#df_bow.to_csv("genre_300_sample_plot.csv", index_label='id_of_the_film')

Wall time: 5min 20s
