In [234]:
import pandas as pd
import random
import numpy as np
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
import unicodedata
import inflect
import re
from itertools import chain
#!pip install contractions
import contractions
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/mina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [247]:
#Loading and Cleaning Data

#load in the csv contatining the data
df_original = pd.read_csv('wiki_movie_plots_deduped.csv')

In [248]:
#drop all rows in the dataframe that do not contain horror(1167) in genre catergory
df_horror = df_original.drop(df_original[df_original['Genre'] != 'horror'].index,inplace=False)
#drop columns we wont need
df_horror.drop(['Release Year','Origin/Ethnicity','Cast', 'Wiki Page','Director','Release Year'],axis=1, 
               inplace = True)

#reduce horror to 1000 rows
df_horror = df_horror.sample(frac=1)
df_horror = df_horror.reset_index(drop=True)
df_horror_train = df_horror[0:1000]
df_horror_test = df_horror[1000:len(df_horror)]

In [249]:
#drop all rows in dataframe that do not contain comedy(4379)
df_comedy = df_original.drop(df_original[df_original['Genre'] != 'comedy'].index, inplace = False)
df_comedy.drop(['Release Year','Origin/Ethnicity','Cast', 'Wiki Page','Director','Release Year'],axis=1, 
               inplace = True)
#reduce horror to 1000 rows
df_comedy = df_comedy.sample(frac=1)
df_comedy = df_comedy.reset_index(drop=True)
df_comedy_train = df_comedy[0:1000]
df_comedy_test = df_comedy[1000:len(df_comedy)]

In [250]:
#merge the comedy and horror dataframes into one
df_HandC = pd.concat([df_horror_test,df_comedy_test])

#Pre punctuation parsing test
#df_HandC.iloc[1002]['Plot']

In [251]:
# Check word type of a word
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [256]:
#Function for text pre-processing
def normalizeText(row):
    
    #create a list of punctuations we wish to delete
    punctuations = string.punctuation
    #create a list of stopwords in English
    stopWords = set(stopwords.words('english'))
    
    #change all letters to lower case
    row = row.lower()  
    
    #remove numbers
    row = re.sub(" \d+", " ", row)
                 
    #remove punctutation
    for letter in row: 
    
        if letter in punctuations: 
            row = row.replace(letter, "")
    
    #expand the contraction I'm -> I am
    row = contractions.fix(row)
    # got it from (https://github.com/kootenpv/contractions)
                
    #remove accent char
    row = unicodedata.normalize('NFKD', row).encode('ascii', 'ignore').decode('utf-8', 'ignore')
                 
    #remove extra whitespace convert into a word  
    row = row.strip()
    
    # TOKENIZATION: process of splitting text into smaller piece called tokens.
    tokens = word_tokenize(row)
    
    # lemmatization step played -> play
    lemmatizer = WordNetLemmatizer()
    row = ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(row)])
    
    #remove stop words such as "a", "the", "is"
    tokens = word_tokenize(row)
    row = ' '.join([i for i in tokens if not i in stopWords])
     
    return row

In [257]:
#run the column through the normalizing function
df_HandC["Plot"] = df_HandC["Plot"].apply(normalizeText) 

In [258]:
# Make list from df for "Plot" column
# Param: the df to use
def dfToList(df):
    lst = []
    for i, rows in df.iterrows():
        l = [rows.Plot]
        lst.append(l)
    return list(chain.from_iterable(lst))

# tf-idf vectorisation
# Param: all plots from the df as a list (ie [plot1, plot2,...])
def tfidfVec(plots):
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(plots)
    feature_names = vectorizer.get_feature_names()
    dense = tfidf.todense()
    df_tfidf = pd.DataFrame(dense.tolist(), columns=feature_names)
    return df_tfidf

In [259]:
# test showing how it works on first 5 elements of df_HandC
tfidfVec(dfToList(df_HandC.head()))


Unnamed: 0,abandon,abomination,accidentally,accord,action,actually,ad,addiction,adhyayan,admits,...,wood,worry,wound,wrist,write,yash,year,york,young,youngadult
0,0.0,0.0,0.014571,0.0,0.021758,0.043516,0.0,0.0,0.021758,0.021758,...,0.0,0.0,0.017554,0.043516,0.070216,0.282851,0.012258,0.0,0.0,0.0
1,0.0,0.090579,0.0,0.022645,0.0,0.0,0.0,0.022645,0.0,0.0,...,0.091348,0.0,0.091348,0.0,0.01827,0.0,0.012758,0.0,0.022645,0.0
2,0.027365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.019109,0.0,0.0,0.0
3,0.024539,0.0,0.02037,0.0,0.0,0.0,0.030415,0.0,0.0,0.0,...,0.049078,0.0,0.0,0.0,0.0,0.0,0.017135,0.0,0.0,0.0
4,0.0,0.0,0.021862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.032643,0.0,0.0,0.0,0.0,0.0,0.065287,0.0,0.032643
