## Import Necessary Modules

In [1]:
import boto3
import re
import string
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().magic('matplotlib inline')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

pd.set_option("display.max_rows", 50, "display.max_columns", 50, "display.max_colwidth", 100)

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from wordcloud import WordCloud, STOPWORDS, tokenization


## Read in Data

In [3]:
# #Print out bucket names
# for bucket in s3.buckets.all():
#     print(bucket.name)

bucketname = 'sagemaker-studio-533437842674-6222pxsry4f'
filename = 'script_EDA_file.csv'

s3 = boto3.resource('s3')
s3.Bucket(bucketname).download_file(filename, 'script_EDA_file.csv')

In [4]:
df = pd.read_csv("script_EDA_file.csv")

In [5]:
#Drop first column
df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [6]:
#View data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1141 entries, 0 to 1140
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1141 non-null   object
 1   text       1141 non-null   object
dtypes: object(2)
memory usage: 18.0+ KB


## Data Cleaning

In [7]:
#Remove unneccesary characters from text
df['text'] = df['text'].apply(lambda x: re.sub("[^\w\d' ]", ' ', x))

In [8]:
#Split scripts by words
df['text'] = df['text'].str.split()

In [9]:
#Create Movie title column
df["movie_title"] = df["file_name"].apply(lambda x: x.split(".")[0].replace("_", " "))

In [10]:
#Pull last couple of words from script to extract genre
df['Genres'] = df['text'].apply(lambda x: x[-40:])

In [11]:
# Separate Genre into a separate column

def pull_genre(column):
    newlst = []
    [newlst.append(column[i + 1]) for i, word in enumerate(column) if word == "Genres"]
    return newlst

df['genre'] = df['Genres'].apply(lambda x: pull_genre(x))

df.drop(columns = ['Genres'], axis = 1, inplace = True)

df['genre'] = df['genre'].apply(lambda x: x[0])

In [12]:
#Split Genre into Lists
def split_genres(genres):
    return re.sub( r"([A-Z])", r" \1", genres).split()

df['genre'] = df['genre'].apply(lambda x: split_genres(str(x)))

In [13]:
df.head()

Unnamed: 0,file_name,text,movie_title,genre
0,Beasts_of_No_Nation.txt,"[BEASTS, OF, NO, NATION, Written, by, Cary, Joji, Fukunaga, Based, on, the, novel, by, Uzodinma,...",Beasts of No Nation,"[Drama, War]"
1,Beauty_and_the_Beast.txt,"[BEAUTY, AND, THE, BEAST, Written, by, Stephen, Chbosky, Evan, Spiliotopoulos, Based, on, the, 1...",Beauty and the Beast,"[Family, Fantasy, Musical]"
2,Beavis_and_Butt-head_Do_America.txt,"[Beavis, and, Butt, Head, Do, America, by, Mike, Judge, and, Joe, Stillman, b, b, if, window, to...",Beavis and Butt-head Do America,"[Animation, Comedy]"
3,Beginners.txt,"[BEGINNERS, Written, by, Mike, Mills, INT, HAL, FIELDS, HOME, A, single, daisy, stands, in, a, v...",Beginners,"[Comedy, Drama, Romance]"
4,Being_Human.txt,"[BEING, HUMAN, Written, by, Bill, Forsyth, THIRD, DRAFT, January, 1992, WARNER, BROS, INC, 1992,...",Being Human,"[Comedy, Drama, Fantasy]"


In [None]:
# def freqdist(ex):
#     fdist = nltk.FreqDist(ex)
#     for word in fdist:
#         print(f'{word}: {fdist[word]}')

In [None]:
# df['freqdist'] = df['text'].apply(freqdist)

In [15]:
#binarize genre data
from sklearn.preprocessing import MultiLabelBinarizer

# Binarise labels
mlb = MultiLabelBinarizer()
expandedGenreData = mlb.fit_transform(df["genre"])
labelClasses = mlb.classes_


# Create a pandas.DataFrame from our output
expandedGenres = pd.DataFrame(expandedGenreData, columns=labelClasses)

In [16]:
#concat dataframes back together
df_concat = pd.concat([df, expandedGenres], axis=1)

In [18]:
def remove_caps(text):
    regex = re.compile(r"(\b[A-Z]+\b)")
    caps = list(filter(regex.match, text))
    no_caps = [x for x in text if x not in caps]
    return no_caps

df_concat['text'] = df_concat['text'].apply(lambda x: remove_caps(x))

In [46]:
df_concat.rename(columns={'Film': 'Film-Noir', 'Sci': 'Sci-Fi'}, inplace=True)

In [60]:
df_concat.head()

Unnamed: 0,file_name,text,movie_title,genre,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Beasts_of_No_Nation.txt,,Beasts of No Nation,"[Drama, War]",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Beauty_and_the_Beast.txt,,Beauty and the Beast,"[Family, Fantasy, Musical]",0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,Beavis_and_Butt-head_Do_America.txt,,Beavis and Butt-head Do America,"[Animation, Comedy]",0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Beginners.txt,,Beginners,"[Comedy, Drama, Romance]",0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,Being_Human.txt,,Being Human,"[Comedy, Drama, Fantasy]",0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
#Create new csv file with changes
df_concat.to_csv('no_caps_script.csv', index = False)

## Data Cleaning Pt. 2 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
get_ipython().magic('matplotlib inline')

from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df2 = pd.read_csv('no_caps_script.csv')

In [3]:
#Lowercase strings
df2['text'] = df2['text'].str.lower()

In [4]:
df2.head()

Unnamed: 0,file_name,text,movie_title,genre,Action,Adventure,Animation,Biography,Comedy,Crime,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Beasts_of_No_Nation.txt,"['written', 'by', 'cary', 'joji', 'fukunaga', ...",Beasts of No Nation,"['Drama', 'War']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Beauty_and_the_Beast.txt,"['written', 'by', 'stephen', 'chbosky', 'evan'...",Beauty and the Beast,"['Family', 'Fantasy', 'Musical']",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Beavis_and_Butt-head_Do_America.txt,"['beavis', 'and', 'butt', 'head', 'do', 'ameri...",Beavis and Butt-head Do America,"['Animation', 'Comedy']",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Beginners.txt,"['written', 'by', 'mike', 'mills', 'single', '...",Beginners,"['Comedy', 'Drama', 'Romance']",0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,Being_Human.txt,"['written', 'by', 'bill', 'forsyth', 'january'...",Being Human,"['Comedy', 'Drama', 'Fantasy']",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#Delist text column for preprocessing purposes
df2['text'] = df2['text'].apply(eval).apply(' '.join)

In [6]:
df2.head()

Unnamed: 0,file_name,text,movie_title,genre,Action,Adventure,Animation,Biography,Comedy,Crime,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Beasts_of_No_Nation.txt,written by cary joji fukunaga based on the nov...,Beasts of No Nation,"['Drama', 'War']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Beauty_and_the_Beast.txt,written by stephen chbosky evan spiliotopoulos...,Beauty and the Beast,"['Family', 'Fantasy', 'Musical']",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Beavis_and_Butt-head_Do_America.txt,beavis and butt head do america by mike judge ...,Beavis and Butt-head Do America,"['Animation', 'Comedy']",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Beginners.txt,written by mike mills single daisy stands in a...,Beginners,"['Comedy', 'Drama', 'Romance']",0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,Being_Human.txt,written by bill forsyth january 1992 1992 4000...,Being Human,"['Comedy', 'Drama', 'Fantasy']",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Removing Stop Words

In [7]:
## setting stopwords and punctuations
import string, re

stop_words_list = stopwords.words('english')
stop_words_list += list(string.punctuation)

stop_words_set = set(stop_words_list)

In [8]:
# we can define a function that removes stopwords 
def remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words_set]
    return stopwords_removed 

In [9]:
# applying the above function to our text column
df2['text'] = df2['text'].apply(lambda x: remove_stopwords(x))

In [10]:
df2.head()

Unnamed: 0,file_name,text,movie_title,genre,Action,Adventure,Animation,Biography,Comedy,Crime,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Beasts_of_No_Nation.txt,"[written, cary, joji, fukunaga, based, novel, ...",Beasts of No Nation,"['Drama', 'War']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Beauty_and_the_Beast.txt,"[written, stephen, chbosky, evan, spiliotopoul...",Beauty and the Beast,"['Family', 'Fantasy', 'Musical']",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Beavis_and_Butt-head_Do_America.txt,"[beavis, butt, head, america, mike, judge, joe...",Beavis and Butt-head Do America,"['Animation', 'Comedy']",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Beginners.txt,"[written, mike, mills, single, daisy, stands, ...",Beginners,"['Comedy', 'Drama', 'Romance']",0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,Being_Human.txt,"[written, bill, forsyth, january, 1992, 1992, ...",Being Human,"['Comedy', 'Drama', 'Fantasy']",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
df2.to_csv("no_stop_words.csv", index = False)

### Stemming

In [24]:
df3 = pd.read_csv("no_stop_words.csv")

In [25]:
df3.head()

Unnamed: 0,file_name,text,movie_title,genre,Action,Adventure,Animation,Biography,Comedy,Crime,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Beasts_of_No_Nation.txt,"['written', 'cary', 'joji', 'fukunaga', 'based...",Beasts of No Nation,"['Drama', 'War']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Beauty_and_the_Beast.txt,"['written', 'stephen', 'chbosky', 'evan', 'spi...",Beauty and the Beast,"['Family', 'Fantasy', 'Musical']",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Beavis_and_Butt-head_Do_America.txt,"['beavis', 'butt', 'head', 'america', 'mike', ...",Beavis and Butt-head Do America,"['Animation', 'Comedy']",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Beginners.txt,"['written', 'mike', 'mills', 'single', 'daisy'...",Beginners,"['Comedy', 'Drama', 'Romance']",0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,Being_Human.txt,"['written', 'bill', 'forsyth', 'january', '199...",Being Human,"['Comedy', 'Drama', 'Fantasy']",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# portstem = PorterStemmer()

In [None]:
# stemmed_review=[]
# for w in filtered_review:
#     stemmed_review.append(portstem.stem(w))

# print(stemmed_review)

In [None]:
# # creating a list with all lemmatized outputs
# stemmed_output = []

# for listy in processed_data:
#     stemmed = ' '.join([portstem.stem(w) for w in listy])
#     stemmed_output.append(lemmed)

### Lemming

In [13]:
lemmatizer = WordNetLemmatizer() 

In [15]:
# # we can also lemmatize our original reviews
# lemmatized_review=[]
# for w in filtered_review:
#     lemmatized_review.append(lemmatizer.lemmatize(w))

# print(lemmatized_review)

In [18]:
# creating a list with all lemmatized outputs
# lemmatized_output = []

def lemmy(data):
    return [lemmatizer.lemmatize(w) for w in data]  ##Notice the use of text.
#     for listy in data:
#         lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
#         lemmatized_output.append(lemmed)
#     return lemmatized_output


In [19]:
df3['text'] = df3['text'].apply(lemmy)

In [20]:
df3.head()

Unnamed: 0.1,Unnamed: 0,file_name,text,movie_title,genre,Action,Adventure,Animation,Biography,Comedy,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,0,Beasts_of_No_Nation.txt,"[[, ', w, r, i, t, t, e, n, ', ,, , ', c, a, ...",Beasts of No Nation,"['Drama', 'War']",0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,Beauty_and_the_Beast.txt,"[[, ', w, r, i, t, t, e, n, ', ,, , ', s, t, ...",Beauty and the Beast,"['Family', 'Fantasy', 'Musical']",0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2,Beavis_and_Butt-head_Do_America.txt,"[[, ', b, e, a, v, i, s, ', ,, , ', b, u, t, ...",Beavis and Butt-head Do America,"['Animation', 'Comedy']",0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,Beginners.txt,"[[, ', w, r, i, t, t, e, n, ', ,, , ', m, i, ...",Beginners,"['Comedy', 'Drama', 'Romance']",0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,4,Being_Human.txt,"[[, ', w, r, i, t, t, e, n, ', ,, , ', b, i, ...",Being Human,"['Comedy', 'Drama', 'Fantasy']",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Visualizing Data

In [None]:
#Barplot for amount of scripts in each genre
categories = list(df3.columns[4:].values)
sns.set(font_scale = 2, style = 'darkgrid')
plt.figure(figsize=(15,8))
ax= sns.barplot(categories, df3.iloc[:,4:].sum().values)
plt.title("Scripts in each genre", fontsize=24)
plt.ylabel('Number of scripts', fontsize=16)
plt.xlabel('Genre', fontsize=16)

ax.set_xticklabels(ax.get_xticklabels(), 
                          rotation=90, 
                          horizontalalignment='center')

for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha = 'center', va = 'center', 
                size = 15,
                xytext = (0, 9),
                textcoords = 'offset points')

plt.tight_layout()
plt.show()

In [None]:
#Barplot for scripts that have multiple genres
rowSums = df3.iloc[:,2:].sum(axis=1)
multiLabel_counts = rowSums.value_counts()
multiLabel_counts = multiLabel_counts.iloc[1:]
sns.set(font_scale = 2)
plt.figure(figsize=(15,8))
ax = sns.barplot(multiLabel_counts.index, multiLabel_counts.values)
plt.title("Scripts with multiple genres")
plt.ylabel('Number of scripts', fontsize=16)
plt.xlabel('Number of genres', fontsize=16)
#adding the text labels
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, 
            label, ha='center', va='bottom', size=15)
plt.show()

In [None]:
# df_concat.head()

In [None]:
# from wordcloud import WordCloud,STOPWORDS

# plt.figure(figsize=(40,25))

# # drama
# subset = df_concat[df_concat.Drama==1]
# text = [", ".join(line) for line in subset.text.values]
# cloud_drama = WordCloud(stopwords=STOPWORDS,
#                         background_color='white',
#                         collocations=False,
#                         max_words=50,
#                         width=2500,
#                         height=1800).generate(str(text))

# plt.subplot(2, 3, 1)
# plt.axis('off')
# plt.title("Drama",fontsize=40)
# plt.imshow(cloud_drama)


# # severe_comedy
# subset = df_concat[df_concat.Comedy==1]
# text = [", ".join(line) for line in subset.text.values]
# cloud_comedy = WordCloud(stopwords=STOPWORDS,
#                          background_color='white',
#                          collocations=False,
#                          max_words=50,
#                          width=2500,
#                          height=1800).generate(str(text))

# plt.subplot(2, 3, 2)
# plt.axis('off')
# plt.title("Comedy",fontsize=40)
# plt.imshow(cloud_comedy)



# # Thriller
# subset = df_concat[df_concat.Thriller==1]
# text = [", ".join(line) for line in subset.text.values]
# cloud_thriller = WordCloud(stopwords=STOPWORDS,
#                            background_color='white',
#                            collocations=False,
#                            max_words=50,
#                            width=2500,
#                            height=1800).generate(str(text))

# plt.subplot(2, 3, 3)
# plt.axis('off')
# plt.title("Thriller",fontsize=40)
# plt.imshow(cloud_thriller)


# # action
# subset = df_concat[df_concat.Action==1]
# text = [", ".join(line) for line in subset.text.values]
# cloud_action = WordCloud(stopwords=STOPWORDS,
#                          background_color='white',
#                          collocations=False,
#                          max_words=50,
#                          width=2500,
#                          height=1800).generate(str(text))

# plt.subplot(2, 3, 4)
# plt.axis('off')
# plt.title("Action",fontsize=40)
# plt.imshow(cloud_action)


# # crime
# subset = df_concat[df_concat.Crime==1]
# text = [", ".join(line) for line in subset.text.values]
# cloud_crime = WordCloud(stopwords=STOPWORDS,
#                         background_color='white',
#                         collocations=False,
#                         max_words=50,
#                         width=2500,
#                         height=1800).generate(str(text))

# plt.subplot(2, 3, 5)
# plt.axis('off')
# plt.title("Crime",fontsize=40)
# plt.imshow(cloud_crime)


# # romance
# subset = df_concat[df_concat.Romance==1]
# text = [", ".join(line) for line in subset.text.values]
# cloud_romance = WordCloud(stopwords=STOPWORDS,
#                           background_color='white',
#                           collocations=False,
#                           max_words=50,
#                           width=2500,
#                           height=1800).generate(str(text))

# plt.subplot(2, 3, 6)
# plt.axis('off')
# plt.title("Romance",fontsize=40)
# plt.imshow(cloud_romance)

# plt.show()

## Baseline Model