<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Read-the-script" data-toc-modified-id="Read-the-script-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read the script</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Remove-things-from-original-lines" data-toc-modified-id="Remove-things-from-original-lines-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Remove things from original lines</a></span></li><li><span><a href="#Word-Preprocessing" data-toc-modified-id="Word-Preprocessing-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Word Preprocessing</a></span></li><li><span><a href="#Swear-Words-Analysis" data-toc-modified-id="Swear-Words-Analysis-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Swear Words Analysis</a></span></li></ul></li></ul></div>

# Swear Words Detection in Scripts

This notebook applies some simple text mining techniques on movie scripts to extract swear words.

## Libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import os
import string

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords

from helpers import data_load

# Offline mode: load plotly's notebook integration so iplot() can be used to show figures
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
# Fetch the NLTK stopword corpus that is read later via stopwords.words('english');
# as the last expression of the cell, the call's return value is shown as the cell output
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\strat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Process the PDF script

The script below goes through the following steps:

- Reads in script PDF for each movie
- Removes unnecessary text
- Extracts all words and removes punctuation and stopwords
- Identifies swear words
- Combines all movies together

**Please note:** The script for each movie was found with a simple Google search for `<Movie Name> movie script PDF` and then saved in *data/pdf_scripts*.

In [3]:
# define pdf and output folders
pdf_folder = "data/pdf_scripts"
txt_folder = "data/txt_scripts"

# one words dataframe per movie; they are combined when the chart is built
df_list = []

# os.listdir returns entries in arbitrary order, so sort them to keep the
# processing log and the order of the movies in the chart reproducible
for fn in sorted(os.listdir(pdf_folder)):

    # skip anything that is not a pdf
    if not fn.endswith(".pdf"):
        continue

    # convert the pdf into a text script
    # NOTE(review): only the bare filename is passed, so data_load presumably
    # resolves the pdf/txt folders on its own - confirm against helpers/data_load.py
    data_load.read_script(fn)

    # the text script keeps the pdf's name, with a .txt extension
    txt_fn = fn.replace(".pdf", ".txt")
    txt_script = data_load.pre_process_script(txt_path=txt_fn)

    # extract the words dataframe, labelled with the movie name
    # (the filename without its extension)
    movie_name = txt_fn.split(".txt")[0]
    df_words = data_load.get_words_df(txt_script, movie_name=movie_name)

    df_list.append(df_words)

# fail here with a clear message instead of letting pd.concat raise
# "No objects to concatenate" further down when no script was found
if not df_list:
    raise FileNotFoundError(f"No .pdf scripts found in '{pdf_folder}'")


Loaded Good Will Hunting.pdf
Total pages: 122
Succesfully output script in: data/txt_scripts\Good Will Hunting.txt
Total Swear Words : 142
Loaded Goodfellas.pdf
Total pages: 107
Succesfully output script in: data/txt_scripts\Goodfellas.txt
Total Swear Words : 114
Loaded Pulp Fiction.pdf
Total pages: 126
Succesfully output script in: data/txt_scripts\Pulp Fiction.txt
Total Swear Words : 290
Loaded The Wolf of Wall Street.pdf
Total pages: 138
Succesfully output script in: data/txt_scripts\The Wolf of Wall Street.txt
Total Swear Words : 260


## Create the Visual

This section plots the cumulative swear count for each movie as the movie script progresses.

In [7]:
# combine the per-movie dataframes into a single frame with a fresh index
df_comb = pd.concat(df_list).reset_index(drop=True)

# one line per movie: cumulative swear count against progress through the script
fig = px.line(
    df_comb,
    x="movie_progress",
    y="cum_swear",
    color="movie_name",
    title="Cumulative Swear Word Occurrence",
    template="ggplot2",
    height=600,
    width=1200,
)

# anchor the legend in the top-left corner of the plot and drop its title
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01,
    title=None
))

# movie_progress runs from 0 to 1, so show the x ticks as percentages
fig.update_xaxes(title="Movie Progress",
                 tickformat="%")
fig.update_yaxes(title="Swear Word Count")

# create the imgs folder; exist_ok makes this a no-op when it is already there,
# without a separate existence check
os.makedirs("imgs", exist_ok=True)

# show the figure (the static export below is currently disabled)
iplot(fig)
# fig.write_image("imgs/fig.png")

# Appendix

This section has some old code and a few experimental cells.

## Read the script

In [102]:

# text script to inspect, looked up inside txt_folder
txt_fn = "Pulp_Fiction.txt"

# full path of the text script
txt_path = os.path.join(txt_folder, txt_fn)

# load the script as a list of lines (line endings are kept)
with open(txt_path, "r") as script_file:
    data = list(script_file)

## Preprocessing

### Remove things from original lines

In [104]:
import re

def _clean_script_line(raw_line):
    """Return the lowercased text of one script line, or None when it is dropped.

    A line is dropped when it mentions "page", is entirely upper case
    (names of actors and scenes), contains "www", consists of digits only,
    or is empty once asterisks have been removed.
    """
    # strip the trailing newline, if there is one
    text = raw_line[:-1] if raw_line.endswith("\n") else raw_line

    # these two checks look at the line before asterisks are removed
    if "page" in text.lower() or text.isupper():
        return None

    # remove asterisks
    text = text.replace("*", "")

    # nothing left, a web address, or a purely numeric line
    if text == "" or "www" in text or text.isdigit():
        return None

    return text.lower()


cleanned_data = [kept for kept in map(_clean_script_line, data) if kept is not None]

print(f"There are total of {len(cleanned_data)} lines left")

PULP FICTION
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
WAITRESS
YOUNG WOMAN
YOUNG MAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG MAN
WAITRESS
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG MAN
YOUNG WOMAN
YOUNG WOMAN
YOUNG MAN
PUMPKIN
HONEY BUNNY
"PULP FICTION"
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES
VINCENT
JULES


### Word Preprocessing

In [53]:
# glue the cleaned lines together and cut them into space-delimited tokens
joinned_text = " ".join(cleanned_data)
all_words = joinned_text.split(" ")

# translation table that deletes every character in string.punctuation
table = str.maketrans("", "", string.punctuation)

# english stopwords from NLTK
stop_words = set(stopwords.words('english'))

# strip punctuation from each token, then keep the tokens that are
# neither blank nor a stopword
stripped_words = (token.translate(table) for token in all_words)
clean_words = [
    token
    for token in stripped_words
    if token.strip() != "" and token not in stop_words
]

# one row per remaining word
df_words = pd.DataFrame(clean_words, columns=["words"])

### Swear Words Analysis

In [54]:
# substrings that mark a word as a swear word, checked in this order
swear_words = ['fuc', 'shit', 'crap', 'asshole', 'cunt', 'dick', 'pussy', 'bitch', 'cock']
# film length used for the per-minute rate below
film_length = 180

# label each word with the first swear substring it contains ("NA" when none);
# any word containing "cocktail" is never counted as a swear word
swear_col = [
    "NA" if "cocktail" in word
    else next((sw for sw in swear_words if sw in word), "NA")
    for word in clean_words
]

# 1 for swear words, 0 otherwise
is_swear = [int(label != "NA") for label in swear_col]

# add the swear columns, plus the position of each word in the script
# spread evenly between 0 and 1
df_words['is_swear'] = is_swear
df_words['swear_col'] = swear_col
df_words['movie_progress'] = np.linspace(0, 1, num=df_words.shape[0])

# print total and per-minute rate
total_swears = df_words['is_swear'].sum()
print(f"Total Swear Words : {total_swears}")
print(f"Swear Words per Min : {round(total_swears / film_length, 1)}")

Total Swear Words : 253
Swear Words per Min : 1.4


In [55]:
# number of words per matched swear substring ("NA" = not a swear word)
df_words['swear_col'].value_counts()

NA         13619
fuc          180
shit          40
cock          12
bitch          5
dick           4
cunt           4
pussy          3
asshole        3
crap           2
Name: swear_col, dtype: int64

In [105]:


# per-word swear flag (0/1) along the script, drawn semi-transparent;
# update_traces returns the figure, so the call can be chained
fig = px.line(
    df_words,
    x="movie_progress",
    y="is_swear",
).update_traces(opacity=0.3)

fig

In [106]:
# running total of swear words as the script progresses
df_words['cum_swear'] = df_words['is_swear'].cumsum()

# cumulative swear count against position in the script
px.line(
    df_words,
    x="movie_progress",
    y="cum_swear",
    title="Cumulative Swear Word Occurrence",
    template="ggplot2",
)

## Word Clouds

In [58]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# the word cloud is generated from one space-separated string of the cleaned words
text = " ".join(clean_words)

# Keep the word cloud stopwords under their own name: rebinding `stopwords`
# would replace the nltk.corpus.stopwords import that the word preprocessing
# cell calls, so that cell would fail when re-run after this one
wordcloud_stopwords = set(STOPWORDS)

# figure size for the rendered image
figsize(16, 7)

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50,
                      width=500,
                      max_words=100,
                      stopwords=wordcloud_stopwords,
                      background_color="black",
                      ).generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

ModuleNotFoundError: No module named 'wordcloud'