Preprocessing steps: 

- Convert to lowercase (Gabe)
- Remove punctuation (Gabe)
- Standardize whitespace (Gabe)
- Remove/convert numbers (Gabe)
- Remove stop words (Gabe)
- Tokenize (Gabe)

In [1]:
import pandas as pd
import spacy 
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

In [2]:
# Load the raw movie-lines CSV. The path is a raw string so the Windows
# backslashes are taken literally instead of being parsed as escape sequences.
df = pd.read_csv(r"C:\MSAAI\AAI-520\Generative_ChatBot_Final\clean_dataset\movie_lines.csv")

In [3]:
# Overview of the raw frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304713 non-null  object
 1   character_id    304713 non-null  object
 2   movie_id        304713 non-null  object
 3   character_name  304670 non-null  object
 4   line_text       304446 non-null  object
dtypes: object(5)
memory usage: 11.6+ MB


In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,line_id,character_id,movie_id,character_name,line_text
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


Since we have NaN values, let's decide what to do with them. For simplicity, we will remove them for now.

In [5]:
# Count the missing values in each column of the raw data.
print(df.isna().sum())

line_id             0
character_id        0
movie_id            0
character_name     43
line_text         267
dtype: int64


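Drop the rows that contain NaN values, then check that none remain. Note that `dropna()` removes a row when any column is missing, so rows lacking `character_name` are dropped along with rows lacking `line_text`.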

In [6]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how="any")

In [7]:
# Confirm that no missing values are left after the drop.
print(df.isna().sum())

line_id           0
character_id      0
movie_id          0
character_name    0
line_text         0
dtype: int64


In [8]:
# Re-inspect the frame after dropping the rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304403 entries, 0 to 304712
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304403 non-null  object
 1   character_id    304403 non-null  object
 2   movie_id        304403 non-null  object
 3   character_name  304403 non-null  object
 4   line_text       304403 non-null  object
dtypes: object(5)
memory usage: 13.9+ MB


In [9]:
def text_cleaner(text):
    """Normalize a single string in preparation for tokenization.

    The function works on one string at a time; apply it element-wise to a
    column of strings (for example with ``Series.apply``).

    Steps, in order:
      1. delete every digit character,
      2. lowercase the text,
      3. delete every character that is neither a word character nor
         whitespace (underscores count as word characters and are kept),
      4. collapse each run of whitespace into a single space and drop
         leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. It is empty when the input held nothing but
        digits, punctuation and whitespace.
    """
    # Remove digits.
    text = re.sub(r'\d', '', text)

    # Make all text lowercase.
    text = text.lower()

    # Remove punctuation and symbols. The pattern is a raw string so that
    # "\w" and "\s" reach the regex engine instead of being treated as
    # (invalid) string escape sequences.
    text = re.sub(r'[^\w\s]', '', text)

    # Standardize whitespace. str.split() without arguments splits on every
    # Unicode whitespace character (non-breaking space included) and ignores
    # leading/trailing whitespace, so no further replace/strip is needed.
    return ' '.join(text.split())
 

In [10]:
# Display the dialogue column as it looks before cleaning.
df['line_text']

0                                              They do not!
1                                               They do to!
2                                                I hope so.
3                                                 She okay?
4                                                 Let's go.
                                ...                        
304708    Lord Chelmsford seems to want me to stay back ...
304709    I'm to take the Sikali with the main column to...
304710                             Your orders, Mr Vereker?
304711    Good ones, yes, Mr Vereker. Gentlemen who can ...
304712    Colonel Durnford... William Vereker. I hear yo...
Name: line_text, Length: 304403, dtype: object

In [11]:
# Clean every dialogue line and store the result back in the same column.
df['line_text'] = df['line_text'].map(text_cleaner)

In [12]:
# Preview the first five rows after the text has been cleaned.
df.head(n=5)

Unnamed: 0,line_id,character_id,movie_id,character_name,line_text
0,L1045,u0,m0,BIANCA,they do not
1,L1044,u2,m0,CAMERON,they do to
2,L985,u0,m0,BIANCA,i hope so
3,L984,u2,m0,CAMERON,she okay
4,L925,u0,m0,BIANCA,lets go


In [13]:
# Frame summary after cleaning the text column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304403 entries, 0 to 304712
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304403 non-null  object
 1   character_id    304403 non-null  object
 2   movie_id        304403 non-null  object
 3   character_name  304403 non-null  object
 4   line_text       304403 non-null  object
dtypes: object(5)
memory usage: 13.9+ MB


In [14]:
# Missing values per column after cleaning the text.
print(df.isna().sum())

line_id           0
character_id      0
movie_id          0
character_name    0
line_text         0
dtype: int64


In [15]:
# Renumber the rows in place and discard the old index, which still has
# gaps where rows were dropped.
df.reset_index(inplace=True, drop=True)

In [16]:
# Write the cleaned dialogue lines to a CSV file, leaving the index out.
# NOTE(review): lines that text_cleaner reduced to an empty string are
# presumably read back as NaN by pd.read_csv - confirm when reloading.
df.to_csv(path_or_buf='cleaned_lines.csv', index=False)

In [18]:
# Reload the cleaned lines from disk. The path is a raw string so every
# backslash is literal; the original mixed doubled and single backslashes.
cleaned_lines_df_1 = pd.read_csv(r'C:\MSAAI\AAI-520\Generative_ChatBot_Final\cleaned_lines.csv')

In [19]:
# Missing values per column in the frame reloaded from the cleaned CSV.
print(cleaned_lines_df_1.isna().sum())

line_id            0
character_id       0
movie_id           0
character_name     0
line_text         55
dtype: int64


In [20]:
# Summary of the frame reloaded from the cleaned CSV.
cleaned_lines_df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304403 entries, 0 to 304402
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304403 non-null  object
 1   character_id    304403 non-null  object
 2   movie_id        304403 non-null  object
 3   character_name  304403 non-null  object
 4   line_text       304348 non-null  object
dtypes: object(5)
memory usage: 11.6+ MB


In [21]:
# Drop the rows of the reloaded frame that have a missing value in any column.
df1 = cleaned_lines_df_1.dropna(axis=0, how="any")

In [22]:
# Renumber the remaining rows in place, discarding the previous index.
df1.reset_index(inplace=True, drop=True)

In [23]:
# Summary of the reloaded frame after dropping rows with missing values.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304348 entries, 0 to 304347
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304348 non-null  object
 1   character_id    304348 non-null  object
 2   movie_id        304348 non-null  object
 3   character_name  304348 non-null  object
 4   line_text       304348 non-null  object
dtypes: object(5)
memory usage: 11.6+ MB


In [28]:
# Write the final cleaned frame to CSV, leaving the index out of the file.
df1.to_csv(path_or_buf='cleaned_movie_lines.csv', index=False)

In [30]:
# Reload the final cleaned CSV into a new frame.
df2 = pd.read_csv(filepath_or_buffer='C:\\MSAAI\\AAI-520\\Generative_ChatBot_Final\\cleaned_movie_lines.csv')

In [31]:
# Summary of the frame reloaded from the final cleaned CSV.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304348 entries, 0 to 304347
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304348 non-null  object
 1   character_id    304348 non-null  object
 2   movie_id        304348 non-null  object
 3   character_name  304348 non-null  object
 4   line_text       304348 non-null  object
dtypes: object(5)
memory usage: 11.6+ MB


-----------------------------------------------------------------------------------------------

Function Pool:

In [None]:
def text_preprocessor(text):
    """Normalize the whitespace of ``text`` before tokenization.

    Every run of whitespace (non-breaking spaces included) becomes a single
    space, and leading/trailing whitespace is removed.
    """
    words = text.split()
    return ' '.join(words)


def tokenize(text):
    """Pass ``text`` to the ``nlp`` callable and return its result.

    NOTE(review): ``nlp`` is not defined in the visible cells; presumably it
    is a spaCy pipeline loaded elsewhere (e.g. with ``spacy.load``) - confirm
    before running this cell.
    """
    return nlp(text)

# Normalize whitespace in each line, then tokenize it; the per-row result is
# stored in a new 'doc' column.
df['doc'] = df['line_text'].map(text_preprocessor).map(tokenize)

def entity_exctractor(doc):
    """Collect the named entities of ``doc``.

    Returns a list with one ``(text, label_)`` tuple per item in ``doc.ents``,
    in iteration order.
    """
    return [(span.text, span.label_) for span in doc.ents]

# Store each row's named entities in a new 'entities' column.
df['entities'] = df['doc'].map(entity_exctractor)


def pos_tagger(doc):
    """Pair every token of ``doc`` with its part-of-speech tag.

    Returns a list with one ``(text, pos_)`` tuple per token, in iteration
    order.
    """
    return [(tok.text, tok.pos_) for tok in doc]

# Store each row's (token, POS tag) pairs in a new 'pos_tags' column.
df['pos_tags'] = df['doc'].map(pos_tagger)