## Set up environment.

In [1]:
# Import libraries and modules.
import numpy  as np
import pandas as pd

import glob
import random
import re

# Increase number of columns that can be viewed in notebook.
pd.set_option('display.max_columns', 500)

# Set random seeds for reproducibility.  The review files are sampled with
# np.random.choice, which draws from NumPy's global generator and ignores the
# state of Python's `random` module, so both generators are seeded.
random.seed(42)
np.random.seed(42)

In [2]:
# Number of reviews to draw from EACH class (pos and neg); the total sample
# size is therefore twice this value.
n = 500

# Output path for the tokenized data; the file name encodes the total sample size.
tokens_csv = f'./data/tokens_s{n*2}.csv'

# Report the configuration: two messages separated by a blank line.
print(f'** {n*2} reviews will be sampled. **',
      '',
      f'** The tokenized data will be saved in "{tokens_csv}". **',
      sep='\n')

** 1000 reviews will be sampled. **

** The tokenized data will be saved in "./data/tokens_s1000.csv". **


## Load data.

In [3]:
# Create a dictionary in which the keys are part of the file names
# and the values are the classification labels (targets).
review_dict = {'pos' : 1,
               'neg' : 0}

# Initialize a list to store all the samples as [label, text] pairs.
review_text = []

# Collect the samples for each review type.
for r in review_dict:

    # Generate a complete list of file names.  glob returns paths in an
    # arbitrary, OS-dependent order, so sort them: the sample then depends
    # only on NumPy's random state and not on filesystem ordering.
    review_files = sorted(glob.glob('./data/train/' + r + '/' '*.txt'))

    # Randomly select n files without replacement.
    selected_files = np.random.choice(review_files, n, replace=False)

    # Load each file.
    for sample_file in selected_files:

        # Decode explicitly as UTF-8 (the same encoding used when the tokens
        # are saved) instead of relying on the platform default.
        # NOTE(review): assumes the review files are UTF-8 encoded - confirm.
        with open(sample_file, 'r', encoding='utf-8') as f:
            # Replace line breaks with a space so that words on adjacent
            # lines are not fused into a single token.
            data = f.read().replace('\n', ' ')

        # Label the text and save it.
        review_text.append([review_dict[r], data])
    

In [5]:
# Build a two-column dataframe from the collected [label, text] pairs.
cols = ['label', 'text']

df = pd.DataFrame(data=review_text, columns=cols)

# Preview the first rows.  'pos' is the first key in review_dict, so the
# positive reviews (label 1) were appended first.
df.head()

Unnamed: 0,label,text
0,1,"Talk about a blast opening, ""Trampa Infernal"" ..."
1,1,After all these years I still consider this se...
2,1,I saw this film last night on cable and it is ...
3,1,"Based on a true story, this series is a gem wi..."
4,1,"It was life-changing, IT REALLY WAS!!!The Man ..."


In [6]:
# Show the last rows.  'neg' is the second key in review_dict, so the
# negative reviews (label 0) were appended after the positive ones.
df.tail()

Unnamed: 0,label,text
995,0,"Okay, I remember watching the first one, and b..."
996,0,"Flame in, flame out. That seems to be Gammera ..."
997,0,This movie is a re-write of the 1978 Warren Be...
998,0,"This movie has some fatal flaws in it, how som..."
999,0,This movie is a perfect example of an excellen...


In [7]:
# Print the summary info: number of rows, column dtypes, and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
label    1000 non-null int64
text     1000 non-null object
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


 ## Data Cleaning 
 
 **Process:** 
 
- Remove duplicates.
- Handle null values.

### Remove duplicates.

In [8]:
# Remove rows that repeat an earlier (label, text) pair, keeping the first
# occurrence, then renumber the rows so the index is contiguous again.
df = df.drop_duplicates(subset=cols, keep='first').reset_index(drop=True)

# Confirm the resulting dimensions.
df.shape

(1000, 2)

### Handle null values.

In [9]:
# Discard rows whose review text is null, then renumber the rows so the
# index is contiguous again.
df = df.dropna(axis=0, subset=['text']).reset_index(drop=True)

## Tokenization 

**Process:** 

- Tokenize the text:
    - Remove the HTML line-break tags.
    - Convert common contractions to words.
    - Remove non-letter characters.
    - Convert to lower case.
    - Remove extra spaces.

In [10]:
def text_sub(dict, text):
    """Apply a set of literal find-and-replace substitutions in one pass.

    Parameters
    ----------
    dict : mapping of str to str
        Keys are the literal substrings to look for (they are escaped, so
        regex metacharacters in a key have no special meaning); values are
        the replacement strings.  The parameter name shadows the builtin but
        is kept so existing callers are unaffected.
    text : str
        The text to process.

    Returns
    -------
    str
        `text` with every occurrence of a key replaced by its value.
    """
    # With no substitutions the pattern would be "()", which matches the
    # empty string at every position and then fails the lookup with a
    # KeyError.  There is nothing to replace, so return the text unchanged.
    if not dict:
        return text

    # Regex alternation picks the first alternative that matches, not the
    # longest one.  Try longer keys first so that a key which is a prefix of
    # another (e.g. "a" vs "ab") cannot pre-empt the longer match.
    keys = sorted(dict, key=len, reverse=True)

    # Bundle the substitutions from the dictionary into a single regex.
    regex = re.compile("(%s)" % "|".join(map(re.escape, keys)))

    # Execute the substitutions, looking up each matched key's replacement.
    return regex.sub(lambda mo: dict[mo.group(0)], text)

In [11]:
def tokenize_text(row, dict):
    """Standardize a review's text and store it on the row as 'tokens'.

    The text in row['text'] is lower-cased, passed through text_sub with the
    given substitution mapping, stripped of every character that is not an
    ASCII letter, and split on whitespace.  The resulting words are joined
    with single spaces and written to row['tokens'].  No stop-word removal
    or lemmatization is performed here.

    Parameters
    ----------
    row : mapping with a 'text' entry (e.g. a dataframe row)
        Modified in place: a 'tokens' entry is added.
    dict : mapping of str to str
        Substitutions forwarded to text_sub.  Matching happens on lower-cased
        text.

    Returns
    -------
    The same `row` object, with 'tokens' set.
    """
    # Lower-case before substituting, then apply the regex substitutions.
    substituted = text_sub(dict, row['text'].lower())

    # Blank out non-letters, lower-case again (replacement values may contain
    # capitals), and split on whitespace, which also drops extra spaces.
    words = re.sub("[^a-zA-Z]", " ", substituted).lower().split()

    # Store the tokens as one space-separated string in a new column.
    row['tokens'] = " ".join(words)

    return row

In [15]:
# Retain just the label and the tokenized text.
# NOTE(review): no visible cell creates the 'tokens' column (e.g. by applying
# tokenize_text to each row) - confirm that step runs before this cell.
df = df.loc[:, ['label', 'tokens']]

# Inspect the first 25 rows of the result.
df.head(25)

Unnamed: 0,label,tokens
0,1,talk about a blast opening trampa infernal has...
1,1,after all these years i still consider this se...
2,1,i saw this film last night on cable and it is ...
3,1,based on a true story this series is a gem wit...
4,1,it was life changing it really was the man in ...
5,1,the first two sequences of this movie set up t...
6,1,cat soup at first seems to be a very random an...
7,1,a fey story of a martian attempt to colonize e...
8,1,vincent price is follow up to house of wax the...
9,1,i saw fever pitch sort of by accident it was p...


## Save file.

In [13]:
# Write the dataframe to the CSV path held in tokens_csv as UTF-8, leaving
# out the row index.
df.to_csv(tokens_csv, index=False, encoding='utf-8')