### Set up environment.

In [1]:
# Import libraries and modules.
import numpy  as np
import pandas as pd

import random
import time

# Import Natural Language Toolkit modules.
from nltk.stem     import WordNetLemmatizer

# Increase number of columns that can be viewed in notebook.
pd.set_option('display.max_columns', 500)

# Set random seed for reproducibility.
random.seed(42)

In [2]:
# Specify the number of reviews of each type (pos and neg) being sampled.
n = 50

# Specify whether the lemmas were created by the prod version of WordNet (True or False).
prod = True
if prod:
    env = 'prod'
else:
    env = 'dev'

# Set file location for input file.
tokens_csv = (f'./data/tokens_s{n*2}.csv')

# Set file location for output files.
lemmas_csv = (f'./data/lemmas_{env}_s{n*2}.csv')
time_csv = (f'./data/lemmas_{env}_s{n*2}_time.csv')

# Print messages.
print(f'** The tokenized  data will be loaded from "{tokens_csv}". **')
print()
print(f'** The lemmatized data will be saved in    "{lemmas_csv}". **')
print()
print(f'** The elapsed time data will be saved in  "{time_csv}". **')

** The tokenized  data will be loaded from "./data/tokens_s100.csv". **

** The lemmatized data will be saved in    "./data/lemmas_prod_s100.csv". **

** The elapsed time data will be saved in  "./data/lemmas_prod_s100_time.csv". **


### Load data.

In [3]:
# Load the data.
df = pd.read_csv(tokens_csv)

# Take a look.
df.head()

Unnamed: 0,label,tokens
0,1,i was very impressed with this small independe...
1,1,shot in the heart is wonderful it brilliantly ...
2,1,i have not seen this in over yrs but i still r...
3,1,police story brought hong kong movies to moder...
4,1,the word classic is thrown around too loosely ...


In [4]:
# Take a look at the data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
label     100 non-null int64
tokens    100 non-null object
dtypes: int64(1), object(1)
memory usage: 1.6+ KB


## Lemmatization 

**Process:** 

- Lemmatize the tokens:
    - Use the WordNet production version to obtain lemmas.

In [5]:
# Instantiate Lemmatizer.
lemmatizer = WordNetLemmatizer()

# Initialize list for elapsed time data.
time_data = []

# Loop through the dataframe.
for i in range(df.shape[0]):
    
    # Start the clock.
    t0 = time.time()
   
    # Lemmatize words.
    lemmas = [lemmatizer.lemmatize(w) for w in df.iloc[i]['tokens'].split()]
    
    # Stop the clock.
    t_end = time.time()
    
    # Save results.
    time_data.append([len(lemmas), (t_end - t0)]) 
    
    # Join the lemmas back into a single string and save it.
    df.loc[i,'lemmas'] = " ".join(lemmas) 

# Take a look.
df.head()

Unnamed: 0,label,tokens,lemmas
0,1,i was very impressed with this small independe...,i wa very impressed with this small independen...
1,1,shot in the heart is wonderful it brilliantly ...,shot in the heart is wonderful it brilliantly ...
2,1,i have not seen this in over yrs but i still r...,i have not seen this in over yr but i still re...
3,1,police story brought hong kong movies to moder...,police story brought hong kong movie to modern...
4,1,the word classic is thrown around too loosely ...,the word classic is thrown around too loosely ...


In [6]:
# Keep only needed columns.
df = df[['label', 'lemmas']]

# Verify update.
df.head()

Unnamed: 0,label,lemmas
0,1,i wa very impressed with this small independen...
1,1,shot in the heart is wonderful it brilliantly ...
2,1,i have not seen this in over yr but i still re...
3,1,police story brought hong kong movie to modern...
4,1,the word classic is thrown around too loosely ...


In [9]:
# Load the time data.
time_df = pd.DataFrame(time_data, columns=['lemma count', 'elapsed_time'])

# Take a look.
time_df.head()

Unnamed: 0,lemma count,elapsed_time
0,123,1.665904
1,50,0.000375
2,146,0.000672
3,142,0.000728
4,96,0.000505


### Save data.

In [8]:
# Save the NLP data.
df.to_csv(lemmas_csv, encoding='utf-8', index=False)

# Save the elapsed time data.
time_df.to_csv(time_csv, encoding='utf-8', index=False)