# Key components in NLP
- **Text Preprocessing** - Cleaning and preparing the text data for analysis
    - Tokenization - Splitting the text data into words or sentences
    - Removing Stopwords -
    - Stemming/Lemmatization - Reducing a word into their base form
- **Text representation** -
    - Bag of Word (BoW)
    - TF-IDF- Weighs the terms by their frequency and importance across documents
    - Word Embedding -
    - Sentimental Analysis - Positive, negative or Neutral
    - Name Entity recognition
    - machine Translaation
    - Text Classification -
    - Language Generation
    - Speech recognition and synthesis

  

# Text Preprocessing

### Step 1: Load the libraries


In [1]:
! pip install nltk



In [2]:
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
# Download the neccesary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### Step 2: Load the dataset

In [24]:
# Download the dataset from NLTK
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [25]:
# Load the dataset into a dataframe
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

## convert the datfram
df = pd.DataFrame(documents, columns = ['review', 'sentiment'])

In [26]:
# Preview the dataset
df.head(20)

Unnamed: 0,review,sentiment
0,"[plot, :, two, teen, couples, go, to, a, churc...",neg
1,"[the, happy, bastard, ', s, quick, movie, revi...",neg
2,"[it, is, movies, like, these, that, make, a, j...",neg
3,"["", quest, for, camelot, "", is, warner, bros, ...",neg
4,"[synopsis, :, a, mentally, unstable, man, unde...",neg
5,"[capsule, :, in, 2176, on, the, planet, mars, ...",neg
6,"[so, ask, yourself, what, "", 8mm, "", (, "", eig...",neg
7,"[that, ', s, exactly, how, long, the, movie, f...",neg
8,"[call, it, a, road, trip, for, the, walking, w...",neg
9,"[plot, :, a, young, french, boy, sees, his, pa...",neg


### Text Preprocessing
1. Lowecasing

In [27]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [11]:
# Convert test to lowercase
df['review'] = df['review'].apply(lambda x: [word.lower() for word in x])

2. Tokenization
- Tokenize the text into individual words or senteces

In [28]:
# Tokenization (ALready done in the loading process)
df['review'] = df['review'].apply(lambda x: word_tokenize(' '.join(x)))


In [13]:
df['review']

Unnamed: 0,review
0,"[plot, :, two, teen, couples, go, to, a, churc..."
1,"[the, happy, bastard, ', s, quick, movie, revi..."
2,"[it, is, movies, like, these, that, make, a, j..."
3,"[``, quest, for, camelot, ``, is, warner, bros..."
4,"[synopsis, :, a, mentally, unstable, man, unde..."
...,...
1995,"[wow, !, what, a, movie, ., it, ', s, everythi..."
1996,"[richard, gere, can, be, a, commanding, actor,..."
1997,"[glory, --, starring, matthew, broderick, ,, d..."
1998,"[steven, spielberg, ', s, second, epic, film, ..."


In [14]:
# Sentence tokenization
df['review'] = df['review'].apply(lambda x: sent_tokenize(' '.join(x)))

In [15]:
df['review']

Unnamed: 0,review
0,[plot : two teen couples go to a church party ...
1,[the happy bastard ' s quick movie review damn...
2,[it is movies like these that make a jaded mov...
3,"[`` quest for camelot `` is warner bros . ', f..."
4,[synopsis : a mentally unstable man undergoing...
...,...
1995,"[wow !, what a movie ., it ' s everything a mo..."
1996,"[richard gere can be a commanding actor , but ..."
1997,"[glory -- starring matthew broderick , denzel ..."
1998,[steven spielberg ' s second epic film on worl...


### Remove Stopwords

In [29]:
# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove the stopwords
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stop_words])

In [17]:
df['review']

Unnamed: 0,review
0,[plot : two teen couples go to a church party ...
1,[the happy bastard ' s quick movie review damn...
2,[it is movies like these that make a jaded mov...
3,"[`` quest for camelot `` is warner bros . ', f..."
4,[synopsis : a mentally unstable man undergoing...
...,...
1995,"[wow !, what a movie ., it ' s everything a mo..."
1996,"[richard gere can be a commanding actor , but ..."
1997,"[glory -- starring matthew broderick , denzel ..."
1998,[steven spielberg ' s second epic film on worl...


## Remove the punctuation and Non-Alphanumeric Characaters

In [30]:
df['review'] = df['review'].apply(lambda x: [re.sub(r'\W+', '', word) for word in x if word.isalpha()])

In [31]:
df['review']

Unnamed: 0,review
0,"[plot, two, teen, couples, go, church, party, ..."
1,"[happy, bastard, quick, movie, review, damn, b..."
2,"[movies, like, make, jaded, movie, viewer, tha..."
3,"[quest, camelot, warner, bros, first, feature,..."
4,"[synopsis, mentally, unstable, man, undergoing..."
...,...
1995,"[wow, movie, everything, movie, funny, dramati..."
1996,"[richard, gere, commanding, actor, always, gre..."
1997,"[glory, starring, matthew, broderick, denzel, ..."
1998,"[steven, spielberg, second, epic, film, world,..."


## Stemming
- Reducing words to their root form


In [33]:
# Iniitialise stemmer
stemmer = PorterStemmer()

# APply stemmer
df['review'] = df['review'].apply(lambda x: [stemmer.stem(word) for word in x])

In [32]:
df['review']

Unnamed: 0,review
0,"[plot, two, teen, couples, go, church, party, ..."
1,"[happy, bastard, quick, movie, review, damn, b..."
2,"[movies, like, make, jaded, movie, viewer, tha..."
3,"[quest, camelot, warner, bros, first, feature,..."
4,"[synopsis, mentally, unstable, man, undergoing..."
...,...
1995,"[wow, movie, everything, movie, funny, dramati..."
1996,"[richard, gere, commanding, actor, always, gre..."
1997,"[glory, starring, matthew, broderick, denzel, ..."
1998,"[steven, spielberg, second, epic, film, world,..."


### Lemmatization


In [34]:
# Innitialize the lemmatization
lemmatizer = WordNetLemmatizer()

# Apply the lemmatizer
df['review'] = df['review'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

## Join back the tokens into sentences

In [35]:
# Join tokens back into a string
df['cleaned_review'] = df['review'].apply(lambda x: ' '.join(x))

# preview the dataset
df['cleaned_review'].head()

Unnamed: 0,cleaned_review
0,plot two teen coupl go church parti drink driv...
1,happi bastard quick movi review damn bug got h...
2,movi like make jade movi viewer thank invent t...
3,quest camelot warner bro first featur length f...
4,synopsi mental unstabl man undergo psychothera...


# Save the data


In [36]:
df.to_csv('cleaned_movie_reviews.csv', index=False)