In [1]:
# Import Libraries
import pandas as pd

from datasets import Dataset, DatasetDict, load_dataset

import string
from bs4 import BeautifulSoup
import unicodedata
from nltk.corpus import stopwords

# EDA

In [2]:
# Load dataset
ds = load_dataset('imdb')

In [3]:
# Review dataset features
print(ds)
print(ds['train'].features)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [4]:
# Set up dataframes for data
trainDF = pd.DataFrame(ds['train'])
testDF = pd.DataFrame(ds['test'])

In [5]:
# assign the splits
train = Dataset.from_pandas(trainDF)
test = Dataset.from_pandas(testDF)
# reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)
# view the resulting dataset dict object
new_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [6]:
trainDF.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [7]:
# Reading ond of the negative and positive reviews
print("Negative review: \n" + trainDF[trainDF['label']==0].iloc[3,0])
print("Positive review: \n" + trainDF[trainDF['label']==1].iloc[3,0])

Negative review: 
This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.<br /><br />The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.<br /><br />A movie of its time, and place. 2/10.
Positive review: 
*Contains spoilers due to me having to describe some film techniques, so read at your own risk!*<br /><br />I loved this film. The use of tinting in some of the scenes makes it seem like an old photograph come to life. I also enjoyed the projection of peopl

Remove or replace: 
- capitalization
- remmove punctuation
- é and other none standard alphabet characters
- html tags like `<br />`

In [8]:
# Checking label data shows even split of positive and negative reviews
print(trainDF['label'].value_counts())
print(testDF['label'].value_counts())

label
0    12500
1    12500
Name: count, dtype: int64
label
0    12500
1    12500
Name: count, dtype: int64


# Preprocessing data

In [9]:
# Set up function to remove punctuation, capitalization, and stop words
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove latin characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
      
    return text


In [10]:
# Apply the function to the text columns
for df in [trainDF, testDF]:
    df['cleanText'] = df['text'].apply(clean_text)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [11]:
# Checking one row that text is cleaned
trainDF.iloc[3,0]

"This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.<br /><br />The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.<br /><br />A movie of its time, and place. 2/10."

# BoW Tokenizing Text

In [12]:
def tokenize(text):
    tokens = text.split(" ")
    return tokens

In [13]:
# Apply the function to the text columns
for df in [trainDF, testDF]:
    df['bowTokens'] = df['cleanText'].apply(tokenize)

### How many documents are in your dataset? How many total words make up the vocabulary in your dataset?

In [14]:
# number of documents
print(trainDF.count().iloc[0])
print(testDF.count().iloc[0]) 

25000
25000


In [15]:
# Count number of words in trainDF
allTrainWords = [word for sublist in trainDF['bowTokens'] for word in sublist]
allTestWords = [word for sublist in testDF['bowTokens'] for word in sublist]

# Combine the lists
allWords = allTrainWords + allTestWords

# Convert the list to a set to get unique words
uniqueWordsTrain = set(allTrainWords)
uniqueWordsTest = set(allTestWords)
uniqueWordsAll = set(allWords)

In [16]:
print(len(uniqueWordsTrain), "unique train words")
print(len(uniqueWordsTest), "unique test words")
print(len(uniqueWordsAll), "unique words")

141720 unique train words
138729 unique test words
221286 unique words


In [17]:
# Exporting cleaned data to CSV
trainDF[['cleanText','bowTokens','label']].to_csv('..//data//preprocessingTrainDF.csv',index=False)
testDF[['cleanText','bowTokens','label']].to_csv('..//data//preprocessingTestDF.csv',index=False)