# RNN Notebook

This notebook covers the RNN part of the assignment. It will contain all data preprocessing steps as well as the model implementation.

This is a placeholder cell, code will follow later.

In [1]:
# Importing libraries to work with
import re
import os
import nltk
import spacy
import torch
import shutil
import random
import numpy as np
import scipy as sci
import polars as pl
import pandas as pd
import gensim as gns
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# For the sake of Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Word Embedding
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Sklearn
from sklearn.decomposition import PCA

In [2]:
# Record the version of every core library so the run can be reproduced later
version_list = dict([
    ("NumPy Version:", np.__version__),
    ("Polars Version:", pl.__version__),
    ("MatPlotLib Version:", mpl.__version__),
    ("Seaborn Version:", sns.__version__),
    ("PyTorch Version:", torch.__version__),
    ("NLTK Version:", nltk.__version__),
    ("SpaCy Version:", spacy.__version__),
    ("Gensim Version:", gns.__version__),
    ("SciPy Version:", sci.__version__),
])

# One "<label> <version>" line per library, in the order listed above
for label, version in version_list.items():
    print(label, version)

NumPy Version: 1.26.4
Polars Version: 1.16.0
MatPlotLib Version: 3.9.3
Seaborn Version: 0.13.2
PyTorch Version: 2.5.1+cpu
NLTK Version: 3.9.1
SpaCy Version: 3.8.2
Gensim Version: 4.3.3
SciPy Version: 1.13.1


In [3]:
# Defining path to install NLTK libraries in
NLTK_LIB_PATH = os.path.join("..", "venv_nlp", "Lib", "nltk_data")

# Defining download function
def download_libs():
    """Make sure the NLTK corpora used by this notebook are available.

    Each required corpus is looked up with ``nltk.data.find``. When the lookup
    fails, any leftover copy under ``NLTK_LIB_PATH`` (unpacked directory or
    zip archive) is removed and the package is downloaded into
    ``NLTK_LIB_PATH``.

    ``NLTK_LIB_PATH`` is appended to ``nltk.data.path`` before the lookup so
    that data downloaded into this directory is found by the lookup and by
    later corpus loads, even when the directory is not one of NLTK's default
    search locations.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Resource names use the POSIX-style form documented by NLTK.
    libraries = {
        "corpora/stopwords": "stopwords",
        "corpora/wordnet": "wordnet"
    }

    # Register the download directory as a search location (once).
    search_dir = os.path.abspath(NLTK_LIB_PATH)
    if search_dir not in nltk.data.path:
        nltk.data.path.append(search_dir)

    for resource, package in libraries.items():
        try:
            nltk.data.find(resource)
            print(f"{package.capitalize()} data exists.")
        except LookupError:
            print(f"Downloading {package}...")

            # Handle potential corrupted files: a leftover directory or zip
            # archive that the lookup above could not use.
            resource_path = os.path.join(NLTK_LIB_PATH, *resource.split('/'))
            for leftover in (resource_path, resource_path + ".zip"):
                if not os.path.exists(leftover):
                    continue
                print(f"Removing corrupted file: {leftover}")
                try:
                    if os.path.isdir(leftover):
                        shutil.rmtree(leftover)
                    else:
                        os.remove(leftover)
                except Exception as e:
                    # Best effort: still attempt the download below.
                    print(f"Error removing corrupted file {leftover}: {e}")
            # Attempt download again
            nltk.download(package, download_dir=NLTK_LIB_PATH)
        except Exception as e:
            print(f"Unexpected error checking {package}: {e}")

In [4]:
# Create the NLTK data directory (if needed) and fetch any missing corpora.
# Failures are reported rather than raised so the notebook keeps running.
try:
    os.makedirs(NLTK_LIB_PATH, exist_ok=True)
    print(f"Using NLTK data directory: {NLTK_LIB_PATH}")
    download_libs()
except Exception as err:
    if isinstance(err, PermissionError):
        # The directory could not be created or written to
        print(f"Permission denied: Unable to create or write to directory '{NLTK_LIB_PATH}'")
    else:
        print(f"An unexpected error occurred: {err}")

Using NLTK data directory: ..\venv_nlp\Lib\nltk_data
Stopwords data exists.
Wordnet data exists.


# Data Loading

In [5]:
# Load the raw IMDB reviews CSV into a Polars DataFrame.
# The path is relative to the notebook's working directory.
reviews = pl.read_csv("../data/IMDB Dataset.csv")

In [6]:
# Preview the first 10 rows of the raw data
reviews.head(10)

review,sentiment
str,str
"""One of the other reviewers has…","""positive"""
"""A wonderful little production.…","""positive"""
"""I thought this was a wonderful…","""positive"""
"""Basically there's a family whe…","""negative"""
"""Petter Mattei's ""Love in the T…","""positive"""
"""Probably my all-time favorite …","""positive"""
"""I sure would like to see a res…","""positive"""
"""This show was an amazing, fres…","""negative"""
"""Encouraged by the positive com…","""negative"""
"""If you like original gut wrenc…","""positive"""


In [7]:
# Summary statistics per column (row count, null count, min/max values)
reviews.describe()

statistic,review,sentiment
str,str,str
"""count""","""50000""","""50000"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""","""A Turkish Bath sequence in…","""negative"""
"""25%""",,
"""50%""",,
"""75%""",,
"""max""","""ý thýnk uzak ýs the one of the…","""positive"""


# Preprocessing

In [8]:
# Build the stopword set: NLTK's English list plus "im", which is what "i'm"
# turns into once punctuation has been stripped from the text
stopwords = {*nltk.corpus.stopwords.words('english'), "im"}

In [9]:
# Define a regex function to remove special characters, links, etc.
def regex_cleanse(text: str, stop_words=None):
    """Strip links, markup, handles, punctuation and stopwords from ``text``.

    Structured patterns (URLs, picture links, HTML tags, @mentions, #hashtags)
    are removed first, while the punctuation that identifies them is still
    present. Apostrophes are then dropped so contractions stay a single token
    (e.g. "i'm" -> "im"), and every remaining non-letter is replaced by a
    space so neighbouring words are not glued together.

    Args:
        text: The text to clean. Case is not changed here.
        stop_words: Optional collection of words to drop. When ``None``, the
            notebook-level ``stopwords`` set is used.

    Returns:
        The cleaned text as a single string of space-separated words.
    """
    if stop_words is None:
        stop_words = stopwords

    # URLS (http://, https:// and bare www. links)
    text = re.sub(r'(?:https?://|www\.)\S+', ' ', text)

    # Linked pictures URLs; matched as a full link so that ordinary words
    # containing "pic" (picture, topics, ...) are left alone
    text = re.sub(r'pic\.twitter\.com/\S+', ' ', text)

    # HTML tags such as <br />
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)

    # @<username> and #<word>
    text = re.sub(r'[@#]\w+', ' ', text)

    # Apostrophes are removed without a space to keep contractions whole
    text = re.sub(r"['\u2019]", '', text)

    # Any other non-alphabet character becomes a word boundary
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Collapse whitespace and drop stopwords
    return " ".join(word for word in text.split() if word not in stop_words)

In [10]:
# Loading language model
model = spacy.load('en_core_web_sm')

# Lemmatiser
def lemma(tokens):
    """Return the lemma of every token spaCy finds in ``tokens``.

    ``tokens`` is handed to the loaded spaCy pipeline as-is; in this notebook
    it is the cleaned review string produced by the earlier cleaning steps.

    Returns:
        A list with one lemma string per spaCy token, in document order.
    """
    lemmas = []
    for token in model(tokens):
        lemmas.append(token.lemma_)
    return lemmas

In [11]:
# Removing emojis
# The pattern is compiled once here instead of on every call. The ranges are
# limited to emoji/symbol blocks; a single wide range such as
# U+24C2-U+1F251 would also delete CJK, kana and Hangul text.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                            "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                            "\U0001F200-\U0001F251"  # enclosed ideographic supplement
                            "\U00002600-\U000026FF"  # miscellaneous symbols
                            "\U00002702-\U000027B0"  # dingbats
                            "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                            "\U000024C2"             # circled latin capital letter M
                            "\U0000FE0F"             # variation selector-16 (emoji presentation)
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only characters in the emoji/symbol ranges listed in ``_EMOJI_PATTERN``
    are deleted; letters of any script (Latin, CJK, Hangul, ...) are kept.

    Args:
        string: The text to filter.

    Returns:
        A copy of ``string`` without the matched characters. Nothing is
        inserted in their place.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [12]:
# Define a text preprocessing function to apply to all rows
def preprocess_text(text: str) -> list[str]:
    """Run the full cleaning pipeline on one review.

    The review is lower-cased, passed through ``regex_cleanse``, then through
    ``remove_emoji``, and finally lemmatised with ``lemma``.

    Returns:
        The list of lemmas produced by ``lemma`` for the cleaned text.
    """
    lowered = text.lower()
    without_noise = remove_emoji(regex_cleanse(lowered))
    return lemma(without_noise)

In [13]:
# Creating a cleaned-preprocessed dataset
# preprocess_text is applied to every value of the 'review' column; the result
# keeps the same column name, so 'review' now holds a list of strings per row
# while 'sentiment' is carried over unchanged. `reviews` itself is not modified.
cleaned = reviews.with_columns(pl.col('review').map_elements(preprocess_text, return_dtype = list[str]))

In [14]:
# Preview the first rows of the preprocessed data
cleaned.head()

review,sentiment
list[str],str
"[""one"", ""reviewer"", … ""side""]","""positive"""
"[""wonderful"", ""little"", … ""do""]","""positive"""
"[""think"", ""wonderful"", … ""friend""]","""positive"""
"[""basically"", ""there"", … ""ignore""]","""negative"""
"[""petter"", ""matteis"", … ""work""]","""positive"""


In [17]:
# Persist the preprocessed frame as newline-delimited JSON (one record per line).
# NOTE(review): the content is NDJSON although the file name ends in .json, so
# it must be read back with an NDJSON-aware loader rather than a plain JSON parser.
cleaned.write_ndjson(file = '../data/imdb_clean.json')