In [25]:
import os
import shutil
import kagglehub
import textstat
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np

# Fetch the NLTK resources needed for sentence/word tokenization and
# English stop-word filtering (no-op when they are already up to date).
for resource_name in ('punkt_tab', 'stopwords'):
    nltk.download(resource_name)

# Shared objects used by the text-processing functions further down.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
textstat.set_lang('en_US')

[nltk_data] Downloading package punkt_tab to /Users/win/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/win/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Keep the dataset in a "resource" folder next to the notebook so the
# download only happens on the first run.
current_dir = os.getcwd()
destination_dir = os.path.join(current_dir, "resource")

if not os.path.exists(destination_dir):
    source = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
    # destination_dir does not exist at this point, so shutil.move relocates
    # the downloaded folder *as* destination_dir. This avoids assuming the
    # downloaded version folder is literally named "1".
    shutil.move(source, destination_dir)

print("Path to dataset files:", destination_dir)

Path to dataset files: /Users/win/dev/university/nlp/workshop/workshop-2/resource


In [27]:
def normalize_fleash_ease(score):
    """Map a Flesch Reading Ease score to its descriptive difficulty band.

    Bands: below 30 "Confusing", 30-49 "Difficult", 50-59 "Fairly Difficult",
    60-69 "Standard", 70-79 "Fairly Easy", 80-89 "Easy", 90 and above
    "Very Easy".

    Args:
        score: Flesch Reading Ease value (int or float).

    Returns:
        The band label as a string, or None when ``score`` is NaN.
    """
    # NaN is the only value that is not equal to itself; it has no band.
    if score != score:
        return None

    # Exclusive upper bounds: a fractional score such as 69.5 must stay in
    # the 60-69 band instead of spilling into the next one.
    bands = (
        (30, "Confusing"),
        (50, "Difficult"),
        (60, "Fairly Difficult"),
        (70, "Standard"),
        (80, "Fairly Easy"),
        (90, "Easy"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label

    # The formula is not capped at 100 (very short, simple sentences can score
    # higher), so everything from 90 upwards is "Very Easy".
    return "Very Easy"

In [28]:
# Load the IMDB reviews CSV into a DataFrame. The path is relative to the
# current working directory, i.e. the "resource" folder prepared by the
# dataset-download cell.
df = pd.read_csv("resource/IMDB Dataset.csv")

In [29]:
# Score every raw (uncleaned) review with textstat's Flesch Reading Ease.
# NOTE(review): the 'flesch-ease' column is assigned again by the
# cleaned-data cell later in the notebook, so these raw scores do not survive.
df['flesch-ease'] = df['review'].apply(textstat.flesch_reading_ease)

# Average score across all reviews.
score = df['flesch-ease'].mean()

# Show the numeric mean and its descriptive difficulty band.
print(score)
print(normalize_fleash_ease(score))

64.13603669316
Standard


In [None]:
def clean_text(text):
    """Normalize a raw review into lowercase letters, spaces and dots.

    Steps, in order:
      1. ``<br />`` tags become ``'. '`` and commas become ``'.'``.
      2. Every character that is not an ASCII letter, whitespace or a dot
         is removed. Dots are kept so the text can still be split into
         sentences afterwards.
      3. Runs of whitespace are collapsed to a single space.
      4. The result is lowercased.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string.
    """
    # Turn HTML line breaks and commas into dots.
    with_boundaries = text.replace('<br />', '. ').replace(',', '.')

    # Strip everything except letters, whitespace and dots.
    letters_only = re.sub(r'[^a-zA-Z\s.]', '', with_boundaries)

    # Squeeze any whitespace run down to one space.
    single_spaced = re.sub(r'\s+', ' ', letters_only)

    return single_spaced.lower()

In [31]:
def remove_stopwords(text):
    """Drop every whitespace-separated word of ``text`` found in ``stop_words``.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The remaining words joined by single spaces.
    """
    kept_words = []
    for token in text.split():
        # Membership is an exact match against the module-level stop-word set.
        if token in stop_words:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

In [32]:
def process_sentence(sentence):
    """Tokenize a sentence, drop stop words and stem what is left.

    Args:
        sentence: A single sentence as a string.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    kept_stems = []
    for token in nltk.word_tokenize(sentence):
        # Stop words are filtered on the raw token, before stemming.
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return " ".join(kept_stems)

In [33]:
def tokeniza_text(text):
    """Split ``text`` into sentences and return each one processed and dot-free.

    Each sentence goes through ``process_sentence``. Dots are then removed,
    whitespace runs are collapsed, and the ends are trimmed. Sentences that
    end up empty are left out.

    Args:
        text: Text to split into sentences.

    Returns:
        A list of non-empty processed sentence strings.
    """
    result = []
    for raw_sentence in nltk.sent_tokenize(text):
        stemmed = process_sentence(raw_sentence)

        # Remove dots, then normalize whitespace and trim both ends.
        tidy = re.sub(r'\s+', ' ', stemmed.replace('.', '')).strip()

        # Skip sentences that have nothing left after cleaning.
        if not tidy:
            continue
        result.append(tidy)

    return result

In [34]:
def get_flesh_from_list(list):
    """Return the mean Flesch Reading Ease score over a collection of strings.

    Args:
        list: Iterable of strings to score individually. The name shadows
            the built-in ``list``; it is kept so existing keyword callers
            keep working.

    Returns:
        The mean of the per-string scores, or NaN when there is nothing
        to score.
    """
    scores = [textstat.flesch_reading_ease(s) for s in list]

    # np.mean([]) also yields NaN but emits a RuntimeWarning; return NaN
    # explicitly so inputs with no sentences stay quiet.
    if not scores:
        return np.float64("nan")

    return np.mean(scores)

In [35]:
# Clean each review, then split it into a list of processed sentences.
df['cleaned-data'] = df['review'].apply(clean_text).apply(tokeniza_text)
# Mean Flesch score over each review's processed sentences.
# NOTE(review): this reuses the 'flesch-ease' column name, replacing the
# scores computed on the raw reviews in the earlier cell.
df['flesch-ease'] = df['cleaned-data'].apply(get_flesh_from_list)

# Average across all reviews, shown with its difficulty band and the first
# review's processed sentences as a sample.
score = df['flesch-ease'].mean()
print(score)
print(normalize_fleash_ease(score))
print(df['cleaned-data'].iloc[0])

67.36963541967057
Standard
['one review mention watch oz episod youll hook', 'right', 'exactli happen', 'first thing struck oz brutal unflinch scene violenc', 'set right word go', 'trust', 'show faint heart timid', 'show pull punch regard drug', 'sex violenc', 'hardcor', 'classic use word', 'call oz nicknam given oswald maximum secur state penitentari', 'focus mainli emerald citi', 'experiment section prison cell glass front face inward', 'privaci high agenda', 'em citi home mani aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish scuffl', 'death stare', 'dodgi deal shadi agreement never far away', 'would say main appeal show due fact goe show wouldnt dare', 'forget pretti pictur paint mainstream audienc', 'forget charm', 'forget romanc oz doesnt mess around', 'first episod ever saw struck nasti surreal', 'couldnt say readi', 'watch', 'develop tast oz', 'got accustom high level graphic violenc', 'violenc', 'injustic crook guard wholl sold nickel', 'inmat wholl kill or