# WorkShop_2 Pre-processing

- Use the IMDB Dataset of 50K Movie Reviews (train and test)
- Step 1: Text cleaning
    - Remove special chars, numbers, and extra spaces
- Step 2. Tokenization
    - Split into sentences and words
- Step 3. Lowercasing and Stop word removal
    - Covert text to lowercase
- Step 4 Emoticons, Stemming and Lemmatization
    - Final: Check readability score Flesch-Kincaid (report in class)

In [1]:
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import textstat

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Define preprocessing functions
def clean_text(text):
    """
    Remove special characters and numbers from text, and remove extra spaces.
    """
    text = re.sub(r'[^A-Za-z\\s]', '', text)
    text = re.sub(r'\\s+', ' ', text).strip()
    return text

def tokenize_sentences(text):
    """Tokenize text into sentences."""
    return sent_tokenize(text)

def tokenize_words(text):
    """Tokenize text into words."""
    return word_tokenize(text)

def remove_stopwords(tokens):
    """Remove stopwords from a list of tokens."""
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def to_lowercase(tokens):
    """Convert tokens to lowercase."""
    return [word.lower() for word in tokens]

def stem_tokens(tokens):
    """Apply stemming to tokens."""
    ps = PorterStemmer()
    return [ps.stem(word) for word in tokens]

def lemmatize_tokens(tokens):
    """Apply lemmatization to tokens."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

def calculate_readability(text):
    """Calculate the Flesch-Kincaid grade level of the text."""
    return textstat.flesch_kincaid_grade(text)


In [3]:
# Function to preprocess text step-by-step
def preprocess_with_grading(text):
    """
    Preprocess text through multiple steps and calculate readability for each step.
    """
    results = {"Original": text, "Steps": []}
    steps = [
        ("Clean Text", lambda t: clean_text(t)),
        ("Tokenize Sentences", lambda t: ' '.join(tokenize_sentences(t))),
        ("Tokenize Words", lambda t: ' '.join(tokenize_words(t))),
        ("Lowercase", lambda t: ' '.join(to_lowercase(tokenize_words(t)))),
        ("Remove Stopwords", lambda t: ' '.join(remove_stopwords(to_lowercase(tokenize_words(t))))),
        ("Stemming", lambda t: ' '.join(stem_tokens(remove_stopwords(to_lowercase(tokenize_words(t)))))),
        ("Lemmatization", lambda t: ' '.join(lemmatize_tokens(stem_tokens(remove_stopwords(to_lowercase(tokenize_words(t)))))))
    ]

    for name, func in steps:
        text = func(text)
        readability = calculate_readability(text)
        results["Steps"].append({"Step": name, "Text": text, "Readability": readability})

    return results


In [4]:
# Load CSV files
f1 = 'data/test.csv'
f2 = 'data/train.csv'

df1 = pd.read_csv(f1)
df2 = pd.read_csv(f2)

# Process the files
def process_file(df, file_name):
    """
    Process a DataFrame by applying text preprocessing steps and saving results.
    """
    results = []
    for _, row in df.iterrows():
        processed = preprocess_with_grading(row['text'])
        results.append(processed)

    # Save intermediate results to CSV
    steps_data = []
    for entry in results:
        original = entry["Original"]
        for step in entry["Steps"]:
            steps_data.append(
                {"Original": original, "Step": step["Step"], "Text": step["Text"], "Readability": step["Readability"]})

    steps_df = pd.DataFrame(steps_data)
    steps_df.to_csv(f"{file_name}_preprocessed_steps.csv", index=False)

    # Calculate overall readability grade
    overall_readability = steps_df.groupby("Step")["Readability"].mean().to_dict()
    print(f"Overall Readability Grades for {file_name}: {overall_readability}")

    return overall_readability


In [5]:
# Process test and train files
test_readability = process_file(df1, "test")
train_readability = process_file(df2, "train")


Overall Readability Grades for test: {'Clean Text': 2673.376664, 'Lemmatization': 2670.592336, 'Lowercase': 2673.376664, 'Remove Stopwords': 2673.376664, 'Stemming': 2670.795768, 'Tokenize Sentences': 2673.376664, 'Tokenize Words': 2673.376664}
Overall Readability Grades for train: {'Clean Text': 2741.811, 'Lemmatization': 2738.927552, 'Lowercase': 2741.811, 'Remove Stopwords': 2741.811, 'Stemming': 2739.1517519999998, 'Tokenize Sentences': 2741.811, 'Tokenize Words': 2741.811}
