In [6]:
import os
import pandas as pd

# Define the paths to the train and test directories
train_dir = "../IMDb_reviews/test"
test_dir = "../IMDb_reviews/test"

# Define the columns of the dataframe
columns = ["Review", "Label"]

# Load the train set into a dataframe
train_data = []
for label in ["pos", "neg"]:
    for filename in os.listdir(os.path.join(train_dir, label)):
        with open(os.path.join(train_dir, label, filename), "r") as f:
            text = f.read()
            train_data.append([text, label])
train_df = pd.DataFrame(train_data, columns=columns)

# Load the test set into a dataframe
test_data = []
for label in ["pos", "neg"]:
    for filename in os.listdir(os.path.join(test_dir, label)):
        with open(os.path.join(test_dir, label, filename), "r") as f:
            text = f.read()
            test_data.append([text, label])
test_df = pd.DataFrame(test_data, columns=columns)

# Save the dataframes to CSV files
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)


In [10]:
import re
import pandas as pd
import spacy

# Load the CSV files into pandas dataframes
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Load the spaCy English language model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Print out the first few reviews before preprocessing
print("Sample reviews before preprocessing:\n")
print(train_df.head())
print(test_df.head())


# Define a function to preprocess the text
def preprocess(text):
    # Remove HTML tags
    text = re.sub('<[^<]+?>', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove numbers and special characters
    text = re.sub('[^a-zA-Z\s]+', '', text)
    
    # Tokenize the text and remove stop words
    tokens = [token.text for token in nlp(text) if not token.is_stop]
    
    # Join the tokens back into a string
    return ' '.join(tokens)

# Apply the preprocessing function to the Review column of the train and test dataframes
train_df["Review"] = train_df["Review"].apply(preprocess)
test_df["Review"] = test_df["Review"].apply(preprocess)

# Print out the first few reviews after preprocessing
print("\nSample reviews after preprocessing:\n")
print(train_df.head())
print(test_df.head())

# Count the number of positive and negative reviews in the preprocessed data
pos_count = len(train_data[train_data["Label"]=="pos"])
neg_count = len(train_data[train_data["Label"]=="neg"])
print("\nNumber of positive reviews: ", pos_count)
print("Number of negative reviews: ", neg_count)

Sample reviews before preprocessing:

                                              Review Label
0  "The Dresser" is a small but absolutely wonder...   pos
1  Most of this political thriller presented as a...   pos
2  My favorite film this year. Great characters a...   pos
3  There was a time when not all animation was Di...   pos
4  I always liked listening to Buddy Holly and fe...   pos
                                              Review Label
0  "The Dresser" is a small but absolutely wonder...   pos
1  Most of this political thriller presented as a...   pos
2  My favorite film this year. Great characters a...   pos
3  There was a time when not all animation was Di...   pos
4  I always liked listening to Buddy Holly and fe...   pos


KeyboardInterrupt: 