# Download and Extract Cornell Movie Review Polarity Dataset

In [None]:
import urllib.request
import tarfile
import os

# URL of the dataset
url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz"
dataset_folder = "rt-polaritydata"

# Function to download the dataset
def download_dataset(url, download_path):
    """Download `url` to `download_path` unless that file already exists.

    The payload is first written to ``<download_path>.part`` and only renamed
    to `download_path` after the transfer finished, so an interrupted download
    is never mistaken for a complete one on the next run.

    Args:
        url: Location of the archive to fetch.
        download_path: Local file path the archive is stored at.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises; the partial file is
        removed before the exception propagates.
    """
    if os.path.exists(download_path):
        print("Dataset already downloaded.")
        return

    print(f"Downloading dataset from {url}...")
    partial_path = f"{download_path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # os.replace is atomic on the same filesystem: the final name only
        # ever points at a fully written file.
        os.replace(partial_path, download_path)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print("Download complete.")

# Function to extract the tar.gz file
def _reject_unsafe_members(tar, extract_to):
    """Raise ValueError if an archive member's path resolves outside `extract_to`."""
    root = os.path.realpath(extract_to)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise ValueError(f"Unsafe path in archive: {member.name!r}")


def extract_dataset(tar_path, extract_to):
    """Extract the gzip-compressed tar at `tar_path` into `extract_to`.

    Nothing is done when `extract_to` already exists. Because the archive was
    downloaded from the network, members are not trusted: the ``data``
    extraction filter is used when this Python provides it, otherwise member
    paths are checked by hand before extracting.

    Args:
        tar_path: Path to the ``.tar.gz`` archive.
        extract_to: Directory the archive content is written to.

    Raises:
        tarfile.FilterError or ValueError: if a member would be written
            outside `extract_to`.
    """
    if os.path.exists(extract_to):
        print("Dataset already extracted.")
        return

    print(f"Extracting {tar_path}...")
    with tarfile.open(tar_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Rejects absolute paths, '..' traversal and links leaving the target.
            tar.extractall(path=extract_to, filter="data")
        else:
            # Older Python without extraction filters: basic path check only.
            _reject_unsafe_members(tar, extract_to)
            tar.extractall(path=extract_to)
    print(f"Extraction complete. Files extracted to {extract_to}")

# Fetch the archive (when missing) and unpack it (when not yet unpacked)
download_path = "rt-polaritydata.tar.gz"
download_dataset(url, download_path)
extract_dataset(download_path, dataset_folder)

# Show what ended up in the extraction folder
extracted_entries = os.listdir(dataset_folder)
print(f"Extracted files: {extracted_entries}")


Downloading dataset from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz...
Download complete.
Extracting rt-polaritydata.tar.gz...
Extraction complete. Files extracted to rt-polaritydata
Extracted files: ['rt-polaritydata.README.1.0.txt', 'rt-polaritydata']


In [None]:
%cd rt-polaritydata/rt-polaritydata

/content/rt-polaritydata/rt-polaritydata


In [None]:
%ls

rt-polarity.neg  rt-polarity.pos


## Performing TF-IDF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load data (context managers close the file handles deterministically)
with open("rt-polarity.pos", "r", encoding="latin-1") as pos_file:
    pos_reviews = pos_file.readlines()
with open("rt-polarity.neg", "r", encoding="latin-1") as neg_file:
    neg_reviews = neg_file.readlines()

# Create labels
pos_labels = [1] * len(pos_reviews)
neg_labels = [0] * len(neg_reviews)

# Combine data
all_reviews = pos_reviews + neg_reviews
all_labels = pos_labels + neg_labels

# Train-validation-test split
# all_reviews is ordered (every positive review first, then every negative
# one), so slicing it by position would put only negative reviews into the
# validation and test sets. Shuffle with stratification instead, keeping the
# same split sizes: 8000 train / 1000 validation / remainder test.
TRAIN_SIZE = 8000
VAL_SIZE = 1000

train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    all_reviews, all_labels,
    train_size=TRAIN_SIZE, stratify=all_labels, random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels,
    train_size=VAL_SIZE, stratify=holdout_labels, random_state=42,
)

# Text Vectorization (TF-IDF)
# The vocabulary and IDF weights are fitted on the training texts only.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)


# Logistic Regression without preprocessing

In [None]:
# Fit a default logistic-regression classifier on the TF-IDF training matrix
clf = LogisticRegression()
clf.fit(X_train, train_labels)

# Score the validation split
val_preds = clf.predict(X_val)
val_accuracy = accuracy_score(val_labels, val_preds)
print(f"Validation Accuracy: {val_accuracy}")

# Score the test split and print the per-class report
test_preds = clf.predict(X_test)
test_accuracy = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_accuracy}")
test_report = classification_report(test_labels, test_preds)
print(test_report)

Validation Accuracy: 0.417
Test Accuracy: 0.4362214199759326
              precision    recall  f1-score   support

           0       1.00      0.44      0.61      1662
           1       0.00      0.00      0.00         0

    accuracy                           0.44      1662
   macro avg       0.50      0.22      0.30      1662
weighted avg       1.00      0.44      0.61      1662



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# SVM Classifier

In [None]:
# ==================
# SVM Classifier
# ==================
from sklearn import svm

print("\nTraining SVM Classifier...")
# Linear-kernel support vector classifier with C=1.0, fitted on the TF-IDF features
svm_clf = svm.SVC(kernel='linear', C=1.0)
svm_clf.fit(X_train, train_labels)

# Validation split
val_preds_svm = svm_clf.predict(X_val)
svm_val_accuracy = accuracy_score(val_labels, val_preds_svm)
print(f"SVM Validation Accuracy: {svm_val_accuracy}")

# Test split, followed by the per-class report
test_preds_svm = svm_clf.predict(X_test)
svm_test_accuracy = accuracy_score(test_labels, test_preds_svm)
print(f"SVM Test Accuracy: {svm_test_accuracy}")
print("SVM Classification Report:")
svm_report = classification_report(test_labels, test_preds_svm)
print(svm_report)



Training SVM Classifier...
SVM Validation Accuracy: 0.546
SVM Test Accuracy: 0.5511432009626955
SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.55      0.71      1662
           1       0.00      0.00      0.00         0

    accuracy                           0.55      1662
   macro avg       0.50      0.28      0.36      1662
weighted avg       1.00      0.55      0.71      1662



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Random Forest Classifier

In [None]:

# ==================
# Random Forest Classifier
# ==================
from sklearn.ensemble import RandomForestClassifier

print("\nTraining Random Forest Classifier...")
# 100 trees, seeded with random_state=42
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, train_labels)

# Validation split
val_preds_rf = rf_clf.predict(X_val)
rf_val_accuracy = accuracy_score(val_labels, val_preds_rf)
print(f"Random Forest Validation Accuracy: {rf_val_accuracy}")

# Test split, followed by the per-class report
test_preds_rf = rf_clf.predict(X_test)
rf_test_accuracy = accuracy_score(test_labels, test_preds_rf)
print(f"Random Forest Test Accuracy: {rf_test_accuracy}")
print("Random Forest Classification Report:")
rf_report = classification_report(test_labels, test_preds_rf)
print(rf_report)


Training Random Forest Classifier...
Random Forest Validation Accuracy: 0.291
Random Forest Test Accuracy: 0.31167268351383876
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.31      0.48      1662
           1       0.00      0.00      0.00         0

    accuracy                           0.31      1662
   macro avg       0.50      0.16      0.24      1662
weighted avg       1.00      0.31      0.48      1662



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# With preprocessing

*   Vectorization through TF-IDF
*   Running for Logistic Regression, Support Vector Machine, Random Forest Classifier



In [None]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk

# Fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Lemmatizer plus a stop-word set that keeps the negation words
lemmatizer = WordNetLemmatizer()
default_stopwords = set(stopwords.words('english'))
negation_words = {'not', 'no', 'nor'}
custom_stopwords = default_stopwords.difference(negation_words)

# Preprocessing functions
def to_lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_special_chars(text):
    """Delete every character that is neither an ASCII letter nor whitespace.

    Characters are removed, not replaced, so "co-writer" becomes "cowriter".
    """
    non_letter = re.compile(r"[^a-zA-Z\s]")
    return non_letter.sub("", text)

def remove_stopwords(text):
    """Tokenize `text` and drop every token found in `custom_stopwords`.

    Returns the remaining tokens joined by single spaces.
    """
    kept = filter(lambda token: token not in custom_stopwords, word_tokenize(text))
    return " ".join(kept)

def lemmatize_text(text):
    """Tokenize `text`, lemmatize each token, and rejoin with single spaces."""
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return " ".join(lemmas)

def preprocess_text(text):
    """Run the cleaning pipeline on one review and return the result.

    Steps, in order: lower-casing, special-character removal, stop-word
    removal, lemmatization.
    """
    return lemmatize_text(
        remove_stopwords(
            remove_special_chars(
                to_lowercase(text)
            )
        )
    )

# Load data (context managers close the file handles deterministically)
with open("rt-polarity.pos", "r", encoding="latin-1") as pos_file:
    pos_reviews = pos_file.readlines()
with open("rt-polarity.neg", "r", encoding="latin-1") as neg_file:
    neg_reviews = neg_file.readlines()

# Create labels
pos_labels = [1] * len(pos_reviews)
neg_labels = [0] * len(neg_reviews)

# Combine data
all_reviews = pos_reviews + neg_reviews
all_labels = pos_labels + neg_labels

# Preprocess all reviews
all_reviews = [preprocess_text(review) for review in all_reviews]

# Train-validation-test split: 70% train, then the remaining 30% is halved
# into validation and test (train_test_split shuffles by default).
X_train, X_temp, y_train, y_temp = train_test_split(all_reviews, all_labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Text Vectorization (TF-IDF)
# The vocabulary and IDF weights are fitted on the training texts only.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Model Training and Evaluation
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    # Seeded so the reported Random Forest scores are reproducible across runs
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# model name -> accuracy and weighted-average precision/recall/F1 on the test set
results = {}

for model_name, model in models.items():
    # Fit the model
    model.fit(X_train_tfidf, y_train)

    # Predict on test set
    y_pred = model.predict(X_test_tfidf)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    results[model_name] = {
        "accuracy": accuracy,
        "precision": report['weighted avg']['precision'],
        "recall": report['weighted avg']['recall'],
        "f1-score": report['weighted avg']['f1-score']
    }

# Convert results to DataFrame for better visualization (one row per model)
results_df = pd.DataFrame(results).T
print(results_df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                        accuracy  precision   recall  f1-score
Logistic Regression      0.75750   0.757514  0.75750  0.757497
Support Vector Machine   0.76125   0.761276  0.76125  0.761244
Random Forest            0.68875   0.689490  0.68875  0.688446


# Making Custom Model

*   Using GloVe embeddings to prepare the embedding matrix
*   Sequential + Embedding + LSTM



In [None]:
# -nc: skip the 822 MB download when glove.6B.zip is already present (a plain
#      re-run would save a second copy as glove.6B.zip.1).
# -n:  never overwrite already-extracted files, so unzip does not stop to ask.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip

--2024-09-25 19:03:37--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-09-25 19:03:37--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-09-25 19:03:38--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
!ls

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip     rt-polarity.pos
glove.6B.200d.txt  glove.6B.50d.txt   rt-polarity.neg


In [None]:
!pip install tensorflow
!pip install keras
!pip install Keras-Preprocessing

Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [None]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, LSTM
import nltk

# Fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Lemmatizer plus a stop-word set that keeps the negation words
lemmatizer = WordNetLemmatizer()
default_stopwords = set(stopwords.words('english'))
negation_words = {'not', 'no', 'nor'}
custom_stopwords = default_stopwords.difference(negation_words)

# Preprocessing functions
def to_lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_special_chars(text):
    """Delete every character that is neither an ASCII letter nor whitespace."""
    non_letter = re.compile(r"[^a-zA-Z\s]")
    return non_letter.sub("", text)

def remove_stopwords(text):
    """Tokenize `text` and drop every token found in `custom_stopwords`.

    Returns the remaining tokens joined by single spaces.
    """
    kept = filter(lambda token: token not in custom_stopwords, word_tokenize(text))
    return " ".join(kept)

def lemmatize_text(text):
    """Tokenize `text`, lemmatize each token, and rejoin with single spaces."""
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return " ".join(lemmas)

def preprocess_text(text):
    """Run the cleaning pipeline on one review and return the result.

    Steps, in order: lower-casing, special-character removal, stop-word
    removal, lemmatization.
    """
    return lemmatize_text(
        remove_stopwords(
            remove_special_chars(
                to_lowercase(text)
            )
        )
    )

# Load data (context managers close the file handles deterministically)
with open("rt-polarity.pos", "r", encoding="latin-1") as pos_file:
    pos_reviews = pos_file.readlines()
with open("rt-polarity.neg", "r", encoding="latin-1") as neg_file:
    neg_reviews = neg_file.readlines()

# Create labels
pos_labels = [1] * len(pos_reviews)
neg_labels = [0] * len(neg_reviews)

# Combine data
all_reviews = pos_reviews + neg_reviews
all_labels = pos_labels + neg_labels

# Preprocess all reviews
all_reviews = [preprocess_text(review) for review in all_reviews]

# Train-validation-test split: 70% train, then the remaining 30% is halved
# into validation and test (train_test_split shuffles by default).
X_train, X_temp, y_train, y_temp = train_test_split(all_reviews, all_labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    """Read a whitespace-separated embedding file into a dict.

    Each line is split on whitespace; the first field is used as the word and
    the remaining fields are converted to a float32 NumPy array.

    Args:
        file_path: Path to the UTF-8 encoded embedding text file.

    Returns:
        dict mapping word -> float32 vector. If a word occurs on several
        lines, the last one wins.
    """
    vectors_by_word = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for fields in map(str.split, handle):
            vectors_by_word[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return vectors_by_word

# Specify the GloVe file path. The variant's dimensionality must match `embedding_dim` below (300 for 'glove.6B.300d.txt').
glove_file_path = 'glove.6B.300d.txt'  # Replace with the correct path to your GloVe file
embeddings_index = load_glove_embeddings(glove_file_path)

# Prepare the tokenizer
# The tokenizer is capped with the same limit as the embedding matrix below.
# Without the cap, a vocabulary larger than the limit would produce token ids
# that have no row in the embedding matrix.
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training texts only
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_length = max(len(x) for x in X_train_seq)  # Get the maximum length of the sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# Prepare embedding matrix
embedding_dim = 300  # must match the dimensionality of the loaded GloVe file
word_index = tokenizer.word_index
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)  # +1 because index 0 is reserved for padding
embedding_matrix = np.zeros((num_words, embedding_dim))

# Rows of words without a GloVe vector stay all-zero
for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Build the optimized model
model = Sequential()
# Frozen embedding layer initialised from the pre-built embedding matrix
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(128, return_sequences=False))  # LSTM layer
model.add(Dropout(0.5))  # Dropout layer
model.add(Dense(64, activation='relu'))  # Increased complexity
model.add(Dense(1, activation='sigmoid'))

# Compile the model with a different learning rate if necessary
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# patience must be smaller than the number of epochs, otherwise the callback
# can never trigger; restore_best_weights rolls the model back to the epoch
# with the lowest validation loss instead of keeping the last one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
X_train_pad = np.array(X_train_pad)
y_train = np.array(y_train)
X_val_pad = np.array(X_val_pad)
y_val = np.array(y_val)
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val), callbacks=[early_stopping])

# Evaluate the model
X_test_pad = np.array(X_test_pad)
y_test = np.array(y_test)
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 101ms/step - accuracy: 0.6680 - loss: 0.6017 - val_accuracy: 0.7480 - val_loss: 0.5111
Epoch 2/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 103ms/step - accuracy: 0.7700 - loss: 0.4779 - val_accuracy: 0.7630 - val_loss: 0.4844
Epoch 3/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 108ms/step - accuracy: 0.7836 - loss: 0.4514 - val_accuracy: 0.7686 - val_loss: 0.4872
Epoch 4/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 103ms/step - accuracy: 0.8299 - loss: 0.3747 - val_accuracy: 0.7598 - val_loss: 0.4811
Epoch 5/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 110ms/step - accuracy: 0.8623 - loss: 0.3221 - val_accuracy: 0.7649 - val_loss: 0.5267
Epoch 6/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 101ms/step - accuracy: 0.8955 - loss: 0.2552 - val_accuracy: 0.7636 - val_loss: 0.5418
Epoch 7/10

# Fine-tuning BERT (with the BERT tokenizer)

In [None]:
!pip install tensorflow-hub
!pip install transformers torch
!pip install torch-xla



In [None]:
import numpy as np
import re
import nltk
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim

# Fetch the NLTK resources used by the tokenizer and the stop-word list
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop-word set used by preprocess_text below
english_stopword_list = nltk.corpus.stopwords.words('english')
default_stopwords = set(english_stopword_list)

# Preprocessing functions
def preprocess_text(text):
    """Lower-case `text`, strip non-letter characters, and drop stop words.

    Everything except ASCII letters and whitespace is deleted, the result is
    tokenized with nltk.word_tokenize, tokens contained in `default_stopwords`
    are removed, and the remaining tokens are joined with single spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    kept_tokens = (
        token
        for token in nltk.word_tokenize(letters_only)
        if token not in default_stopwords
    )
    return " ".join(kept_tokens)

# Load data (context managers close the file handles deterministically)
with open("rt-polarity.pos", "r", encoding="latin-1") as pos_file:
    pos_reviews = pos_file.readlines()
with open("rt-polarity.neg", "r", encoding="latin-1") as neg_file:
    neg_reviews = neg_file.readlines()

# Create labels
pos_labels = [1] * len(pos_reviews)
neg_labels = [0] * len(neg_reviews)

# Combine data
all_reviews = pos_reviews + neg_reviews
all_labels = pos_labels + neg_labels

# Preprocess all reviews
all_reviews = [preprocess_text(review) for review in all_reviews]

# Train-validation-test split: 70% train, then the remaining 30% is halved
# into validation and test (train_test_split shuffles by default).
X_train, X_temp, y_train, y_temp = train_test_split(all_reviews, all_labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create a custom dataset class
class SentimentDataset(Dataset):
    """Dataset pairing review texts with labels, tokenized on access.

    Items are encoded with the module-level `tokenizer`, padded or truncated
    to 128 tokens.
    """

    def __init__(self, texts, labels):
        """Keep references to the parallel sequences `texts` and `labels`."""
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Number of samples, taken from the length of `texts`."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' for sample `idx`."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = tokenizer.encode_plus(
            text,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt',
        )
        # flatten() turns the tensors returned for this single text into 1-D
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample

# One dataset per split
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels)
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Batches of 32; only the training loader shuffles
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Create a simple classifier using BERT
class SentimentClassifier(nn.Module):
    """'bert-base-uncased' encoder followed by dropout and a 2-way linear layer."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, 2)  # two output logits: binary classification

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output for the given batch.

        The second element of the BERT output is used as the sequence
        representation, passed through dropout, then through `self.fc`.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = bert_outputs[1]
        return self.fc(self.dropout(pooled))

# Initialize the model, optimizer, and loss function
# Use the GPU when one is available; the model and every batch must live on
# the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier = SentimentClassifier().to(device)
optimizer = optim.Adam(classifier.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training Loop
def train_model(model, data_loader):
    """Run one training epoch of `model` over `data_loader`.

    Uses the module-level `optimizer` and `criterion`. Each batch is moved to
    the device the model's parameters live on before the forward pass.
    """
    model.train()
    model_device = next(model.parameters()).device
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(model_device)
        attention_mask = batch['attention_mask'].to(model_device)
        labels = batch['label'].to(model_device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Validation Loop
def evaluate_model(model, data_loader):
    """Return the accuracy of `model` on `data_loader`.

    Runs in eval mode without gradient tracking; the predicted class is the
    argmax over the model's outputs.
    """
    model.eval()
    model_device = next(model.parameters()).device
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['label']  # only compared on the CPU, so left where it is

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            # .cpu() is required before .numpy() when the model runs on a GPU
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.numpy())
    return accuracy_score(true_labels, predictions)

# Train and evaluate
for epoch in range(5):  # Fewer epochs to speed up the process
    train_model(classifier, train_loader)
    val_accuracy = evaluate_model(classifier, val_loader)
    print(f'Epoch {epoch + 1}, Validation Accuracy: {val_accuracy:.4f}')

# Test the model
test_accuracy = evaluate_model(classifier, test_loader)
print(f'Test Accuracy: {test_accuracy:.4f}')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Epoch 1, Validation Accuracy: 0.7980
Epoch 2, Validation Accuracy: 0.7799
Epoch 3, Validation Accuracy: 0.8080
Epoch 4, Validation Accuracy: 0.7980
Epoch 5, Validation Accuracy: 0.8155
Test Accuracy: 0.7981


# Checking preprocessing techniques

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter

# Fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer
import nltk
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Tools shared by the preprocessing helpers below
lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Original dataset (for comparison)
# Read the raw lines straight from the file. Relying on `all_reviews` left
# behind by an earlier cell would pick up already-preprocessed text when the
# notebook is run top to bottom.
with open("rt-polarity.pos", "r", encoding="latin-1") as raw_file:
    original_data = raw_file.readlines()[:5]  # Sample 5 reviews for demonstration

# Lowercase text
def to_lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

# Remove special characters, punctuation, and numbers
def remove_special_chars(text):
    """Delete every character that is neither an ASCII letter nor whitespace."""
    non_letter = re.compile(r"[^a-zA-Z\s]")
    return non_letter.sub("", text)

# Remove stop words
def remove_stopwords(text):
    """Tokenize `text` and drop every token found in `stop_words`.

    Returns the remaining tokens joined by single spaces.
    """
    kept = filter(lambda token: token not in stop_words, word_tokenize(text))
    return " ".join(kept)

# Apply lemmatization
def lemmatize_text(text):
    """Tokenize `text`, lemmatize each token, and rejoin with single spaces."""
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return " ".join(lemmas)

# Apply stemming
def stem_text(text):
    """Tokenize `text`, stem each token, and rejoin with single spaces."""
    stems = map(stemmer.stem, word_tokenize(text))
    return " ".join(stems)

# Collapse runs of a repeated character to two occurrences (e.g., "woooow" -> "woow")
def remove_repeated_chars(text):
    """Shorten every run of the same character to exactly two of that character.

    Runs that are already two long are unchanged ("good" stays "good"). The
    pattern's `.` also matches spaces, so runs of spaces are shortened too.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', text)

# Remove short words
def remove_short_words(text, min_length=3):
    """Tokenize `text` and keep only tokens with at least `min_length` characters.

    Returns the kept tokens joined by single spaces.
    """
    long_enough = filter(lambda token: len(token) >= min_length, word_tokenize(text))
    return " ".join(long_enough)

# Remove extra whitespaces
def remove_extra_whitespaces(text):
    """Collapse whitespace runs to single spaces and strip both ends."""
    pieces = text.split()
    return " ".join(pieces)

# Pipeline of preprocessing techniques
def preprocess_text(text, lowercase=True, special_chars=True, stopwords=True,
                    lemmatize=False, stem=False, repeated_chars=False,
                    short_words=False, whitespaces=True):
    """Apply the enabled preprocessing steps to `text`, in a fixed order.

    Order: lower-casing, special-character removal, stop-word removal,
    lemmatization, stemming, repeated-character collapsing, short-word
    removal, whitespace normalisation. Each flag turns its step on or off.

    Returns:
        The text after all enabled steps have been applied.
    """
    # (flag, step) pairs listed in application order
    pipeline = (
        (lowercase, to_lowercase),
        (special_chars, remove_special_chars),
        (stopwords, remove_stopwords),
        (lemmatize, lemmatize_text),
        (stem, stem_text),
        (repeated_chars, remove_repeated_chars),
        (short_words, remove_short_words),
        (whitespaces, remove_extra_whitespaces),
    )
    for enabled, step in pipeline:
        if enabled:
            text = step(text)
    return text

# Compare different preprocessing techniques
def _run_on_samples(**flags):
    """Apply preprocess_text with the given flags to every sample review."""
    return [preprocess_text(sample, **flags) for sample in original_data]

techniques = {
    "Original Data": original_data,
    "Lowercase Only": _run_on_samples(special_chars=False, stopwords=False),
    "Remove Special Chars": _run_on_samples(stopwords=False),
    "Remove Stop Words": _run_on_samples(),
    "Lemmatization": _run_on_samples(lemmatize=True),
    "Stemming": _run_on_samples(stem=True),
    "Remove Repeated Chars": _run_on_samples(repeated_chars=True),
    "Remove Short Words": _run_on_samples(short_words=True)
}

# Print each technique's result for every sample, numbered from 1
for technique, processed_data in techniques.items():
    print(f"\n--- {technique} ---")
    for position, text in enumerate(processed_data, start=1):
        print(f"Text {position}: {text}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...



--- Original Data ---
Text 1: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

Text 2: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

Text 3: effective but too-tepid biopic

Text 4: if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 

Text 5: emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 


--- Lowercase Only ---
Text 1: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
Text 2: the gorgeously elaborate continuation of " the lord of the rings " trilo

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer
import nltk
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Lemmatizer and the full English stop-word set
lemmatizer = WordNetLemmatizer()
default_stopwords = set(stopwords.words('english'))

# Custom stop-word set: the default list minus the negation words
negation_words = {'not', 'no', 'nor'}
custom_stopwords = default_stopwords.difference(negation_words)

# Function to lowercase the text
def to_lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered

# Function to remove special characters and punctuation
def remove_special_chars(text):
    """Delete every character that is neither an ASCII letter nor whitespace."""
    non_letter = re.compile(r"[^a-zA-Z\s]")
    return non_letter.sub("", text)

# Function to remove stop words, but keep important ones like "not"
def remove_stopwords(text):
    """Tokenize `text` and drop every token found in `custom_stopwords`.

    Returns the remaining tokens joined by single spaces.
    """
    kept = filter(lambda token: token not in custom_stopwords, word_tokenize(text))
    return " ".join(kept)

# Function to apply lemmatization
def lemmatize_text(text):
    """Tokenize `text`, lemmatize each token, and rejoin with single spaces."""
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return " ".join(lemmas)

# Final preprocessing pipeline
def preprocess_text(text):
    """Run the final cleaning pipeline on one review and return the result.

    Steps, in order: lower-casing, special-character removal, stop-word
    removal, lemmatization.
    """
    return lemmatize_text(
        remove_stopwords(
            remove_special_chars(
                to_lowercase(text)
            )
        )
    )

# Run the final pipeline over the sample reviews
preprocessed_data = list(map(preprocess_text, original_data))

# Print the cleaned samples, numbered from 1
for position, text in enumerate(preprocessed_data, start=1):
    print(f"Text {position}: {text}")

Text 1: rock destined st century new conan he going make splash even greater arnold schwarzenegger jeanclaud van damme steven segal
Text 2: gorgeously elaborate continuation lord ring trilogy huge column word not adequately describe cowriterdirector peter jackson expanded vision j r r tolkien middleearth
Text 3: effective tootepid biopic
Text 4: sometimes like go movie fun wasabi good place start
Text 5: emerges something rare issue movie thats honest keenly observed doesnt feel like one


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
