# Download the dataset archive (`aclImdb_v1.tar.gz`) from https://ai.stanford.edu/~amaas/data/sentiment/ and place it in the notebook's working directory

In [None]:
import tarfile
import os
# Archive expected in the current working directory, and the folder it is unpacked into.
filename = "aclImdb_v1.tar.gz"
extraction_path = "/content/dataset/"

# Create the destination up front; exist_ok keeps the cell re-runnable.
os.makedirs(extraction_path, exist_ok=True)

with tarfile.open(filename, 'r:gz') as tar:
    # The archive comes from the internet, so use the 'data' extraction filter
    # where the interpreter provides it: it refuses members that would land
    # outside extraction_path (absolute paths, "..", escaping links).
    # Interpreters without extraction filters fall back to the plain call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=extraction_path, filter='data')
    else:
        tar.extractall(path=extraction_path)


In [None]:
import pandas as pd

# Function to load the dataset from a given folder
def load_dataset(folder, root=None):
    """Load every review of one aclImdb split into a DataFrame.

    Reads all files under ``<root>/aclImdb/<folder>/pos`` and
    ``<root>/aclImdb/<folder>/neg`` as UTF-8 text.

    Args:
        folder: Name of the split directory, e.g. ``"train"`` or ``"test"``.
        root: Directory that contains the ``aclImdb`` folder. Defaults to the
            notebook-level ``extraction_path``.

    Returns:
        A DataFrame with a ``text`` column (file contents) and a ``label``
        column (``"pos"`` or ``"neg"``). All ``pos`` rows come first, and rows
        within a label are ordered by file name.

    Raises:
        FileNotFoundError: If a label directory does not exist.
    """
    if root is None:
        # Fall back to the directory the archive was extracted into.
        root = extraction_path

    data = []
    for label in ["pos", "neg"]:
        label_path = os.path.join(root, "aclImdb", folder, label)
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order (and therefore any seeded shuffle/split of the frame) reproducible.
        for file in sorted(os.listdir(label_path)):
            with open(os.path.join(label_path, file), "r", encoding="utf-8") as f:
                data.append({"text": f.read(), "label": label})

    # Explicit columns keep the schema stable even when no files were found.
    return pd.DataFrame(data, columns=["text", "label"])

# Read both official splits from disk into DataFrames
train_data = load_dataset(folder="train")
test_data = load_dataset(folder="test")


In [None]:
# Peek at the first row of the training split
print("Training Dataset:", train_data.head(1), sep="\n")

# Peek at the first five rows of the testing split
print("\nTesting Dataset:", test_data.head(), sep="\n")


Training Dataset:
                                                text label
0  Street Fight is a brilliant piece of brutal sa...   pos

Testing Dataset:
                                                text label
0  This film was original in an unoriginal way. A...   pos
1  An extremely dark and brooding show with an ex...   pos
2  First off, I absolutely loved this movie. As a...   pos
3  This is not the kind of movie that really meri...   pos
4  Julie Andrews and Rock Hudson were great in th...   pos


In [None]:
# How many reviews carry each label in the training split
print(f"Training Label Distribution:\n{train_data['label'].value_counts()}")

# Same count for the testing split
print(f"\nTesting Label Distribution:\n{test_data['label'].value_counts()}")


Training Label Distribution:
pos    12500
neg    12500
Name: label, dtype: int64

Testing Label Distribution:
pos    12500
neg    12500
Name: label, dtype: int64


In [None]:
# First training review labelled 'pos'
print("Positive Review Example:")
print(train_data.loc[train_data['label'] == 'pos', 'text'].iloc[0])

# First training review labelled 'neg'
print("\nNegative Review Example:")
print(train_data.loc[train_data['label'] == 'neg', 'text'].iloc[0])


Positive Review Example:
Street Fight is a brilliant piece of brutal satire. This is not a movie you just watch for fun. It is not a comfortable experience, although it does have some laugh-out-loud moments. This is a movie you watch when you need food for thought.<br /><br />To dismiss this film as simply racist is to miss the point entirely. This is not only a satire of Song of the South, it's also a biting commentary on the prejudices that Americans still have as a society. Every ethnic group portrayed in the movie gets shown as grotesque caricatures of their stereotypes, which in turn are grotesque caricatures of real people. Through this wild exaggeration, the filmmaker shows just how absurd these tightly-held beliefs really are.<br /><br />If you're the sort of person who's willing to acknowledge the ugliness of the prevalent prejudices American culture still holds, and if you're not afraid to look your own prejudices in the eye, this movie may be for you.

Negative Review Exampl

In [None]:
from bs4 import BeautifulSoup
import re  # Add this import for the 're' module
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK resources used by the preprocessing functions below:
# the stopword lists and the Punkt models behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so fetch that as well. nltk.download reports
# failures by return value rather than raising, so this is harmless elsewhere.
nltk.download('punkt_tab')

# Text cleaning function
def clean_text(text):
    """Normalise one raw review into lowercase text made of letters and whitespace.

    Args:
        text: Raw review string, possibly containing HTML markup such as
            ``<br />``.

    Returns:
        The review with HTML tags removed, lowercased, and every character
        other than ASCII letters and whitespace deleted.
    """
    # Remove HTML tags. The separator puts a space where each tag was, so the
    # words on either side of e.g. "thought.<br /><br />To" are not fused into
    # a single token once punctuation is stripped below.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    return text

# Tokenization function
def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

# Stopword removal function
# English stopword set, filled on first use so the corpus is read only once
# instead of once per review.
_STOPWORDS_CACHE = {}


def remove_stopwords(tokens, stop_words=None):
    """Drop stopwords from a token list.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional container of words to remove. When omitted, the
            NLTK English stopword list is used (loaded once and cached).

    Returns:
        A new list with the tokens that are not in ``stop_words``, in their
        original order.
    """
    if stop_words is None:
        if 'english' not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']
    return [token for token in tokens if token not in stop_words]

# Stemming function
def stem_text(tokens):
    """Return the Porter stem of every token, preserving order."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

# Run the training reviews through the preprocessing steps,
# storing each stage in its own column.
train_data['clean_text'] = train_data['text'].apply(clean_text)
train_data['tokens'] = (
    train_data['clean_text']
    .apply(tokenize_text)
    .apply(remove_stopwords)
)
train_data['stemmed_tokens'] = train_data['tokens'].apply(stem_text)

# Show the raw text next to every preprocessing stage
print(train_data[['text', 'clean_text', 'tokens', 'stemmed_tokens']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()


                                                text  \
0  Street Fight is a brilliant piece of brutal sa...   
1  I just came back from the Late-night cinema an...   
2  I sat with my children as we watched this film...   
3  The Straight Story is the tale of an old man w...   
4  Okay, note to the people that put together the...   

                                          clean_text  \
0  street fight is a brilliant piece of brutal sa...   
1  i just came back from the latenight cinema and...   
2  i sat with my children as we watched this film...   
3  the straight story is the tale of an old man w...   
4  okay note to the people that put together thes...   

                                              tokens  \
0  [street, fight, brilliant, piece, brutal, sati...   
1  [came, back, latenight, cinema, indeed, silent...   
2  [sat, children, watched, film, found, entertai...   
3  [straight, story, tale, old, man, decides, vis...   
4  [okay, note, people, put, together, horror,

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary (capped at 5000 terms; tune max_features as needed)
# from the cleaned training text and vectorize it in the same step.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['clean_text'])

# Report (documents, features)
print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")

Shape of TF-IDF matrix: (25000, 5000)


In [None]:
# Run the testing reviews through the same preprocessing steps as the
# training split, storing each stage in its own column.
test_data['clean_text'] = test_data['text'].apply(clean_text)
test_data['tokens'] = (
    test_data['clean_text']
    .apply(tokenize_text)
    .apply(remove_stopwords)
)
test_data['stemmed_tokens'] = test_data['tokens'].apply(stem_text)

# Show the raw text next to every preprocessing stage
print(test_data[['text', 'clean_text', 'tokens', 'stemmed_tokens']].head())


  text = BeautifulSoup(text, 'html.parser').get_text()


                                                text  \
0  This film was original in an unoriginal way. A...   
1  An extremely dark and brooding show with an ex...   
2  First off, I absolutely loved this movie. As a...   
3  This is not the kind of movie that really meri...   
4  Julie Andrews and Rock Hudson were great in th...   

                                          clean_text  \
0  this film was original in an unoriginal way al...   
1  an extremely dark and brooding show with an ex...   
2  first off i absolutely loved this movie as a b...   
3  this is not the kind of movie that really meri...   
4  julie andrews and rock hudson were great in th...   

                                              tokens  \
0  [film, original, unoriginal, way, although, ma...   
1  [extremely, dark, brooding, show, excellent, c...   
2  [first, absolutely, loved, movie, billy, cryst...   
3  [kind, movie, really, merits, critical, attent...   
4  [julie, andrews, rock, hudson, great, movie

In [None]:
# Sanity check: list test_data's columns, then confirm 'clean_text' is among them
print(test_data.columns, 'clean_text' in test_data.columns, sep="\n")


Index(['text', 'label', 'clean_text', 'tokens', 'stemmed_tokens'], dtype='object')
True


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit a multinomial Naive Bayes model on the training TF-IDF matrix and labels
nb_classifier = MultinomialNB().fit(X_train_tfidf, train_data['label'])

# Vectorize the test reviews with the vocabulary learned from the training data,
# then predict their labels
X_test_tfidf = tfidf_vectorizer.transform(test_data['clean_text'])
predictions = nb_classifier.predict(X_test_tfidf)

# Overall accuracy on the test split
accuracy = accuracy_score(test_data['label'], predictions)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print(
    "\nClassification Report:",
    classification_report(test_data['label'], predictions),
    sep="\n",
)


Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

         neg       0.83      0.86      0.84     12500
         pos       0.85      0.82      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



## Ensemble methods: Random Forest and Gradient Boosting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Requires train_data with 'clean_text' and 'label' columns.

# Hold out 20% of the labelled training reviews for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    train_data['clean_text'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary on the training portion only (max_features is tunable),
# then reuse that vocabulary for the held-out portion
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Build and train a 100-tree Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict labels for the held-out reviews
rf_predictions = rf_classifier.predict(X_test_tfidf)

# Overall accuracy of the Random Forest on the held-out reviews
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

# Per-class precision / recall / F1
print(
    "\nRandom Forest Classification Report:",
    classification_report(y_test, rf_predictions),
    sep="\n",
)


Random Forest Accuracy: 0.8348

Random Forest Classification Report:
              precision    recall  f1-score   support

         neg       0.83      0.84      0.83      2485
         pos       0.84      0.83      0.83      2515

    accuracy                           0.83      5000
   macro avg       0.83      0.83      0.83      5000
weighted avg       0.83      0.83      0.83      5000



In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib is a
# scikit-learn dependency and is imported directly instead.
import joblib

# Requires train_data with 'clean_text' and 'label' columns.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['clean_text'], train_data['label'], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features

# Transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Create a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the Gradient Boosting classifier
gb_classifier.fit(X_train_tfidf, y_train)

# Transform the testing data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Predict the labels for the testing data
gb_predictions = gb_classifier.predict(X_test_tfidf)

# Save the trained model to a file
model_filename = 'gradient_boosting_model.pkl'
joblib.dump(gb_classifier, model_filename)

# Save the TF-IDF vectorizer to a file; the model is only usable together
# with the vocabulary it was trained on
vectorizer_filename = 'tfidf_vectorizer.pkl'
joblib.dump(tfidf_vectorizer, vectorizer_filename)

# Evaluate the performance of the Gradient Boosting model
gb_accuracy = accuracy_score(y_test, gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)

# Display classification report for Gradient Boosting
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, gb_predictions))


Gradient Boosting Accuracy: 0.8182

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

         neg       0.86      0.76      0.81      2485
         pos       0.79      0.87      0.83      2515

    accuracy                           0.82      5000
   macro avg       0.82      0.82      0.82      5000
weighted avg       0.82      0.82      0.82      5000



# **Advanced neural network architecture** using TensorFlow's Keras API. In this case, we'll stack two ***bidirectional LSTM layers***, a type of recurrent neural network (RNN), on top of an embedding layer.

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Requires train_data with 'clean_text' and 'label' columns.

# Shared hyperparameters. The tokenizer's vocabulary cap must match the
# Embedding layer's input_dim, and the padding length must match its
# input_length, so each value is defined once.
vocab_size = 5000
max_len = 200  # Adjust as needed

# Convert string labels to numerical format
label_mapping = {'neg': 0, 'pos': 1}
train_data['label_numeric'] = train_data['label'].map(label_mapping)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['clean_text'], train_data['label_numeric'], test_size=0.2, random_state=42)

# Tokenize and pad the sequences for neural network input.
# The LSTM consumes integer token sequences, so no TF-IDF features are built here.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)

# Build a bidirectional LSTM neural network model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len))
# return_sequences=True hands the full sequence to the second recurrent layer
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(128, activation='relu'))
# Single sigmoid unit: outputs a score in (0, 1), thresholded below to get the class
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the neural network
model.fit(X_train_padded, y_train, epochs=5, batch_size=64)

# Encode the held-out reviews with the tokenizer fitted on the training portion
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)

# Predict the probabilities for the testing data
lstm_probabilities = model.predict(X_test_padded)

# Convert probabilities to class predictions using a threshold (e.g., 0.5).
# ravel() flattens the (n, 1) model output into a 1-D label vector like y_test.
threshold = 0.5
lstm_predictions = (lstm_probabilities > threshold).astype(int).ravel()

# Evaluate the performance of the LSTM model
lstm_accuracy = accuracy_score(y_test, lstm_predictions)
print("LSTM Accuracy:", lstm_accuracy)

# Display classification report for LSTM
print("\nLSTM Classification Report:")
print(classification_report(y_test, lstm_predictions))

# Save the model
# NOTE(review): the fitted tokenizer is not saved; reusing lstm_model.h5 for
# inference requires the same word index.
model.save('lstm_model.h5')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Accuracy: 0.853

LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      2485
           1       0.85      0.86      0.85      2515

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



  saving_api.save_model(
