<a href="https://colab.research.google.com/github/Thomasjoseph2/moview_review_classifier/blob/main/movie_review_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
   # Import required libraries
import os
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
# loding data
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz


--2024-07-03 14:06:38--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-07-03 14:06:46 (9.74 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Define stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [5]:
# Function to clean and process text
def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text)
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)

In [6]:
# Function to load data from a folder into a DataFrame
def load_data_from_folder(folder_path, label):
    file_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]
    reviews = [open(file_path, 'r', encoding='utf-8').read() for file_path in file_paths]
    df = pd.DataFrame({'review': reviews, 'label': label})
    return df

In [7]:
#Load data into DataFrames
pos_folder = os.path.join('aclImdb', 'train', 'pos')
neg_folder = os.path.join('aclImdb', 'train', 'neg')

#function call
pos_df = load_data_from_folder(pos_folder, 1)
neg_df = load_data_from_folder(neg_folder, 0)

In [8]:
print(pos_df.head())
print(neg_df.head())

                                              review  label
0  This is a hard-boiled Warner Brothers film sta...      1
1  Robert Mitchum stars as Clint Tollinger in thi...      1
2  Definitely one of the most witty and twisted w...      1
3  I saw Dick Tracy when I was very young. I didn...      1
4  When I was flicking through the TV Guide, and ...      1
                                              review  label
0  With a title "borrowed" from Werner Herzog and...      0
1  My girlfriend once brought around The Zombie C...      0
2  This is the most depressing film I have ever s...      0
3  Eight academy nominations? It's beyond belief....      0
4  'The Omen 4: The Awakening' is a made-for-tele...      0


In [9]:
# Combine positive and negative DataFrames
df = pd.concat([pos_df, neg_df], ignore_index=True)

In [10]:
# Clean text data
df['cleaned_review'] = df['review'].apply(clean_text)

In [11]:
# Vectorize text data
vectorizer = TfidfVectorizer(ngram_range=(1, 3))  # Use bigrams for richer feature extraction
features = vectorizer.fit_transform(df['cleaned_review'])

In [13]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, df['label'], test_size=0.2)

In [14]:
print(X_train.shape)
print(X_test.shape)

(20000, 4531868)
(5000, 4531868)


In [None]:
# prompt: why the shape contains (20000, 4531868) is it ok to have this much in a good model

# The shape of the data is (20000, 4531868) because there are 20,000 data points (rows) and 4,531,868 features (columns).

# It is not necessarily bad to have a large number of features in a model, but it can make the model more complex and computationally expensive to train.

# There are a few things to consider when deciding whether or not a large number of features is appropriate for a model:

# - The size of the dataset: A large number of features can be more appropriate for a large dataset, as there is more data to support the model.
# - The complexity of the problem: A large number of features can be more appropriate for a complex problem, as it allows the model to capture more nuanced relationships in the data.
# - The computational resources available: Training a model with a large number of features can be computationally expensive, so it is important to consider the available resources.

# In this case, the dataset is relatively large (20,000 data points) and the problem is complex (sentiment analysis), so it is not unreasonable to have a large number of features.

# However, it is important to note that a large number of features can also lead to overfitting, which is when the model learns the training data too well and does not generalize well to new data.

# Therefore, it is important to carefully evaluate the model and ensure that it is not overfitting before deploying it.


In [16]:

# Define parameter grid for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear']
}
# Initialize and run GridSearchCV
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate model accuracy on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy with GridSearchCV: {accuracy}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'C': 10, 'solver': 'lbfgs'}
Logistic Regression Accuracy with GridSearchCV: 0.8778


In [17]:
# Train a logistic regression model
# model = LogisticRegression()
# model.fit(X_train, y_train)

In [18]:
# Evaluate model accuracy
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")

Accuracy: 0.8606


In [19]:

# Load test data into DataFrames
pos_test_folder = os.path.join('aclImdb', 'test', 'pos')
neg_test_folder = os.path.join('aclImdb', 'test', 'neg')

# Load data into DataFrames
pos_test_df = load_data_from_folder(pos_test_folder, 1)
neg_test_df = load_data_from_folder(neg_test_folder, 0)

# Combine positive and negative DataFrames
test_df = pd.concat([pos_test_df, neg_test_df], ignore_index=True)

# Clean text data
test_df['cleaned_review'] = test_df['review'].apply(clean_text)

# Vectorize test data
test_features = vectorizer.transform(test_df['cleaned_review'])

# Predict on test data
y_pred_test = best_model.predict(test_features)

# Evaluate model accuracy on the test set
accuracy_test = accuracy_score(test_df['label'], y_pred_test)
print(f"Logistic Regression Accuracy on Test Data: {accuracy_test}")


Logistic Regression Accuracy on Test Data: 0.88172
