In [11]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset
data = pd.read_csv("IMDBDataset.csv")

# Data Preprocessing
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    # Tokenization and removing stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

data['processed_text'] = data['review'].apply(preprocess_text)

# Save the preprocessed dataset to a separate file
data.to_csv("preprocessed_imdb_reviews.csv", index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Subset making of the dataset**

In [46]:
# Load the preprocessed dataset
preprocessed_data = pd.read_csv("preprocessed_imdb_reviews.csv")

# Select a random subset of 1200 rows
subset_data = preprocessed_data.sample(n=5000, random_state=42)

# Save the subset dataset to a new file
subset_data.to_csv("subset_preprocessed_imdb_reviews.csv", index=False)


In [47]:
import pandas as pd

# Load the original dataset
original_data = pd.read_csv("IMDBDataset.csv")

# Print details of the original dataset
print("Original Dataset Details:")
print("Number of rows:", original_data.shape[0])
print("Number of columns:", original_data.shape[1])
print("\nFirst few rows:")
print(original_data.head())

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("preprocessed_imdb_reviews.csv")

# Print details of the preprocessed dataset
print("\nPreprocessed Dataset Details:")
print("Number of rows:", preprocessed_data.shape[0])
print("Number of columns:", preprocessed_data.shape[1])
print("\nFirst few rows:")
print(preprocessed_data.head())

#subset Dataset
subset_preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Print details of the preprocessed dataset
print("\nSubset Preprocessed Dataset Details:")
print("Number of rows:", subset_preprocessed_data.shape[0])
print("Number of columns:", subset_preprocessed_data.shape[1])
print("\nFirst few rows:")
print(subset_preprocessed_data.head())


Original Dataset Details:
Number of rows: 50000
Number of columns: 2

First few rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Preprocessed Dataset Details:
Number of rows: 50000
Number of columns: 3

First few rows:
                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      processed_

**Future engineering count Vectorization**

In [48]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data
processed_text = preprocessed_data['processed_text']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Print the shape of the feature matrix
print("Shape of the feature matrix after Count Vectorization:", X.shape)


Shape of the feature matrix after Count Vectorization: (5000, 5000)


# **Count Vectorization**

# **for imbalanced**

**Logistic regerassion**

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train Logistic Regression model with increased max_iter
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(lr_model, X, labels, cv=5, scoring='f1_weighted', error_score='raise')
cv_accuracy_scores = cross_val_score(lr_model, X, labels, cv=5, scoring='accuracy', error_score='raise')

# Predict on test set
y_pred_test = lr_model.predict(X_test)

# Predict on train set
y_pred_train = lr_model.predict(X_train)

# Evaluate the model on test set
test_classification_report = classification_report(y_test, y_pred_test, output_dict=True, target_names=['negative', 'positive'])
test_f1_score = test_classification_report['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_classification_report = classification_report(y_train, y_pred_train, output_dict=True, target_names=['negative', 'positive'])
train_f1_score = train_classification_report['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Logistic Regression Model Results:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))



Logistic Regression Model Results:
-------------------------------------------------------
CV=N Report F1-Score: 0.8330
CV=N Accuracy: 0.8330
Test Set Report F1-Score: 0.8420
Test Set Accuracy: 0.8420
Train Set Report F1-Score: 0.9995
Train Set Accuracy: 0.9995


SVM **model**

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(svm_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(svm_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = svm_model.predict(X_test)

# Predict on train set
y_pred_train = svm_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Support Vector Machine (SVM) Model Results:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Support Vector Machine (SVM) Model Results:
-------------------------------------------------------
CV=N Report F1-Score: 0.8351
CV=N Accuracy: 0.8266
Test Set Report F1-Score: 0.8317
Test Set Accuracy: 0.8320
Train Set Report F1-Score: 0.9672
Train Set Accuracy: 0.9673


**Random Forest**

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(rf_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(rf_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = rf_model.predict(X_test)

# Predict on train set
y_pred_train = rf_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Random Forest Model Results:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Random Forest Model Results:
-------------------------------------------------------
CV=N Report F1-Score: 0.8340
CV=N Accuracy: 0.8308
Test Set Report F1-Score: 0.8510
Test Set Accuracy: 0.8510
Train Set Report F1-Score: 1.0000
Train Set Accuracy: 1.0000


**Gradient Boosting**

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(gb_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(gb_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = gb_model.predict(X_test)

# Predict on train set
y_pred_train = gb_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Gradient Boosting Model Results:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Gradient Boosting Model Results:
-------------------------------------------------------
CV=N Report F1-Score: 0.8198
CV=N Accuracy: 0.8088
Test Set Report F1-Score: 0.8374
Test Set Accuracy: 0.8380
Train Set Report F1-Score: 0.8661
Train Set Accuracy: 0.8665


# **TF/IDF Approach**

**Logistic regerassion**

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(lr_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(lr_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = lr_model.predict(X_test)

# Predict on train set
y_pred_train = lr_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Logistic Regression Model Results with TF-IDF approach:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Logistic Regression Model Results with TF-IDF approach:
-------------------------------------------------------
CV=N Report F1-Score: 0.8560
CV=N Accuracy: 0.8518
Test Set Report F1-Score: 0.8660
Test Set Accuracy: 0.8660
Train Set Report F1-Score: 0.9390
Train Set Accuracy: 0.9390


**SVM**

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(svm_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(svm_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = svm_model.predict(X_test)

# Predict on train set
y_pred_train = svm_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Support Vector Machine (SVM) Model Results with TF-IDF approach:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Support Vector Machine (SVM) Model Results with TF-IDF approach:
-------------------------------------------------------
CV=N Report F1-Score: 0.8573
CV=N Accuracy: 0.8528
Test Set Report F1-Score: 0.8640
Test Set Accuracy: 0.8640
Train Set Report F1-Score: 0.9965
Train Set Accuracy: 0.9965


**Random Forest**

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(rf_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(rf_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = rf_model.predict(X_test)

# Predict on train set
y_pred_train = rf_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Random Forest Model Results with TF-IDF approach:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Random Forest Model Results with TF-IDF approach:
-------------------------------------------------------
CV=N Report F1-Score: 0.8308
CV=N Accuracy: 0.8276
Test Set Report F1-Score: 0.8480
Test Set Accuracy: 0.8480
Train Set Report F1-Score: 1.0000
Train Set Accuracy: 1.0000


**Gradient boosting**

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Shuffle the dataset
preprocessed_data = shuffle(preprocessed_data, random_state=42)

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Cross-Validation Approach
cv_f1_scores = cross_val_score(gb_model, X, encoded_labels, cv=5, scoring='f1')
cv_accuracy_scores = cross_val_score(gb_model, X, encoded_labels, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = gb_model.predict(X_test)

# Predict on train set
y_pred_train = gb_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Gradient Boosting Model Results with TF-IDF approach:")
print("-------------------------------------------------------")
print("CV=N Report F1-Score: {:.4f}".format(cv_f1_scores.mean()))
print("CV=N Accuracy: {:.4f}".format(cv_accuracy_scores.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Gradient Boosting Model Results with TF-IDF approach:
-------------------------------------------------------
CV=N Report F1-Score: 0.8161
CV=N Accuracy: 0.8036
Test Set Report F1-Score: 0.8265
Test Set Accuracy: 0.8270
Train Set Report F1-Score: 0.8958
Train Set Accuracy: 0.8960


# **Balanced Dataset**

# **Future Engineering TF-IDF**

In [57]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data
processed_text = preprocessed_data['processed_text']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000


# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Print the shape of the feature matrix
print("Shape of the feature matrix after TF-IDF Vectorization:", X.shape)


Shape of the feature matrix after TF-IDF Vectorization: (5000, 5000)


**Logistic regression**

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = lr_model.predict(X_test)

# Predict on train set
y_pred_train = lr_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Logistic Regression Model Results with Count Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Logistic Regression Model Results with Count Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.7643
CV=N Train Accuracy: 0.7648
Test Set Report F1-Score: 0.7645
Test Set Accuracy: 0.7647
Train Set Report F1-Score: 1.0000
Train Set Accuracy: 1.0000


# **SVM**

In [39]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = svm_model.predict(X_test)

# Predict on train set
y_pred_train = svm_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Support Vector Machine (SVM) Model Results with Count Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Support Vector Machine (SVM) Model Results with Count Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.7089
CV=N Train Accuracy: 0.7104
Test Set Report F1-Score: 0.7337
Test Set Accuracy: 0.7353
Train Set Report F1-Score: 0.9777
Train Set Accuracy: 0.9777


# **Random Forest**

In [40]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = rf_model.predict(X_test)

# Predict on train set
y_pred_train = rf_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Random Forest Model Results with Count Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Random Forest Model Results with Count Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.7739
CV=N Train Accuracy: 0.7673
Test Set Report F1-Score: 0.7833
Test Set Accuracy: 0.7843
Train Set Report F1-Score: 1.0000
Train Set Accuracy: 1.0000


# **Gradient Boosting**

In [58]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform Count Vectorization
X = count_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = gb_model.predict(X_test)

# Predict on train set
y_pred_train = gb_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Gradient Boosting Model Results with Count Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Gradient Boosting Model Results with Count Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.8048
CV=N Train Accuracy: 0.8052
Test Set Report F1-Score: 0.8204
Test Set Accuracy: 0.8214
Train Set Report F1-Score: 0.8695
Train Set Accuracy: 0.8697


# **TF-IDF Approach**

 **Logistic Regression**

In [64]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = lr_model.predict(X_test)

# Predict on train set
y_pred_train = lr_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Logistic Regression Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Logistic Regression Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.8562
CV=N Train Accuracy: 0.8563
Test Set Report F1-Score: 0.8550
Test Set Accuracy: 0.8552
Train Set Report F1-Score: 0.9347
Train Set Accuracy: 0.9347


**SVM**

In [65]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = svm_model.predict(X_test)

# Predict on train set
y_pred_train = svm_model.predict(X_train)

# Classification report for test set
test_classification_report = classification_report(y_test, y_pred_test)

# Confusion matrix for test set
test_confusion_matrix = confusion_matrix(y_test, y_pred_test)

# Classification report for train set
train_classification_report = classification_report(y_train, y_pred_train)

# Confusion matrix for train set
train_confusion_matrix = confusion_matrix(y_train, y_pred_train)

# Print Results
print("Support Vector Machine (SVM) Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Classification Report:")
print(test_classification_report)
print("Test Set Confusion Matrix:")
print(test_confusion_matrix)
print("Train Set Classification Report:")
print(train_classification_report)
print("Train Set Confusion Matrix:")
print(train_confusion_matrix)


Support Vector Machine (SVM) Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.8582
CV=N Train Accuracy: 0.8583
Test Set Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.82      0.84       491
    positive       0.84      0.88      0.86       517

    accuracy                           0.85      1008
   macro avg       0.85      0.85      0.85      1008
weighted avg       0.85      0.85      0.85      1008

Test Set Confusion Matrix:
[[402  89]
 [ 60 457]]
Train Set Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      2028
    positive       1.00      1.00      1.00      2002

    accuracy                           1.00      4030
   macro avg       1.00      1.00      1.00      4030
weighted avg       1.00      1.00      1.00      4030

Train Set

**Random Forest**

In [62]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = rf_model.predict(X_test)

# Predict on train set
y_pred_train = rf_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Random Forest Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Random Forest Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.8240
CV=N Train Accuracy: 0.8280
Test Set Report F1-Score: 0.8343
Test Set Accuracy: 0.8343
Train Set Report F1-Score: 1.0000
Train Set Accuracy: 1.0000


 **Gradient Boosting**

In [61]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Load the preprocessed dataset
preprocessed_data = pd.read_csv("subset_preprocessed_imdb_reviews.csv")

# Extract the preprocessed text data and labels
processed_text = preprocessed_data['processed_text']
labels = preprocessed_data['sentiment']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting the number of features to 5000

# Perform TF-IDF Vectorization
X = tfidf_vectorizer.fit_transform(processed_text)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, labels)

# Train/Test Split on balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Gradient Boosting model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Cross-validation for train set
cv_f1_scores_train = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='f1_weighted')
cv_accuracy_scores_train = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='accuracy')

# Predict on test set
y_pred_test = gb_model.predict(X_test)

# Predict on train set
y_pred_train = gb_model.predict(X_train)

# Evaluate the model on test set
test_f1_score = classification_report(y_test, y_pred_test, output_dict=True)['weighted avg']['f1-score']
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model on train set
train_f1_score = classification_report(y_train, y_pred_train, output_dict=True)['weighted avg']['f1-score']
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print Results
print("Gradient Boosting Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:")
print("-------------------------------------------------------")
print("CV=N Train Report F1-Score: {:.4f}".format(cv_f1_scores_train.mean()))
print("CV=N Train Accuracy: {:.4f}".format(cv_accuracy_scores_train.mean()))
print("Test Set Report F1-Score: {:.4f}".format(test_f1_score))
print("Test Set Accuracy: {:.4f}".format(test_accuracy))
print("Train Set Report F1-Score: {:.4f}".format(train_f1_score))
print("Train Set Accuracy: {:.4f}".format(train_accuracy))


Gradient Boosting Model Results with TF-IDF Vectorization and Balanced Dataset using SMOTE:
-------------------------------------------------------
CV=N Train Report F1-Score: 0.8010
CV=N Train Accuracy: 0.8010
Test Set Report F1-Score: 0.8106
Test Set Accuracy: 0.8115
Train Set Report F1-Score: 0.8939
Train Set Accuracy: 0.8940
