<a href="https://colab.research.google.com/github/Tu2k1/IMDB-Reviews-Classification/blob/main/IMDB_Reviews_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Test Different Algorithms**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources this notebook relies on:
#   - "stopwords": backs stopwords.words("english") below
#   - "punkt" / "punkt_tab": tokenizer data used by word_tokenize().
# Recent NLTK releases look up "punkt_tab" rather than "punkt", so both are
# requested; presumably an NLTK version that does not know "punkt_tab" only
# logs an error for it and carries on — TODO confirm on the oldest supported version.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

# Location of the reviews CSV (Colab's default working directory)
file = '/content/imdb_reviews.csv'

# Load the dataset; the file is decoded as ISO-8859-1 (Latin-1)
data = pd.read_csv(file, encoding="ISO-8859-1")

# Split the dataset into training and testing sets (80% / 20%, fixed seed)
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorizer; it is fitted later, on the cleaned training text only
vectorizer = TfidfVectorizer()

# Stemmer and English stop-word set used by the text-cleaning step
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

# Define a function for text cleaning (stop words removal and stemming)
def clean_text(text):
    """Remove English stop words from ``text`` and stem the remaining tokens.

    The text is split with ``word_tokenize``. A token is dropped when its
    lower-cased form is in the module-level ``stop_words`` set; every other
    token is passed through the module-level ``stemmer``.

    Args:
        text: The raw review text to clean.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    kept_stems = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue  # stop word: contributes nothing to the cleaned text
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

# Run the same cleaning over both splits
X_train, X_test = (split.apply(clean_text) for split in (X_train, X_test))

# Fit the TF-IDF vectorizer on the training text only, then reuse the
# fitted vectorizer to transform the test text
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Candidate classifiers to compare, keyed by the display name used in the report.
# NOTE(review): most of these values coincide with scikit-learn's defaults
# (max_iter on Logistic Regression looks like the only real override) — confirm
# whether any tuning was actually performed.
# NOTE(review): Perceptron's alpha only scales a regularization term, and no
# penalty is configured here, so it presumably has no effect — verify.
algorithms = {
    'Perceptron': Perceptron(alpha=0.0001, max_iter=1000),
    # random_state pinned so the tree's reported metrics are reproducible
    # from one run to the next
    'Decision Tree': DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=42),
    'Naive Bayes': MultinomialNB(alpha=1.0),
    'Logistic Regression': LogisticRegression(C=1.0, max_iter=1000),
}

# Fit every candidate on the training matrix and report accuracy, precision,
# recall, F1-score and AUC measured on the held-out test matrix
for algo_name, model in algorithms.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    # The four label-based metrics share the (y_true, y_pred) call signature
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # AUC is computed from a continuous score rather than hard labels:
    # decision_function when the model has no predict_proba (e.g. Perceptron),
    # otherwise column 1 of predict_proba
    if not hasattr(model, 'predict_proba'):
        scores = model.decision_function(X_test_vec)
    else:
        scores = model.predict_proba(X_test_vec)[:, 1]
    auc = roc_auc_score(y_test, scores)

    print(
        f"{algo_name} Metrics:",
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1-Score: {f1}",
        f"AUC: {auc}",
        sep="\n",
    )
    print("\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Perceptron Metrics:
Accuracy: 0.8644
Precision: 0.857364641956142
Recall: 0.8767612621551896
F1-Score: 0.8669544740973312
AUC: 0.938869480819213


Decision Tree Metrics:
Accuracy: 0.7238
Precision: 0.7276544691061788
Recall: 0.72216709664616
F1-Score: 0.7249003984063745
AUC: 0.7238128367729892


Naive Bayes Metrics:
Accuracy: 0.8605
Precision: 0.8700243704305443
Recall: 0.8501686842627505
F1-Score: 0.8599819331526648
AUC: 0.9353713079903782


Logistic Regression Metrics:
Accuracy: 0.8904
Precision: 0.8787704130643612
Recall: 0.9077197856717603
F1-Score: 0.8930105427567357
AUC: 0.9599935260061223




# **Chosen algorithm for the model**

In [3]:
# Final model: a fresh logistic regression fitted on the TF-IDF training
# matrix, with the same settings as the 'Logistic Regression' entry compared above
model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train_vec, y_train)


# **Testing new data**

In [4]:
# Hand-written reviews used as a quick sanity check of the trained model
new_data = [
    "I loved the movie. It had great acting and a compelling story.",
    "The movie was terrible. The acting was bad and the plot was boring.",
    "The performances in the movie were outstanding. Truly remarkable.",
    "I couldn't stand the movie. It was so disappointing.",
    "The movie exceeded my expectations. It was fantastic!",
    "I regret watching the movie. Waste of time.",
    "The movie was bad!"
]

# Apply the same text cleaning (stop words removal and stemming) that was
# applied to the training text
cleaned_new_data = [clean_text(text) for text in new_data]

# Transform with the already-fitted vectorizer (transform, not fit_transform)
new_data_vec = vectorizer.transform(cleaned_new_data)

# Predict the sentiment labels for the new data with the trained model
new_data_pred = model.predict(new_data_vec)

# Display names for the predicted labels
# NOTE(review): assumes the dataset encodes 0 = negative, 1 = positive — confirm
sentiments = {
    0: 'Negative',
    1: 'Positive'
}

# Print the predictions. The loop variable is `review` rather than `data` so
# that the DataFrame named `data` loaded earlier in the notebook is not
# overwritten with a string.
for review, pred in zip(new_data, new_data_pred):
    print("Review:", review)
    print("Sentiment:", sentiments[pred])
    print()

Review: I loved the movie. It had great acting and a compelling story.
Sentiment: Positive

Review: The movie was terrible. The acting was bad and the plot was boring.
Sentiment: Negative

Review: The performances in the movie were outstanding. Truly remarkable.
Sentiment: Positive

Review: I couldn't stand the movie. It was so disappointing.
Sentiment: Negative

Review: The movie exceeded my expectations. It was fantastic!
Sentiment: Positive

Review: I regret watching the movie. Waste of time.
Sentiment: Negative

Review: The movie was bad!
Sentiment: Negative

