# In [3]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Read the review dataset from the CSV file in the working directory
data = pd.read_csv("./IMDB Dataset.csv")

# Discard rows whose review text repeats an earlier row, then renumber the
# index so it is contiguous again
data = data.drop_duplicates(subset=['review']).reset_index(drop=True)

# Normalise case: every review becomes lowercase
data['review'] = data['review'].str.lower()

# Remove HTML tags
# Matches one HTML-style tag: '<', the shortest possible run of characters, '>'.
_HTML_TAG_RE = re.compile(r'<.*?>')


def strip_html(text):
    """Remove HTML tags from ``text``, leaving a space where each tag was.

    Each tag is replaced with a single space instead of being deleted, so
    words separated only by markup (e.g. ``"great.<br /><br />the"``) are not
    fused into a single bogus token (``"great.the"``).

    Args:
        text: String that may contain HTML tags.

    Returns:
        The input with every ``<...>`` span replaced by one space. Runs of
        spaces are not collapsed; whitespace-based tokenizers ignore them.
    """
    return _HTML_TAG_RE.sub(' ', text)

data['review'] = data['review'].apply(strip_html)

# Shared resources for the tokenize / stop-word removal / lemmatize step
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """
    Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading character: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Every other tag, including those starting
    with 'N', is treated as a noun.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    # Tags starting with 'N' and all unrecognised tags share the noun fallback
    return wordnet.NOUN

def preprocess_text(text):
    """Tokenize, POS-tag, drop stop words and lemmatize a piece of text.

    POS tagging is done on the complete token sequence *before* stop words
    are filtered out. The tagger relies on neighbouring tokens for context,
    so tagging a sequence that has already lost its function words yields
    less reliable tags and therefore poorer lemmas.

    Args:
        text: The string to preprocess.

    Returns:
        A single string of the lemmatized, non-stop-word tokens joined by
        spaces.
    """
    # Tokenize and tag the full sequence so the tagger keeps its context
    tagged_words = pos_tag(word_tokenize(text))
    # Filter on the surface token (as before), then lemmatize with its POS
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_words
        if word not in stop_words
    ]
    return ' '.join(lemmatized_words)

# Run the tokenize / stop-word / lemmatize step over every review
data['review'] = data['review'].apply(preprocess_text)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
train_data, test_data, train_labels, test_labels = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.3,
    random_state=1,
)

# Convert sentiment labels to integers; the encoder is fitted on the training labels only
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

# TF-IDF features capped at 5000 terms, fitted on the training text only
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_vectors = tfidf_vectorizer.fit_transform(train_data)
test_vectors = tfidf_vectorizer.transform(test_data)

# Classifiers to compare, keyed by the display name used in the printed output
models = {
    'Multinomial Naive Bayes': MultinomialNB(),
    # NOTE(review): degree and gamma look unused with kernel='linear' -- confirm against the SVC docs
    'Support Vector Machine': SVC(
        C=1.0,
        kernel='linear',
        degree=3,
        gamma='auto',
    ),
    'Logistic Regression': LogisticRegression(solver='lbfgs'),
}

for name, model in models.items():
    # fit() hands back the fitted estimator, so training and prediction are chained
    preds = model.fit(train_vectors, train_labels).predict(test_vectors)
    # Report accuracy, per-class metrics and the confusion matrix on the held-out set
    acc = accuracy_score(test_labels, preds)
    print(f'{name} accuracy: {acc:.4f}')
    print(f'Classification Report:\n{classification_report(test_labels, preds)}')
    print(f'Confusion Matrix:\n{confusion_matrix(test_labels, preds)}\n')

# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'