In [2]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [3]:
# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent NLTK
# releases; the older 'punkt' resource is still fetched so the notebook also
# runs on NLTK versions that predate the switch.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [4]:
# Negation words are kept in the text because they carry sentiment.
_NEGATION_WORDS = frozenset({'no', 'not', 'never', "n't", 'neither', 'nor', 'none'})

# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return (lemmatizer, stop_words), building them once on first use."""
    if not _PREPROCESS_CACHE:
        # Build both before storing so a failed corpus lookup never leaves a
        # half-populated cache behind.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english')) - _NEGATION_WORDS
        _PREPROCESS_CACHE.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Clean and preprocess a single review text.

    Steps, in order: lowercase, strip URLs and e-mail addresses, replace every
    character other than letters, '!', '?' and the apostrophe with a space,
    tokenize with ``word_tokenize``, drop English stopwords (negation words
    are retained), and lemmatize each remaining token.

    Args:
        text: The raw text. Missing values (None/NaN) yield an empty string;
            other non-string values are converted with ``str()`` first.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # A stray numeric or other non-string cell should not abort a whole
        # DataFrame.apply run.
        text = str(text)

    # The lemmatizer and stopword set do not depend on the input, so they are
    # built once instead of on every call.
    lemmatizer, stop_words = _get_preprocess_resources()

    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
    text = re.sub(r'[^a-zA-Z!?\']', ' ', text)  # Remove special characters
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

In [5]:
def create_sentiment_labels(rating):
    """Convert a numerical rating into a sentiment label.

    Args:
        rating: Numeric rating. A missing value (None/NaN) is labelled
            'neutral'.

    Returns:
        str: 'positive' for ratings of 4 and above, 'neutral' for ratings
        from 3 up to (but not including) 4, 'negative' for anything lower.
    """
    if pd.isna(rating):
        # NaN fails every comparison and would otherwise fall through to
        # 'negative'.
        return 'neutral'
    if rating >= 4:
        return 'positive'
    # Threshold rather than equality, so fractional ratings such as 3.5 are
    # not labelled 'negative'.
    if rating >= 3:
        return 'neutral'
    return 'negative'

In [6]:
def prepare_data(df):
    """Prepare review data for modelling.

    Drops rows whose 'reviewText' is missing, fills missing 'overall' ratings
    with 3, derives a 'sentiment' column from 'overall' via
    ``create_sentiment_labels`` and a 'processed_text' column from
    'reviewText' via ``preprocess_text``.

    Args:
        df: DataFrame with 'reviewText' and 'overall' columns. It is not
            modified.

    Returns:
        pandas.DataFrame: A new frame holding the surviving rows, with the
        filled 'overall' column and the added 'sentiment' and
        'processed_text' columns.
    """
    # Explicit copy: assigning columns onto the result of dropna() can
    # otherwise raise SettingWithCopyWarning.
    prepared = df.dropna(subset=['reviewText']).copy()  # Drop rows with missing review texts

    # Assume missing ratings as neutral
    prepared['overall'] = prepared['overall'].fillna(3)
    prepared['sentiment'] = prepared['overall'].apply(create_sentiment_labels)
    prepared['processed_text'] = prepared['reviewText'].apply(preprocess_text)
    return prepared