<a href="https://colab.research.google.com/github/Tommy-Adisa/Sentiment-Analysis-For-Customer-Review-using-RoBBETa-Transformer-Model/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPROVING BUSINESS DECISIONS THROUGH SENTIMENT ANALYSIS OF CUSTOMER REVIEWS USING NLP

### LIBRARY IMPORTATIONS

In [None]:

#pip install contractions

In [None]:
#pip install pyspellchecker


In [None]:
#pip install fsspec==2023.5.0 s3fs huggingface-hub

In [None]:
#pip install datasets pandas nltk scikit-learn seaborn matplotlib


In [None]:
pip install transformers[torch]


In [None]:
pip install accelerate -U


In [None]:
#pip install --upgrade torch typing_extensions

In [None]:
#pip install --upgrade typing_extensions


In [None]:
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import torch
import numpy as np
import seaborn as sns
import contractions  # Import contractions library

from spellchecker import SpellChecker #Import spellcheck library. this handles errors in spellings
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset


In [None]:
# Download the NLTK resources used by the tokenizer, tagger, lemmatizer and
# lexicon-based sentiment scorers below.
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt'.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases load the POS tagger from the '_eng' resource.
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
#This is to import data from the dataset

df= pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [None]:
#This is to check the first 7 head of the data
df.head(7)

In [None]:
#This is to output the dimension of a dataFrame
print(df.shape)

In [None]:
#this is to show the descriptive statistics of the numerical columns of this dataFrame
df.describe()

In [None]:
#this is to get the information of the dataframe
df.info()

In [None]:
# checking the columns
print(df.columns)

###  Handling missing values

In [None]:
# Drop missing values in 'reviews.text' column
df.dropna(subset=['reviews.text'], inplace=True)

# Convert all reviews to string format
df['reviews.text'] = df['reviews.text'].astype(str)

# Check for missing values again
print(df.isnull().sum())

In [None]:
df.drop(columns=["reviews.didPurchase", "reviews.id", "reviews.numHelpful"], inplace=True)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated in recent pandas
# and does not modify `df` when copy-on-write is enabled.
df["reviews.doRecommend"] = df["reviews.doRecommend"].fillna("unknown")

In [None]:
######df["reviews.username"].fillna("anonymous", inplace=True)

In [None]:
print(df.isnull().sum())  # Should return 0 missing values

## Data Preprocessing

#### To ensure the accuracy of sentiment analysis, the raw text data will undergo preprocessing. This involves:

    Text Cleaning: Removing special characters, punctuation, and unnecessary symbols.

    Tokenisation: Splitting the text into individual words or phrases.

    Lemmatization: Converting words to lowercase and reducing them to their base forms to improve NLP efficiency.

    Stopword Removal: Eliminating common words that do not contribute to sentiment (e.g., "the," "is," "and").
    
    Handling Contractions: Expanding contractions (e.g., "don't" → "do not").

    Handling Imbalanced Data: If necessary, balance the dataset by oversampling underrepresented classes or undersampling overrepresented ones

    Handle misspellings or slang (optional).

This step standardises the data, making it more suitable for computational analysis.

### Text Cleaning

#### The goal is to remove special characters, punctuation, and unnecessary symbols.

In [None]:
# Define the text preprocessing function

# Built once when this cell runs rather than on every call. Requires the NLTK
# 'stopwords' resource to have been downloaded already.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Normalise one review into a space-joined string of lemmatized tokens.

    Steps, in order: expand contractions, lowercase, replace every character
    that is not a-z or whitespace with a space, collapse whitespace, tokenize,
    drop English stopwords, lemmatize.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned text as a single string; empty if no token survives.
    """
    # Expand contractions while the apostrophes are still present; once they
    # are stripped, "don't" becomes "dont" and can no longer be expanded.
    text = contractions.fix(text)
    text = text.lower()

    # The previous call passed re.I as the 4th positional argument of re.sub,
    # which is `count`, not `flags`, so at most two characters were removed.
    # Substituting a space (not '') keeps words that were separated only by
    # punctuation, e.g. "good,but", from being fused together.
    text = _NON_LETTER_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Tokenize, drop stopwords, lemmatize, and rebuild the cleaned text
    tokens = word_tokenize(text)
    return ' '.join(
        _LEMMATIZER.lemmatize(token) for token in tokens if token not in _STOP_WORDS
    )

# Apply preprocessing to the 'reviews.text' column
df['Cleaned_Review'] = df['reviews.text'].apply(preprocess_text)




In [None]:
# Display original vs cleaned text
df[['reviews.text', 'Cleaned_Review']].head(10)


### Sentiment Analysis

This section compares two lexicon-based approaches: the VADER sentiment analyzer and SentiWordNet.

In [None]:
# This is to Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

In [None]:
# Function to analyze sentiment using VADER
def get_vader_sentiment(text):
    """Label *text* as 'Positive', 'Negative' or 'Neutral' from VADER's compound score.

    The input is coerced to ``str`` before scoring. Scores above 0.05 are
    positive, scores below -0.05 are negative, anything in between is neutral.
    """
    compound = sia.polarity_scores(str(text))['compound']
    if compound > 0.05:
        return 'Positive'
    if compound < -0.05:
        return 'Negative'
    return 'Neutral'

# Apply VADER sentiment analysis
# NOTE(review): 'Cleaned_Review' comes from preprocess_text, which lowercases the
# text and removes stopwords (and is meant to strip punctuation). VADER presumably
# relies on cues such as negations and emphasis - consider scoring the raw
# 'reviews.text' column instead; TODO confirm against the VADER documentation.
df['VADER_Sentiment'] = df['Cleaned_Review'].apply(get_vader_sentiment)

In [None]:
# Function to get SentiWordNet sentiment score
def get_sentiwordnet_sentiment(text):
    """Label *text* as 'Positive', 'Negative' or 'Neutral' using SentiWordNet.

    Each token is POS-tagged, mapped to a WordNet POS via ``get_wordnet_pos``
    and scored with the first SentiWordNet synset found for it. The label is
    derived from the mean of (positive - negative) over the scored tokens,
    with the same +/-0.05 thresholds used for VADER. Text with no scorable
    token is 'Neutral'.
    """
    total_pos = 0
    total_neg = 0
    scored_tokens = 0

    for token, treebank_tag in nltk.pos_tag(word_tokenize(text)):
        wordnet_tag = get_wordnet_pos(treebank_tag)
        if not wordnet_tag:
            # Tag has no WordNet equivalent; skip the token
            continue
        candidates = list(swn.senti_synsets(token, wordnet_tag))
        if not candidates:
            continue
        first_sense = candidates[0]
        total_pos += first_sense.pos_score()
        total_neg += first_sense.neg_score()
        scored_tokens += 1

    if scored_tokens == 0:
        return 'Neutral'

    mean_polarity = (total_pos - total_neg) / scored_tokens
    if mean_polarity > 0.05:
        return 'Positive'
    if mean_polarity < -0.05:
        return 'Negative'
    return 'Neutral'



In [None]:
# Helper function to convert POS tags to WordNet POS
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    any other tag yields ``None``.
    """
    # (tag prefix, name of the wordnet constant), checked in this order
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

# Apply SentiWordNet sentiment analysis
df['SentiWordNet_Sentiment'] = df['Cleaned_Review'].apply(get_sentiwordnet_sentiment)



In [None]:
# Display sentiment comparison
sns.countplot(x='VADER_Sentiment', data=df, palette='coolwarm')
plt.title('Sentiment Analysis using VADER')
plt.show()



In [None]:
sns.countplot(x='SentiWordNet_Sentiment', data=df, palette='coolwarm')
plt.title('Sentiment Analysis using SentiWordNet')
plt.show()

# Save results
# to_csv does not create missing parent directories, so make sure 'results/'
# exists before writing.
import os

os.makedirs('results', exist_ok=True)
df.to_csv('results/sentiment_analysis_comparison.csv', index=False)


In [None]:

# Display first few rows after sentiment analysis
df[['reviews.text', 'Cleaned_Review', 'VADER_Sentiment', 'SentiWordNet_Sentiment']].head(50)

In [None]:
# Machine Learning-Based Sentiment Classification
# Convert categorical labels to numerical values
df['Sentiment_Label'] = df['VADER_Sentiment'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1})


df['Sent_Label_SentiWordNEt'] = df['SentiWordNet_Sentiment'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1})


df[['reviews.text', 'Cleaned_Review', 'VADER_Sentiment', 'SentiWordNet_Sentiment', 'Sentiment_Label', 'Sent_Label_SentiWordNEt']].head(10)


In [None]:
# Feature extraction using TF-IDF Vectorization
# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only; fitting on the full corpus before splitting
# leaks test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned_Review'], df['Sentiment_Label'], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix and labels, kept under their original names
X = vectorizer.transform(df['Cleaned_Review'])
y = df['Sentiment_Label']

#### Using GPT 4

In [None]:
#pip install openai

### The goal is to train my own sentiment analysis model


### Steps to use
#### We will start with a pre-trained model (like BERT or RoBERTa) and fine-tune it on this dataset.

##### Steps:
##### 1. Load a pretrained transformer model (BERT, RoBERTa).
##### 2. Train the model using PyTorch or TensorFlow.
##### 3. Evaluate accuracy

In [None]:
# Select the relevant columns and drop rows with missing values.
# Chaining a non-inplace dropna() yields a fresh DataFrame, which avoids the
# SettingWithCopyWarning that in-place edits on a column subset can trigger.
df = df[['reviews.text', 'reviews.rating']].dropna()

# Display some rows
df.head()

In [None]:
def assign_sentiment(rating):
    """Convert a star rating into a sentiment code.

    Returns 1 (positive) for ratings of 4 or more, 0 (neutral) for a rating
    equal to 3, and -1 (negative) for everything else.
    """
    if rating >= 4:
        return 1  # Positive
    # Exactly 3 is neutral; any other remaining value is negative
    return 0 if rating == 3 else -1

# Apply sentiment function
df['sentiment'] = df['reviews.rating'].apply(assign_sentiment)

# Keep only needed columns
df = df[['reviews.text', 'sentiment']]

# Show class distribution
df['sentiment'].value_counts()


In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "bert-base-uncased"  # or use "roberta-base"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize dataset
def tokenize_data(texts):
    """Run the module-level ``tokenizer`` over *texts* and return its output.

    The batch is padded, truncated to at most 512 tokens, and returned as
    PyTorch tensors.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)

# Example of tokenization
sample_text = df['reviews.text'][0:3].tolist()
tokenized_sample = tokenize_data(sample_text)
tokenized_sample


In [None]:
import torch
from torch.utils.data import Dataset

class AmazonReviewsDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Each item is a dict with 'input_ids' and 'attention_mask' (the tokenizer
    output with its leading batch dimension squeezed away) and 'label' (a
    ``torch.long`` scalar tensor).
    """

    def __init__(self, texts, labels, tokenizer):
        # texts and labels are indexed in parallel by __getitem__
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        review, target = self.texts[idx], self.labels[idx]

        # Pad/truncate every review to exactly 512 tokens
        encoded = self.tokenizer(
            review,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

        # Drop the batch dimension the tokenizer adds for a single text
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Create dataset instance
# df['sentiment'] holds -1/0/1, but the 3-class classification head trained with
# cross-entropy expects class ids in 0..2 (a target of -1 is out of range).
# Shift by one: 0 = Negative, 1 = Neutral, 2 = Positive.
class_ids = [label + 1 for label in df['sentiment'].tolist()]
dataset = AmazonReviewsDataset(df['reviews.text'].tolist(), class_ids, tokenizer)

# Check sample
dataset[0]


In [None]:
from torch.utils.data import DataLoader, random_split

# Define dataset sizes (80% train / 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Split dataset with a fixed seed so the train/test partition is reproducible
# across runs, matching the seeded train_test_split used for the TF-IDF features.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the model (3 classes: negative, neutral, positive)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


In [None]:
from transformers import Trainer, TrainingArguments

# Training settings
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`; pick whichever
# keyword the installed version accepts so this cell runs on both old and new
# releases.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    # Requires the evaluation and save strategies to match ("epoch" for both)
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start training
trainer.train()
