In [None]:
!pip install xgboost


In [10]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer  # Using Lemmatizer instead of Porter Stemmer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Download NLTK data
nltk.download('punkt')  # Ensure punkt is downloaded
nltk.download('stopwords')  # Ensure stopwords are downloaded
nltk.download('wordnet')  # Ensure WordNet is downloaded for lemmatization

# Load the dataset
dataset_path = r"C:\Users\lemrk\Music\Sentiment-Analysis\Instruments_Reviews.csv"
dataset = pd.read_csv(dataset_path)

# Data Preprocessing
dataset["reviewText"] = dataset["reviewText"].fillna(value="")  # Fill missing reviewText
dataset["reviews"] = dataset["reviewText"] + " " + dataset["summary"]  # Combine reviewText and summary
dataset = dataset.drop(columns=["reviewText", "summary"], axis=1)  # Drop unnecessary columns

# Labelling Products Based On Ratings Given
def Labelling(Rows):
    if Rows["overall"] > 3.0:
        Label = "Positive"
    elif Rows["overall"] < 3.0:
        Label = "Negative"
    else:
        Label = "Neutral"
    return Label

dataset["sentiment"] = dataset.apply(Labelling, axis=1)

# Text Preprocessing
def Text_Cleaning(Text):
    Text = Text.lower()  # Convert to lowercase
    punc = str.maketrans(string.punctuation, ' '*len(string.punctuation))  # Remove punctuation
    Text = Text.translate(punc)
    Text = re.sub(r'\d+', '', Text)  # Remove numbers
    Text = re.sub(r'https?://\S+|www\.\S+', '', Text)  # Remove URLs
    Text = re.sub('\n', '', Text)  # Remove newlines
    return Text

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Stopwords
Stopwords = set(nltk.corpus.stopwords.words("english")) - set(["not", "no", "nor"])  # Keep negation words
Stopwords = list(Stopwords)  # Convert set to list for CountVectorizer

def Text_Processing(Text):
    Processed_Text = list()
    Tokens = nltk.word_tokenize(Text)  # Tokenize the text
    for word in Tokens:
        if word not in Stopwords:
            Processed_Text.append(lemmatizer.lemmatize(word))  # Use Lemmatizer
    return " ".join(Processed_Text)

# Apply text cleaning and processing
dataset["reviews"] = dataset["reviews"].apply(lambda Text: Text_Cleaning(Text))
dataset["reviews"] = dataset["reviews"].apply(lambda Text: Text_Processing(Text))

# Feature Engineering
Columns = ["reviewerID", "overall"]
dataset = dataset.drop(columns=Columns, axis=1)  # Drop unnecessary columns

# Encoding Our Target Variable
Encoder = LabelEncoder()
dataset["sentiment"] = Encoder.fit_transform(dataset["sentiment"])

# TF-IDF Vectorizer with Better Parameters
TF_IDF = TfidfVectorizer(max_features=10000, ngram_range=(1, 3), stop_words=Stopwords)
X = TF_IDF.fit_transform(dataset["reviews"])
y = dataset["sentiment"]

# Print the shape of the vectorized data
print(f"Shape of vectorized data: {X.shape}")

# Splitting Our Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Hyperparameter Tuning for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters for Logistic Regression:", grid_search.best_params_)

# Train Logistic Regression with Best Parameters
best_logreg = grid_search.best_estimator_
y_pred_logreg = best_logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))

# Train XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))

# Train Random Forest
rforest = RandomForestClassifier()
rforest.fit(X_train, y_train)
y_pred_rforest = rforest.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rforest))

# Classification Report for Logistic Regression
print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_logreg, target_names=["Negative", "Neutral", "Positive"]))

# Confusion Matrix for Logistic Regression
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
print("Confusion Matrix for Logistic Regression:")
print(conf_matrix)

# Save the Logistic Regression model
with open('logreg_model.pkl', 'wb') as f:
    pickle.dump(best_logreg, f)

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(TF_IDF, f)

# Save the Label Encoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(Encoder, f)

print("Models saved successfully!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lemrk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lemrk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lemrk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Shape of vectorized data: (10261, 10000)
Best Parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
Logistic Regression Accuracy: 0.8904910366328916
XGBoost Accuracy: 0.886983632112237
Random Forest Accuracy: 0.8819173811379579

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    Negative       0.59      0.19      0.29       120
     Neutral       0.48      0.21      0.29       189
    Positive       0.91      0.98      0.95      2257

    accuracy                           0.89      2566
   macro avg       0.66      0.46      0.51      2566
weighted avg       0.86      0.89      0.87      2566

Confusion Matrix for Logistic Regression:
[[  23   16   81]
 [   8   40  141]
 [   8   27 2222]]
Models saved successfully!
