In [22]:
import joblib
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from google.colab import files

In [59]:
import nltk

# Fetch the NLTK resources requested by this notebook: tokenizer data
# ("punkt", "punkt_tab"), POS-tagger data (both tagger package names) and
# WordNet. nltk.download() is a no-op for packages already up to date.
for _nltk_resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "wordnet",
):
    nltk.download(_nltk_resource)

# Kept as the cell's last expression so its return value is still displayed.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Upload the trained classifier. A later cell loads it by the exact name
# "xgboost_classifier.joblib", so the uploaded file must carry that name.
print("upload the model xgboost_classifier.joblib file")
uploaded = files.upload()

# files.upload() returns a dict keyed by the saved filename. Warn early when the
# expected name is absent (wrong file picked, upload cancelled, or -- presumably
# on a re-upload -- the file was saved under a suffixed name) rather than
# failing, or silently loading a stale file, later on.
if "xgboost_classifier.joblib" not in uploaded:
    print("WARNING: expected 'xgboost_classifier.joblib' but received:", list(uploaded))

upload the model xgboost_classifier.joblib file


Saving xgboost_classifier.joblib to xgboost_classifier.joblib


In [4]:
# Upload the fitted TF-IDF vectorizer. A later cell loads it by the exact name
# "tfidf_vectorizer.joblib", so the prompt spells out that filename.
print("upload the tfidf_vectorizer.joblib file")
uploaded = files.upload()

# files.upload() returns a dict keyed by the saved filename. Warn early when the
# expected name is absent instead of hitting a confusing error at load time.
if "tfidf_vectorizer.joblib" not in uploaded:
    print("WARNING: expected 'tfidf_vectorizer.joblib' but received:", list(uploaded))

upload the tf_idf vectoriser.joblib file


Saving tfidf_vectorizer.joblib to tfidf_vectorizer.joblib


In [5]:
# Upload the fitted numeric-feature scaler. A later cell loads it by the exact
# name "numeric_scaler.joblib", so the prompt spells out that filename.
print("upload the numeric_scaler.joblib file")
uploaded = files.upload()

# files.upload() returns a dict keyed by the saved filename. Warn early when the
# expected name is absent instead of hitting a confusing error at load time.
if "numeric_scaler.joblib" not in uploaded:
    print("WARNING: expected 'numeric_scaler.joblib' but received:", list(uploaded))

upload the numeric scaler.joblib file


Saving numeric_scaler.joblib to numeric_scaler.joblib


In [6]:
# Upload the reviews to score. A later cell reads the file by the exact name
# "new_review_dataset.csv", so the prompt states the required filename.
print("upload the new dataset (new_review_dataset.csv) here: ")
uploaded = files.upload()

# files.upload() returns a dict keyed by the saved filename. Warn early when the
# expected name is absent instead of failing later in pd.read_csv.
if "new_review_dataset.csv" not in uploaded:
    print("WARNING: expected 'new_review_dataset.csv' but received:", list(uploaded))

upload the new dataset here: 


Saving new_review_dataset.csv to new_review_dataset.csv


In [60]:
# Shared NLTK helpers, built once and reused by preprocess_review().
# A set gives constant-time membership tests for the stop-word filter.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [61]:
# Restore the three fitted artifacts from the files uploaded earlier.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load artifact files that come from a trusted source.
vectorizer, scaler, model = (
    joblib.load(artifact_path)
    for artifact_path in (
        "tfidf_vectorizer.joblib",
        "numeric_scaler.joblib",
        "xgboost_classifier.joblib",
    )
)

print("Loaded vectorizer, scaler, and model.")

Loaded vectorizer, scaler, and model.


In [62]:
def preprocess_review(text):
    """Normalise a raw review into a space-joined string of lemmas.

    Steps, in order:
      1. Convert ``text`` with ``str()`` and lowercase it.
      2. Delete every character that is not an ASCII letter or whitespace.
      3. Drop whitespace-separated words found in the module-level ``stop_words``.
      4. Re-tokenize with ``word_tokenize`` and POS-tag with ``nltk.pos_tag``.
      5. Lemmatize each token with the module-level ``lemmatizer``, using the
         WordNet part of speech chosen from the first letter of its tag
         (J/N/V/R); any other tag falls back to noun.

    Returns the lemmas joined by single spaces (an empty string when no
    tokens survive).
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    kept_words = []
    for word in letters_only.split():
        if word not in stop_words:
            kept_words.append(word)

    tokens = word_tokenize(" ".join(kept_words))

    # First letter of the tagger's tag -> WordNet POS constant.
    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    lemmas = []
    for token, tagged_pair in zip(tokens, nltk.pos_tag(tokens)):
        tag_initial = tagged_pair[1][0].upper()
        wordnet_pos = wordnet_pos_by_initial.get(tag_initial, wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

In [63]:
def extract_topic_features(text):
    """Flag which complaint topics a review mentions.

    ``text`` is lowercased and each topic's keywords are looked up as plain
    substrings (so "late" also matches inside "chocolate").

    Returns a dict with the keys ``has_customer_service``,
    ``has_delivery_issue``, ``has_account_issue`` and ``has_refund_issue``
    (in that order), each mapped to 1 when any of its keywords occurs and
    0 otherwise.
    """
    lowered = text.lower()
    topic_keywords = {
        "has_customer_service": ("service", "support", "customer"),
        "has_delivery_issue": ("delivery", "shipping", "late"),
        "has_account_issue": ("account", "login", "password"),
        "has_refund_issue": ("refund", "return", "money"),
    }

    flags = {}
    for flag_name, keywords in topic_keywords.items():
        mentioned = False
        for keyword in keywords:
            if keyword in lowered:
                mentioned = True
                break
        flags[flag_name] = 1 if mentioned else 0
    return flags

In [67]:
def predict_sentiment_row(row):
    """Predict the sentiment of a single review row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas.Series from ``DataFrame.apply(axis=1)``)
        Must provide a ``"Review"`` entry, which is converted with ``str()``.
        No other entry is read.

    Returns
    -------
    pandas.Series
        ``pred_label`` ("Negative", "Neutral" or "Positive" for model labels
        0, 1, 2) plus ``prob_negative``, ``prob_neutral`` and
        ``prob_positive`` taken from positions 0, 1, 2 of ``predict_proba``.

    Raises
    ------
    KeyError
        If ``row`` has no ``"Review"`` entry, or the model returns a label
        other than 0, 1 or 2.

    Notes
    -----
    Uses the notebook-level ``vectorizer``, ``scaler`` and ``model`` objects
    and the ``preprocess_review`` / ``extract_topic_features`` helpers.
    The vote columns are deliberately not read: a helpfulness score derived
    from them was never fed to the model, and converting missing votes with
    ``int()`` made otherwise valid rows fail.
    """
    label_names = {0: "Negative", 1: "Neutral", 2: "Positive"}

    # 1) Preprocess text
    raw = str(row["Review"])
    proc = preprocess_review(raw)

    # 2) Topic flags (computed on the preprocessed text)
    topics = extract_topic_features(proc)

    # 3) Surface-level numeric features
    review_length     = len(proc)                          # length of preprocessed text
    caps_ratio        = sum(1 for c in raw if c.isupper()) / max(len(raw), 1)  # max() guards empty reviews
    exclamation_count = raw.count("!")
    question_count    = raw.count("?")

    # 4) Vectorize the text; densify so it can be stacked with the numeric block
    X_text = vectorizer.transform([proc]).toarray()

    # 5) Build the 8-feature numeric array. The order below is kept exactly as
    #    in the original cell, which states it matches the training order.
    num_arr = np.array([[
        topics["has_customer_service"],
        topics["has_delivery_issue"],
        topics["has_account_issue"],
        topics["has_refund_issue"],
        review_length,
        caps_ratio,
        exclamation_count,
        question_count
    ]])

    X_num = scaler.transform(num_arr)    # scaler expects 8 features
    X     = np.hstack([X_text, X_num])   # text features first, then scaled numerics

    # 6) Predict
    lbl   = model.predict(X)[0]
    proba = model.predict_proba(X)[0]

    return pd.Series({
        "pred_label":    label_names[lbl],
        "prob_negative": proba[0],
        "prob_neutral":  proba[1],
        "prob_positive": proba[2]
    })

In [68]:
# Load the uploaded reviews that are to be scored.
df_new = pd.read_csv(filepath_or_buffer="new_review_dataset.csv")

In [70]:
# Score every review: predict_sentiment_row is called once per row and its
# returned Series become the columns of `results`.
results = df_new.apply(predict_sentiment_row, axis="columns")

In [71]:
# Attach the prediction columns to the original review columns (aligned on index).
df_out = pd.concat([df_new, results], axis="columns")

In [72]:
# Show all reviews with predicted sentiment
df_out[[
    "Review","Verified Purchaser","Recommended Purchase",
    "pred_label","prob_negative","prob_neutral","prob_positive"
]]

Unnamed: 0,Review,Verified Purchaser,Recommended Purchase,pred_label,prob_negative,prob_neutral,prob_positive
0,I absolutely love this product—works like a ch...,Yes,Yes,Positive,0.004701,0.001497,0.993802
1,It arrived broken and customer service was ter...,No,No,Negative,0.453939,0.342725,0.203336
2,"Decent value for the price, but battery life c...",Yes,No,Positive,0.017616,0.076512,0.905872
3,Not what I expected. The screen is too dim.,No,Yes,Neutral,0.045485,0.650815,0.3037
4,"Exceeded my expectations! Fast, reliable, grea...",Yes,Yes,Positive,0.018263,0.006486,0.975252
