In [None]:
# Dont delete this cell although it is empty

In [2]:
# Parameters
input_review = "Just bought some of these from Walmart for $1.98....$8 seems a little ridiculous.  I haven't used them yet...trying to make an adapter to match the threaded part (my hydration pack is not Outdoor Products). Other than that they seem well designed and worth 2-5 dollars."
input_rating = 5.0


In [3]:
import papermill as pm
import sys

# Read input parameters
try:
    input_review = input_review  # This comes from papermill
    input_rating = input_rating
except NameError:
    input_review = "Default review"
    input_rating = 5.0  # Default values for testing

input_review_copy = input_review
print(f"Processing Review: {input_review}")
print(f"Processing Rating: {input_rating}")


Processing Review: Just bought some of these from Walmart for $1.98....$8 seems a little ridiculous.  I haven't used them yet...trying to make an adapter to match the threaded part (my hydration pack is not Outdoor Products). Other than that they seem well designed and worth 2-5 dollars.
Processing Rating: 5.0


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk


import string


# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('popular')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Dell\AppData\Roaming\nltk_d

True

In [5]:
input_review = globals().get('input_review', "")  # Default to empty string if not provided
input_rating = globals().get('input_rating', 0.0)  # Default to 0.0 if not provided

input_rating = float(input_rating)  
input_review = str(input_review) 

# Make sure the inputs are properly received
print(f"Received Review: {input_review}")
print(f"Received Rating: {input_rating}")

Received Review: Just bought some of these from Walmart for $1.98....$8 seems a little ridiculous.  I haven't used them yet...trying to make an adapter to match the threaded part (my hydration pack is not Outdoor Products). Other than that they seem well designed and worth 2-5 dollars.
Received Rating: 5.0


In [6]:

input_review = input_review.lower()


In [7]:
# Remove special characters

def remove_punctuation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('','',punctuations))
input_review = remove_punctuation(input_review)



In [8]:
from nltk.corpus import stopwords
", ".join(stopwords.words("english"))


"a, about, above, after, again, against, ain, all, am, an, and, any, are, aren, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can, couldn, couldn't, d, did, didn, didn't, do, does, doesn, doesn't, doing, don, don't, down, during, each, few, for, from, further, had, hadn, hadn't, has, hasn, hasn't, have, haven, haven't, having, he, he'd, he'll, her, here, hers, herself, he's, him, himself, his, how, i, i'd, if, i'll, i'm, in, into, is, isn, isn't, it, it'd, it'll, it's, its, itself, i've, just, ll, m, ma, me, mightn, mightn't, more, most, mustn, mustn't, my, myself, needn, needn't, no, nor, not, now, o, of, off, on, once, only, or, other, our, ours, ourselves, out, over, own, re, s, same, shan, shan't, she, she'd, she'll, she's, should, shouldn, shouldn't, should've, so, some, such, t, than, that, that'll, the, their, theirs, them, themselves, then, there, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, 

In [9]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
  return " ".join([word for word in text.split() if word not in STOPWORDS])

In [10]:
input_review = remove_stopwords(input_review)


In [11]:
# Removing of Frequent Words

from collections import Counter
word_count = Counter()

for text in input_review:
  for word in text.split():
    word_count[word] += 1

word_count.most_common(10)

[('t', 15),
 ('e', 15),
 ('a', 12),
 ('r', 11),
 ('d', 11),
 ('o', 9),
 ('l', 8),
 ('s', 8),
 ('u', 6),
 ('h', 6)]

In [12]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[-10:])
print(RARE_WORDS)

{'w', 'y', 'v', 'b', 'k', '1', '2', '5', '8', '9'}


In [13]:
import re
def remove_special_char(text):
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [14]:
input_review= remove_special_char(input_review)

In [15]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
  return " ".join([ps.stem(word) for word in text.split()])

In [16]:
input_review = stem_words(input_review)


In [17]:
from nltk import download
download('wordnet', force=True)
download('averaged_perceptron_tagger', force=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [18]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

text = "NLTK is a powerful library for NLP."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)

print(tagged_tokens)

[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('NLP', 'NNP'), ('.', '.')]


In [19]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
wordnet_map = {'N':wordnet.NOUN, 'V':wordnet.VERB, 'J':wordnet.ADJ, 'R':wordnet.ADV}

def lemmatize_words(text):
  pos_text = pos_tag(text.split())
  return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
input_review = lemmatize_words(input_review)

In [21]:
import joblib

tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

svm_model = joblib.load('svm_model.pkl')

rf_model = joblib.load('rf_model.pkl')

lr_model = joblib.load('lr_model.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [22]:
from scipy.sparse import hstack


In [23]:
from scipy.sparse import hstack, csr_matrix
import numpy as np


X_lemmatized_tfidf = tfidf_vectorizer.transform([input_review])

# Reshape input_rating to be a 1x1 sparse matrix
input_rating_sparse = csr_matrix(np.reshape(input_rating, (1, 1))) 

# Now stack the features
final_features = hstack([input_rating_sparse, X_lemmatized_tfidf])

final_text = final_features.toarray()


# Predictions from all models

svm_prediction = svm_model.predict(final_text)
rf_prediction = rf_model.predict(final_text)
lr_prediction = lr_model.predict(final_text)


labels = {0: "Fake", 1: "Real"}
print(f"Review: {input_review_copy}")
print(f"Rating: {input_rating}")
print(f"SVM Prediction: {labels[svm_prediction[0]]}")
print(f"Random Forest Prediction: {labels[rf_prediction[0]]}")
print(f"Logistic Regression Prediction: {labels[lr_prediction[0]]}")

Review: Just bought some of these from Walmart for $1.98....$8 seems a little ridiculous.  I haven't used them yet...trying to make an adapter to match the threaded part (my hydration pack is not Outdoor Products). Other than that they seem well designed and worth 2-5 dollars.
Rating: 5.0
SVM Prediction: Fake
Random Forest Prediction: Real
Logistic Regression Prediction: Real
