In [1]:
# Parameters
input_review = "I have read James Kipling's books and thoroughly enjoyed them. This one, I had to force myself to read. The ending sucked big time. I can't say it in any other way. Very disappointed. So fake. So unbelievable."
input_rating = 2.0


In [2]:
import papermill as pm
import sys

# Read input parameters
try:
    input_review = input_review  # This comes from papermill
    input_rating = input_rating
except NameError:
    input_review = "Default review"
    input_rating = 5.0  # Default values for testing

print(f"Processing Review: {input_review}")
print(f"Processing Rating: {input_rating}")


Processing Review: I have read James Kipling's books and thoroughly enjoyed them. This one, I had to force myself to read. The ending sucked big time. I can't say it in any other way. Very disappointed. So fake. So unbelievable.
Processing Rating: 2.0


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk


import string


# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('popular')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\

[nltk_data]    |   Package wordnet2021 is already up-to-date!
[nltk_data]    | Downloading package wordnet31 to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package wordnet31 is already up-to-date!
[nltk_data]    | Downloading package wordnet_ic to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package wordnet_ic is already up-to-date!
[nltk_data]    | Downloading package words to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package words is already up-to-date!
[nltk_data]    | Downloading package maxent_ne_chunker to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package maxent_ne_chunker is already up-to-date!
[nltk_data]    | Downloading package punkt to
[nltk_data]    |     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]    |   Package punkt is already up-to-date!
[nltk_data]    | Downloading package snowball_data to
[nltk_d

True

In [4]:
input_review = globals().get('input_review', "")  # Default to empty string if not provided
input_rating = globals().get('input_rating', 0.0)  # Default to 0.0 if not provided

input_rating = float(input_rating)  
input_review = str(input_review) 

# Make sure the inputs are properly received
print(f"Received Review: {input_review}")
print(f"Received Rating: {input_rating}")

Received Review: I have read James Kipling's books and thoroughly enjoyed them. This one, I had to force myself to read. The ending sucked big time. I can't say it in any other way. Very disappointed. So fake. So unbelievable.
Received Rating: 2.0


In [5]:

input_review = input_review.lower()


In [6]:
# Remove special characters

def remove_punctuation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('','',punctuations))
input_review = remove_punctuation(input_review)



In [7]:
from nltk.corpus import stopwords
", ".join(stopwords.words("english"))


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [8]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
  return " ".join([word for word in text.split() if word not in STOPWORDS])

In [9]:
input_review = remove_stopwords(input_review)


In [10]:
# Removing of Frequent Words

from collections import Counter
word_count = Counter()

for text in input_review:
  for word in text.split():
    word_count[word] += 1

word_count.most_common(10)

[('e', 15),
 ('a', 9),
 ('i', 8),
 ('n', 8),
 ('o', 8),
 ('d', 7),
 ('s', 6),
 ('r', 4),
 ('k', 4),
 ('l', 4)]

In [11]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[-10:])
print(RARE_WORDS)

{'m', 'y', 'h', 'w', 'j', 'c', 'v', 'p', 'f', 'u'}


In [12]:
import re
def remove_special_char(text):
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [13]:
input_review= remove_special_char(input_review)

In [14]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
  return " ".join([ps.stem(word) for word in text.split()])

In [15]:
input_review = stem_words(input_review)


In [16]:
from nltk import download
download('wordnet', force=True)
download('averaged_perceptron_tagger', force=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [17]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

text = "NLTK is a powerful library for NLP."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)

print(tagged_tokens)

[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('NLP', 'NNP'), ('.', '.')]


In [18]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
wordnet_map = {'N':wordnet.NOUN, 'V':wordnet.VERB, 'J':wordnet.ADJ, 'R':wordnet.ADV}

def lemmatize_words(text):
  pos_text = pos_tag(text.split())
  return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\achyu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
input_review = lemmatize_words(input_review)

In [20]:
import joblib

tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

svm_model = joblib.load('svm_model.pkl')

rf_model = joblib.load('rf_model.pkl')

lr_model = joblib.load('lr_model.pkl')

In [21]:
from scipy.sparse import hstack


In [22]:


X_lemmatized_tfidf = tfidf_vectorizer.transform([input_review])


final_features = hstack([[input_rating], X_lemmatized_tfidf])

final_text = final_features.toarray()

# Predictions from all models

svm_prediction = svm_model.predict(final_text)
rf_prediction = rf_model.predict(final_text)
lr_prediction = lr_model.predict(final_text)


labels = {0: "Fake", 1: "Real"}

print(f"SVM Prediction: {labels[svm_prediction[0]]}")
print(f"Random Forest Prediction: {labels[rf_prediction[0]]}")
print(f"Logistic Regression Prediction: {labels[lr_prediction[0]]}")

SVM Prediction: Fake
Random Forest Prediction: Real
Logistic Regression Prediction: Fake
