<a href="https://colab.research.google.com/github/hruthiksiva/fake-review-detection-nlp-project/blob/main/fake_review_detection_svm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
import pandas as pd # For Data Manipulation & Analysis
import numpy as np # For tools used in numerical calculation
from scipy.sparse import  hstack # horizontally stack sparse matrices -> for storing data with many zeros
import matplotlib.pyplot as plt 
import seaborn as sns # high-level data visualization library built on top of matplotlib
%matplotlib inline 
import re # For expression checking

In [9]:
contractions_dict={ # Gives proper form to contradictions
    "ain't": "are not","aren't": "are not","can't": "can not", "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not","hadn't": "had not",
    "hadn't've": "had not have",  "hasn't": "has not",
    "haven't": "have not",  "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how're": "how are",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",  "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have","shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you shall have",
    "you're": "you are",
    "you've": "you have"
}

In [10]:
url="https://raw.githubusercontent.com/hruthiksiva/fake-review-detection-nlp-project/main/selected.txt" # Used to access dataset
df=pd.read_csv(url,delimiter="\t") # \t -> separate columns by tabs instead of commas
df.head() #  Displays the first 5 rows of the DataFrame df

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [11]:
df["LABEL"]=df["LABEL"].replace(['__label2__','__label1__'],['0','1']) # Converts text into numerics for easier processing
df["LABEL"]=df["LABEL"].astype(np.float32) # Ensures numeric consistincy in dataset
dfs=df[["REVIEW_TEXT","LABEL","PRODUCT_CATEGORY","VERIFIED_PURCHASE","RATING","REVIEW_TITLE"]] # Creating a new df with new cols
dfs = dfs.rename(columns={'REVIEW_TEXT': 'reviews',"LABEL":"label","VERIFIED_PURCHASE":"verify","REVIEW_TITLE":"title"}) # Simplifying names

In [12]:
dfs.head()

Unnamed: 0,reviews,label,PRODUCT_CATEGORY,verify,RATING,title
0,"When least you think so, this product will sav...",1.0,PC,N,4,useful
1,Lithium batteries are something new introduced...,1.0,Wireless,Y,4,New era for batteries
2,I purchased this swing for my baby. She is 6 m...,1.0,Baby,N,3,doesn't swing very well.
3,I was looking for an inexpensive desk calcolat...,1.0,Office Products,N,4,Great computing!
4,I only use it twice a week and the results are...,1.0,Beauty,N,4,Only use twice a week


In [30]:
dfs["verify"]=dfs["verify"].replace(['N','Y'],['0','1']) # Same step as above
dfs["RATING"].value_counts() # The ratings are displayed with reference to how many reviews have them

RATING
5    12210
4     3973
3     1868
1     1757
2     1192
Name: count, dtype: int64

In [14]:
dfs.info() # Summary of dataframe -> total/non missing vals

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   reviews           21000 non-null  object 
 1   label             21000 non-null  float32
 2   PRODUCT_CATEGORY  21000 non-null  object 
 3   verify            21000 non-null  object 
 4   RATING            21000 non-null  int64  
 5   title             21000 non-null  object 
dtypes: float32(1), int64(1), object(4)
memory usage: 902.5+ KB


In [15]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence
TAG_RE = re.compile(r'<[^>]+>') # expression to match html tags

def remove_tags(text):
    return TAG_RE.sub('', text) # Remove html tags by adding a empty string in place
           
from nlppreprocess import NLP
NLP=NLP
obj=NLP(remove_stopwords=True,remove_numbers=True,replace_words=True,lemmatize=True, # Removes common words, numbers, contrdictions -> replace them & words are converted into base form
    lemmatize_method='wordnet')
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys())) # Matches any of the contradiction keys
def expand_contractions(s, contractions_dict=contractions_dict): # The function takes in sentences
     def replace(match):
        return contractions_dict[match.group(0)] # A helper function is used to do so and the expanded sentence is returned
     return contractions_re.sub(replace, s) # Applies replace function

dfs["reviews"]=dfs["reviews"].str.lower() # Converts reviews to lower case
for i in range(len(dfs["reviews"])):
    dfs.loc[i, "reviews"] = expand_contractions(str(dfs.loc[i, "reviews"]))
 # each review is converted into a string and contradictions are expanded
dfs["reviews"]=dfs["reviews"].apply(preprocess_text) # Removes HTML tags, non-alphabetic characters, single characters, and extra spaces
dfs["reviews"]=dfs["reviews"].apply(remove_tags) # Ensures that the HTML tags are removed
dfs["reviews"]=dfs["reviews"].apply(obj.process) # removes stopwords, numbers, replace words, and lemmatize the text.

In [16]:
dfs["len_review"]=dfs["reviews"].apply(lambda x :len(x.split())) # A new column -> len_reviews is created and it stores len of reviews

In [17]:
c=0
for i in dfs["len_review"]: # How many views exceeded 100 words
    if 100<i:
      c+=1
print(c)

1732


In [18]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   reviews           21000 non-null  object 
 1   label             21000 non-null  float32
 2   PRODUCT_CATEGORY  21000 non-null  object 
 3   verify            21000 non-null  object 
 4   RATING            21000 non-null  int64  
 5   title             21000 non-null  object 
 6   len_review        21000 non-null  int64  
dtypes: float32(1), int64(2), object(4)
memory usage: 1.0+ MB


In [19]:
dfse = dfs[dfs["len_review"] < 100]
dfse.reset_index(drop=True, inplace=True) # first -> new index & second -> Ensures new index isnt added as a new column and assigns te back to the df

# Use .loc to update the 'label' column safely
dfse.loc[:, "label"] = dfse["label"].astype(np.int64)

dfse.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19242 entries, 0 to 19241
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   reviews           19242 non-null  object 
 1   label             19242 non-null  float32
 2   PRODUCT_CATEGORY  19242 non-null  object 
 3   verify            19242 non-null  object 
 4   RATING            19242 non-null  int64  
 5   title             19242 non-null  object 
 6   len_review        19242 non-null  int64  
dtypes: float32(1), int64(2), object(4)
memory usage: 977.3+ KB


In [20]:
from sklearn import model_selection # For spliting data into training sets
from sklearn.feature_extraction.text import TfidfVectorizer # Used to convert a collection of text documents into a matrix of TF-IDF (Term Frequency-Inverse Document Frequency)
from sklearn import  naive_bayes ,svm # For text classification 
from sklearn.metrics import accuracy_score # Used to calculate accuracy of the model
import scipy as sp # Used for specific and technical computing

In [21]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(dfse[["reviews","title","verify"]],dfse["label"],test_size=0.2) # 80 of the data is used for training 

In [22]:
Tfidf_vect = TfidfVectorizer(max_features=5000,ngram_range=(2,2)) # A vector used to convert text document into a matrix of TF-IDF features/top 5000 features will be kept/range when tokenizing -> pair of words
Trained_Tfidf_vect = Tfidf_vect.fit(Train_X["reviews"].values) # fit -> analyzes training set and builds vocabulary and IDF score

Train_X_Tfidf = Trained_Tfidf_vect.transform(Train_X["reviews"].values) # transform -> make the matrix based on vocabulary
Test_X_Tfidf = Trained_Tfidf_vect.transform(Test_X["reviews"].values) # same for test data

In [23]:
print (Train_X['reviews'].shape, '\n') # The number of rows and cols of reviews 

(15393,) 



In [24]:
Train_VP_Column = [ [vp] for vp in Train_X["verify"].values ] # extracts values from verify
print ("sample vp column:", Train_VP_Column[:4])
Train_VP_Column = np.array(Train_VP_Column) # Convert into array to handle more efficiently
print (Train_VP_Column.shape)

sample vp column: [['1'], ['0'], ['0'], ['0']]
(15393, 1)


In [25]:
New_Train_X_Tfidf = sp.sparse.hstack((Train_X_Tfidf,Train_VP_Column.astype(float)))  # adding "VERIFIED_PURCHASE" to the Input and horizontally stack sparse matrices
print (New_Train_X_Tfidf.shape)

(15393, 5001)


In [26]:
print(Test_X_Tfidf.shape)
Test_VP_Column = [[vp] for vp in Test_X["verify"].values]  # Create the verification column for test data
Test_VP_Column = np.array(Test_VP_Column)  # Convert it to a numpy array
print(Test_VP_Column.shape)  # Print the shape


(3849, 5000)
(3849, 1)


In [27]:
Test_VP_Column = [ [vp] for vp in Test_X["verify"].values ] 
Test_VP_Column = np.array(Test_VP_Column)
New_Test_X_Tfidf = hstack((Test_X_Tfidf, Test_VP_Column.astype(float)))

In [28]:
svmmodel= svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto') # Smaller c -> simpler model, we are using linear kernel -> works on linear data, used when non-linear, specifies kernel cooeficient
svmmodel.fit(New_Train_X_Tfidf, Train_Y)

predictions_SVM = svmmodel.predict(New_Test_X_Tfidf)
print("SVM Accuracy Score:", accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score: 80.35853468433359


In [29]:
import pickle
filename = 'svmmodelp.sav'
pickle.dump(svmmodel, open(filename, 'wb'))