In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [2]:
data = pd.read_csv("fake reviews dataset.csv" , usecols=["category", "rating" ,"label", "text_"])

In [3]:
data.drop('rating', inplace=True, axis=1)
data.drop('category', inplace=True, axis=1)

In [4]:
data.head()

Unnamed: 0,label,text_
0,CG,"Love this! Well made, sturdy, and very comfor..."
1,CG,"love it, a great upgrade from the original. I..."
2,CG,This pillow saved my back. I love the look and...
3,CG,"Missing information on how to use it, but it i..."
4,CG,Very nice set. Good quality. We have had the s...


In [5]:
data['label'].replace(['CG' , 'OR'] , [0 , 1] , inplace = True)
labels = data['label']
data.drop('label', inplace=True, axis=1)

In [6]:
train_X , test_X , train_Y , test_Y = train_test_split(data , labels , test_size=0.2)

In [7]:
X_train, X_test = [] , [] 
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
          
def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
    
def pre_process(text):
    if isinstance(text , str) :
        text = remove_emojis(text)
        text = text.lower()
        text = re.sub(r'\d+' , "" , text)
        text = re.sub(r"http\S+|www\S+|https\S+" , "" , text , flags = re.MULTILINE)
        text = text.translate(str.maketrans("" , "" , string.punctuation))
        text = re.sub(r'\@\w+|\#\w+' , "" , text)
    
        words = word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words]
                                 
        stemmed_words = [ps.stem(word) for word in filtered_words]
                                 
        lemmatized_words = [lemmatizer.lemmatize(word , pos = 'a') for word in stemmed_words]
        
        return " ".join(lemmatized_words)
    
for review in train_X['text_']:
    text = pre_process(review)
    X_train.append(text)

for review in test_X['text_']:
    text = pre_process(review)
    X_test.append(text)

In [8]:
Tfidf_vector = TfidfVectorizer(ngram_range = (2,2))
traindata = Tfidf_vector.fit_transform(X_train)
testdata = Tfidf_vector.transform(X_test)

In [9]:
random_forest = RandomForestClassifier(n_estimators=50, random_state=42)
random_forest.fit(traindata, train_Y)

In [None]:
#multiNB = MultinomialNB()
#multiNB.fit(traindata , train_Y)

In [10]:
def evaluation(model) :
    predictions = model.predict(testdata)
    cm_matrix = confusion_matrix(test_Y , predictions)
    score = accuracy_score(test_Y , predictions)
    report = classification_report(test_Y , predictions)
    print("--------------->CONFUSION MATRIX<----------------")
    print(cm_matrix)
    print("\n\n")
    print("--------------->ACCURACY SCORE<----------------")
    print(score)
    print("\n\n")
    print("--------------->CLASSIFICATION REPORT<----------------")
    print(report)
    print("\n\n")
    return

In [11]:
def test() :
    while True :
        txt = input(" ENTER YOUR REVIEW , EXIT FOR EXITING ")
        txt = txt.lower()
        if txt == "exit" :
            break
        text = []
        text.append(pre_process(txt))
        test = Tfidf_vector.transform(text)
        prediction = random_forest.predict(test)
        if prediction == 1 :
            print("Review is REAL")
        else :
            print("Review is FAKE")
            
    return

In [12]:
def main() :
    choice = -1
    while True :
        choice = int(input(" 1 : evaluation \n 2 : test  \n anything other : exit \n"))
        if choice == 1 :
            evaluation(random_forest)
        elif choice == 2:
            test()
        else :
            break
            
    return

In [13]:
main()

 1 : evaluation 
 2 : test  
 anything other : exit 
 1


--------------->CONFUSION MATRIX<----------------
[[3775  319]
 [1371 2622]]



--------------->ACCURACY SCORE<----------------
0.7910226289105973



--------------->CLASSIFICATION REPORT<----------------
              precision    recall  f1-score   support

           0       0.73      0.92      0.82      4094
           1       0.89      0.66      0.76      3993

    accuracy                           0.79      8087
   macro avg       0.81      0.79      0.79      8087
weighted avg       0.81      0.79      0.79      8087






 1 : evaluation 
 2 : test  
 anything other : exit 
 4
