<a href="https://colab.research.google.com/github/Preet28/Fake-Review-Detection/blob/main/ClassifierAlgo_unigrams_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Preet28/Fake-Review-Detection.git

Cloning into 'Fake-Review-Detection'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 59 (delta 29), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (59/59), 924.00 KiB | 4.16 MiB/s, done.
Resolving deltas: 100% (29/29), done.


In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
# Load the hotel-review dataset from the cloned repository.
# The path is relative to the working directory (where `git clone` puts the
# repository), so it resolves in Colab (cwd is /content) and in a local checkout.
df = pd.read_csv('Fake-Review-Detection/deceptive-opinion.csv')
df.head(3)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...


In [5]:
# Remove the metadata columns; only 'deceptive' (label) and 'text' are used below.
# errors='ignore' keeps the cell re-runnable: `df` is rebound here, so on a
# second run the columns are already gone and a strict drop would raise KeyError.
df = df.drop(columns=['hotel', 'source', 'polarity'], errors='ignore')
df.head(2)

Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...


In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# English stopwords; the import above restores the reader whenever the cell is re-run.
stopwords = set(stopwords.words('english'))

def clean_text(text):
  """Normalize one review string for vectorization.

  Steps, in order:
    1. remove every character that is neither a word character nor whitespace,
    2. lowercase the result,
    3. split on whitespace and drop tokens found in the English stopword set.

  Args:
    text: the raw review text (str).

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Raw string: '\w' and '\s' are regex classes, not Python string escapes.
  text = re.sub(r'[^\w\s]', '', text)
  text = text.lower()
  text = ' '.join(word for word in text.split() if word not in stopwords)
  return text

df['text'] = df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Spot-check the last few cleaned reviews
df['text'].tail(3)

1597    intercontinental chicago magnificent mile outs...
1598    palmer house hilton looks good pictures outsid...
1599    former chicagoan im appalled amalfi hotel chic...
Name: text, dtype: object

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Labels come from the 'deceptive' column, features from the cleaned review text
y = df['deceptive']
X = df['text']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Unigram TF-IDF features. The vectorizer is fitted on the training reviews only
# and then applied, unchanged, to the test reviews.
# NOTE: X_train / X_test are rebound from raw text to the vectorized matrices,
# so re-run the split cell before re-running this one.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
# Seed for the estimators that randomize internally (tree and forest), so that
# repeated runs of this cell report the same scores. Same value as the split seed.
RANDOM_STATE = 42

# Display name -> unfitted estimator; every model is trained on the same TF-IDF features.
classifiers = {
    'Logistic Regression' : LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'k-Nearest Neighbors (k=5)': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Support Vector Machine': SVC()
}

# Training and evaluating each classifier
for name, classifier in classifiers.items():
    print(f"Classifier: {name}")
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # Accuracy is the share of matching labels; reuse y_pred instead of
    # calling classifier.score(), which would run predict() a second time.
    accuracy = float(np.mean(y_pred == np.asarray(y_test)))
    print("Accuracy:", accuracy*100)
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix \n",conf_matrix)
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)
    print("*" * 50)

Classifier: Logistic Regression
Accuracy: 89.0625
Confusion Matrix 
 [[139  13]
 [ 22 146]]
Classification Report:
               precision    recall  f1-score   support

   deceptive       0.86      0.91      0.89       152
    truthful       0.92      0.87      0.89       168

    accuracy                           0.89       320
   macro avg       0.89      0.89      0.89       320
weighted avg       0.89      0.89      0.89       320

**************************************************
Classifier: Naive Bayes
Accuracy: 85.0
Confusion Matrix 
 [[145   7]
 [ 41 127]]
Classification Report:
               precision    recall  f1-score   support

   deceptive       0.78      0.95      0.86       152
    truthful       0.95      0.76      0.84       168

    accuracy                           0.85       320
   macro avg       0.86      0.85      0.85       320
weighted avg       0.87      0.85      0.85       320

**************************************************
Classifier: k-Nearest N