<a href="https://colab.research.google.com/github/Preet28/Fake-Review-Detection/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository; the dataset CSV loaded later in this notebook
# is read from inside this clone.
!git clone https://github.com/Preet28/Fake-Review-Detection.git

Cloning into 'Fake-Review-Detection'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 21 (delta 8), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (21/21), 466.79 KiB | 3.11 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [3]:
import pandas as pd
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# Read the review corpus from the cloned repository and preview a few rows
DATA_PATH = "/content/Fake-Review-Detection/deceptive-opinion.csv"
df = pd.read_csv(DATA_PATH)
df.head(4)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...


In [5]:
# Removing unnecessary columns.
# `errors='ignore'` keeps this cell re-runnable: because the result is assigned
# back to `df`, a second run would otherwise raise KeyError for the columns
# that were already dropped on the first run.
df = df.drop(columns=['hotel', 'polarity'], errors='ignore')
df.head(2)

Unnamed: 0,deceptive,source,text
0,truthful,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,TripAdvisor,Triple A rate with upgrade to view room was le...


In [6]:
# Downloading stopwords
nltk.download('stopwords')

# Getting the English stopwords list.
# Stored under its own name so the imported `stopwords` corpus reader is not
# overwritten; rebinding `stopwords` to a set makes this cell fail on re-run
# with "AttributeError: 'set' object has no attribute 'words'".
english_stopwords = set(stopwords.words('english'))

# Clean the text data
def clean_text(text):
    """Normalize one review string for vectorization.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase the result, then drop whitespace-separated
    tokens that appear in ``english_stopwords``.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text, with remaining tokens joined by single spaces.
    """
    # Removing punctuation (raw string so \w and \s reach the regex engine
    # without triggering invalid-escape-sequence warnings)
    text = re.sub(r'[^\w\s]', '', text)
    # Converting to lowercase
    text = text.lower()
    # Removing stopwords.
    # NOTE(review): punctuation is stripped first, so contractions such as
    # "i'm" become "im" and may no longer match the stopword list — confirm
    # whether that is intended.
    text = ' '.join(word for word in text.split() if word not in english_stopwords)
    return text

df['text'] = df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Spot-check the last few cleaned reviews
df['text'].tail(3)

1597    intercontinental chicago magnificent mile outs...
1598    palmer house hilton looks good pictures outsid...
1599    former chicagoan im appalled amalfi hotel chic...
Name: text, dtype: object

In [9]:
# Features are the cleaned review texts; labels are the `deceptive` column
X, y = df['text'], df['deceptive']

# Hold out 20% of the reviews for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer: fitted on the training texts only, then reused
# to transform the held-out texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [15]:
# Build an SVM classifier with default settings and train it on the
# TF-IDF training matrix
clf = SVC()
clf.fit(X_train, y_train)

# Predict labels for the held-out test matrix
y_pred = clf.predict(X_test)

In [17]:
# Compare the predictions against the held-out labels:
# overall accuracy and the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix \n", conf_matrix)

Accuracy: 0.89375
Confusion Matrix 
 [[138  14]
 [ 20 148]]


In [19]:
# Text summary of per-class precision, recall and F1 on the test set
report = classification_report(
    y_test,
    y_pred,
)
print("Classification Report : \n", report)

Classification Report : 
               precision    recall  f1-score   support

   deceptive       0.87      0.91      0.89       152
    truthful       0.91      0.88      0.90       168

    accuracy                           0.89       320
   macro avg       0.89      0.89      0.89       320
weighted avg       0.89      0.89      0.89       320

