<a href="https://colab.research.google.com/github/Preet28/Fake-Review-Detection/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the repository that holds the dataset CSV read by the loading cell below.
# NOTE(review): git refuses to clone into an existing directory, so this cell
# presumably fails if it is run a second time in the same session — confirm.
!git clone https://github.com/Preet28/Fake-Review-Detection.git

Cloning into 'Fake-Review-Detection'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), 445.81 KiB | 799.00 KiB/s, done.


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Loading the dataset
# The path is relative to the working directory, where the `!git clone` cell
# placed the repository, so the notebook is no longer tied to Colab's
# /content directory.
# NOTE(review): assumes the working directory has not been changed since the
# clone cell ran.
DATA_PATH = "Fake-Review-Detection/deceptive-opinion.csv"
df = pd.read_csv(DATA_PATH)
df.head(4)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...


In [5]:
# Discard the hotel and polarity columns, which the model below does not use
df = df.drop(columns=['hotel', 'polarity'])
df.head(2)

Unnamed: 0,deceptive,source,text
0,truthful,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,TripAdvisor,Triple A rate with upgrade to view room was le...


In [6]:
import nltk
# Import the corpus reader under an alias so the `stopwords` set built below
# does not rebind it. With the unaliased import, running this cell a second
# time called `.words` on the set and raised AttributeError.
from nltk.corpus import stopwords as nltk_stopwords

# Downloading stopwords
nltk.download('stopwords')

# Getting the English stopwords list as a set; clean_text tests membership
# against it once per word
stopwords = set(nltk_stopwords.words('english'))

# Clean the text data
def clean_text(text, stop_words=None):
    """Normalize one review: strip punctuation, lowercase, drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Words to discard after lowercasing. When omitted, the notebook-level
        ``stopwords`` set is used, which matches the original behaviour.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # Removing punctuation. Raw string: in a plain literal '\w' and '\s' are
    # invalid escape sequences, which newer Python versions warn about.
    # Underscores survive because they count as word characters.
    text = re.sub(r'[^\w\s]', '', text)
    # Converting to lowercase
    text = text.lower()
    # Removing stopwords.
    # NOTE(review): punctuation is stripped first, so a contraction such as
    # "I'm" becomes "im" and is only removed if that exact form is in the
    # stopword collection.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Replace each review with its cleaned form, element by element
df['text'] = df['text'].map(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Spot-check the last three cleaned reviews
df['text'].tail(3)

1597    intercontinental chicago magnificent mile outs...
1598    palmer house hilton looks good pictures outsid...
1599    former chicagoan im appalled amalfi hotel chic...
Name: text, dtype: object

In [9]:
# Features are the cleaned review texts; labels are the deceptive/truthful tags
X, y = df['text'], df['deceptive']

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Initializing CountVectorizer (no arguments passed, so library defaults apply)
vectorizer = CountVectorizer()

# Transform the text reviews into a matrix of word counts.
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that same fitted vectorizer.
# NOTE(review): X_train and X_test are rebound from review text to the
# vectorizer output, so re-running this cell without first re-running the
# split cell feeds the vectorizer its own output instead of text.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [11]:
# Initializing logistic regression model (no arguments passed, so library
# defaults apply)
logreg = LogisticRegression()

In [12]:
# Fit the classifier on the vectorized training reviews and their labels
logreg.fit(X=X_train, y=y_train)

In [13]:
# Predict a label for every review in the held-out test set
y_pred = logreg.predict(X=X_test)

In [14]:
# Score the fitted model on the held-out test set and report the result
accuracy = logreg.score(X=X_test, y=y_test)
print("Accuracy:", accuracy)

Accuracy: 0.884375


In [15]:
# Build the per-class report from the test labels and the predictions made above
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

   deceptive       0.86      0.91      0.88       152
    truthful       0.91      0.86      0.89       168

    accuracy                           0.88       320
   macro avg       0.88      0.89      0.88       320
weighted avg       0.89      0.88      0.88       320

