In [1]:
import pandas as pd

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('deceptive-opinion.csv')

# Get the unique hotel names and count
unique_hotels = df['hotel'].unique()
num_unique_hotels = len(unique_hotels)

# Get the total number of reviews
num_reviews = len(df)

# Count of reviews for each hotel
reviews_per_hotel = df['hotel'].value_counts()

# Count of positive and negative reviews
positive_reviews = df[df['polarity'] == 'positive'].shape[0]
negative_reviews = df[df['polarity'] == 'negative'].shape[0]

# Count of truthful and deceptive reviews
truthful_reviews = df[df['deceptive'] == 'truthful'].shape[0]
deceptive_reviews = df[df['deceptive'] == 'deceptive'].shape[0]

# Count of sources: MTurk and TripAdvisor
mturk_reviews = df[df['source'] == 'MTurk'].shape[0]
tripadvisor_reviews = df[df['source'] == 'TripAdvisor'].shape[0]

# Each hotel's count of positive and negative reviews
hotel_polarity_count = df.groupby(['hotel', 'polarity']).size().unstack(fill_value=0)

# Each hotel's count of source TripAdvisor or MTurk
hotel_source_count = df.groupby(['hotel', 'source']).size().unstack(fill_value=0)


In [2]:
print(f'Number of unique hotels: {num_unique_hotels}')


Number of unique hotels: 20


In [3]:
print(f'Total number of reviews: {num_reviews}')

Total number of reviews: 1600


In [5]:
print(f'Reviews per hotel:\n{reviews_per_hotel}')

Reviews per hotel:
conrad              80
hyatt               80
allegro             80
intercontinental    80
palmer              80
sofitel             80
monaco              80
james               80
hilton              80
talbott             80
hardrock            80
affinia             80
ambassador          80
swissotel           80
homewood            80
knickerbocker       80
sheraton            80
fairmont            80
omni                80
amalfi              80
Name: hotel, dtype: int64


In [6]:
print(f'Number of positive reviews: {positive_reviews}')
print(f'Number of negative reviews: {negative_reviews}')

Number of positive reviews: 800
Number of negative reviews: 800


In [7]:
print(f'Number of truthful reviews: {truthful_reviews}')
print(f'Number of deceptive reviews: {deceptive_reviews}')

Number of truthful reviews: 800
Number of deceptive reviews: 800


In [8]:
print(f'Number of MTurk reviews: {mturk_reviews}')
print(f'Number of TripAdvisor reviews: {tripadvisor_reviews}')

Number of MTurk reviews: 800
Number of TripAdvisor reviews: 400


In [10]:
print(f'Each hotel\'s count of positive and negative reviews:\n{hotel_polarity_count}')


Each hotel's count of positive and negative reviews:
polarity          negative  positive
hotel                               
affinia                 40        40
allegro                 40        40
amalfi                  40        40
ambassador              40        40
conrad                  40        40
fairmont                40        40
hardrock                40        40
hilton                  40        40
homewood                40        40
hyatt                   40        40
intercontinental        40        40
james                   40        40
knickerbocker           40        40
monaco                  40        40
omni                    40        40
palmer                  40        40
sheraton                40        40
sofitel                 40        40
swissotel               40        40
talbott                 40        40


In [11]:
print(f'Each hotel\'s count of source TripAdvisor or MTurk:\n{hotel_source_count}')

Each hotel's count of source TripAdvisor or MTurk:
source            MTurk  TripAdvisor  Web
hotel                                    
affinia              40           20   20
allegro              40           20   20
amalfi               40           20   20
ambassador           40           20   20
conrad               40           20   20
fairmont             40           20   20
hardrock             40           20   20
hilton               40           20   20
homewood             40           20   20
hyatt                40           20   20
intercontinental     40           20   20
james                40           20   20
knickerbocker        40           20   20
monaco               40           20   20
omni                 40           20   20
palmer               40           20   20
sheraton             40           20   20
sofitel              40           20   20
swissotel            40           20   20
talbott              40           20   20


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the CSV file
df = pd.read_csv('deceptive-opinion.csv')

# Preprocess the data (optional steps might include stemming, lemmatization, etc.)
# For simplicity, we will just use the text as is.

# Split the data into features and target label
X = df['text']  # the features we want to analyze
y = df['deceptive']  # the labels we want to predict

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical vectors using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a classifier, e.g., Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
predictions = classifier.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

# Filter out the deceptive reviews
df['predicted_label'] = classifier.predict(vectorizer.transform(df['text']))
filtered_df = df[df['predicted_label'] == 'truthful']

Accuracy: 0.8625


In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # Add this import statement
import joblib

# Load your dataset
df = pd.read_csv('deceptive-opinion.csv')

# Split the data into features and target label
X = df['text']  # Features
y = df['deceptive']  # Target label

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical vectors using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a classifier, e.g., Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Create the 'models/' directory if it doesn't exist
os.makedirs('models/', exist_ok=True)

# Save the trained classifier and vectorizer
joblib.dump(classifier, 'models/classifier.pkl')
joblib.dump(vectorizer, 'models/vectorizer.pkl')

# Evaluate the classifier if needed
predictions = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')


Accuracy: 0.8625
