In [None]:
import pandas as pd
import spacy
# Load the pre-processed Yelp review table that every later cell works from.
dataset = pd.read_csv(
    filepath_or_buffer='./data/yelp_dataset_processed.csv',
)

In [None]:
# Split the review texts into two Series according to the 'legit' flag.
# Each Series keeps the row labels of `dataset`.
fake_reviews = dataset.loc[dataset['legit'].eq(False), 'review']
legit_reviews = dataset.loc[dataset['legit'].eq(True), 'review']

In [None]:
def _is_empty_review(reviews):
    """Return a boolean mask marking reviews that carry no text.

    pandas.read_csv turns empty CSV fields into NaN by default, so comparing
    against '' alone never matches a review that was empty in the file.
    A review counts as empty when it is missing (NaN/None) or when it holds
    nothing but whitespace.
    """
    return reviews.isna() | reviews.astype(str).str.strip().eq('')


fake_empty_reviews = fake_reviews[_is_empty_review(fake_reviews)]
legit_empty_reviews = legit_reviews[_is_empty_review(legit_reviews)]
print(f"Number of fake empty reviews: {len(fake_empty_reviews)}")
print(f"Number of legit empty reviews: {len(legit_empty_reviews)}")

In [None]:
nlp = spacy.load("en_core_web_sm")


def _pos_sequences(reviews, label, progress_every=1000):
    """Tag every review with spaCy and return one list of POS tags per review.

    Parameters
    ----------
    reviews : pandas.Series
        Review texts. Missing values are treated as empty strings so the
        result keeps exactly one entry per input row, in the same order.
    label : str
        Name used in the progress messages (e.g. "fake").
    progress_every : int
        Print a progress line after this many reviews, and once at the end,
        instead of one line per review, which floods the notebook output.

    Returns
    -------
    list[list[str]]
        The coarse-grained POS tag (``token.pos_``) of every token, per review.
    """
    # nlp.pipe only accepts strings; NaN (an empty CSV field) would raise.
    texts = reviews.fillna('').astype(str)
    total = len(texts)
    sequences = []
    # Named-entity recognition is not needed for POS tags, so it stays disabled.
    for done, doc in enumerate(nlp.pipe(texts, disable=["ner"]), start=1):
        sequences.append([token.pos_ for token in doc])
        if done % progress_every == 0 or done == total:
            print(f"Processing {label} review {done}/{total}")
    return sequences


fake_pos = _pos_sequences(fake_reviews, "fake")
legit_pos = _pos_sequences(legit_reviews, "legit")

In [None]:
from collections import Counter
import seaborn as sns

import matplotlib.pyplot as plt

# Tally how often each POS tag occurs across all fake reviews
# (fake_pos holds one list of tags per review).
pos_counts = Counter(tag for tags in fake_pos for tag in tags)

# Tabulate the tallies, most frequent tag first, ready for plotting.
pos_df = pd.DataFrame(list(pos_counts.items()), columns=['pos', 'count'])
pos_df = pos_df.sort_values(by='count', ascending=False)

# Bar chart with the bars in the same descending order as the table.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=pos_df, x='pos', y='count', order=pos_df['pos'], ax=ax)
ax.set_title('POS tag counts in fake reviews')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Tally how often each POS tag occurs across all legit reviews
# (legit_pos holds one list of tags per review).
pos_counts_legit = Counter(tag for tags in legit_pos for tag in tags)

# Tabulate the tallies, most frequent tag first, ready for plotting.
pos_df_legit = pd.DataFrame(list(pos_counts_legit.items()), columns=['pos', 'count'])
pos_df_legit = pos_df_legit.sort_values(by='count', ascending=False)

# Bar chart with the bars in the same descending order as the table.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=pos_df_legit,
    x='pos',
    y='count',
    order=pos_df_legit['pos'],
    ax=ax,
)
ax.set_title('POS tag counts in legit reviews')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Number of tokens tagged PRON in each review, one entry per review.
legit_pronouns = [sum(1 for tag in tags if tag == 'PRON') for tags in legit_pos]
fake_pronouns = [sum(1 for tag in tags if tag == 'PRON') for tags in fake_pos]

# Overlay the two density estimates on one axis so they can be compared directly.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(
    legit_pronouns,
    label='Legit Reviews',
    fill=True,
    color='blue',
    alpha=0.5,
    ax=ax,
)
sns.kdeplot(
    fake_pronouns,
    label='Fake Reviews',
    fill=True,
    color='red',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Distribution of Pronoun Counts in Reviews')
ax.set_xlabel('Number of Pronouns')
ax.set_ylabel('Density')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
def _word_counts_excluding_pos(pos_seqs, reviews, excluded_pos=('PRON', 'PUNCT', 'SPACE')):
    """Count lower-cased tokens whose POS tag is not in ``excluded_pos``.

    The POS sequences were produced from spaCy tokens, so they must be paired
    with spaCy tokens. Pairing them with ``review.split()`` shifts every tag
    after the first punctuation mark or contraction, because spaCy splits
    those into separate tokens and whitespace splitting does not.
    Re-running only the tokenizer is cheap and is expected to reproduce the
    tokens the tags were computed for; a length mismatch raises instead of
    silently misaligning.

    Parameters
    ----------
    pos_seqs : list[list[str]]
        One list of POS tags per review, in the same order as ``reviews``.
    reviews : pandas.Series
        Review texts. Missing values are treated as empty strings.
    excluded_pos : iterable of str
        POS tags to leave out. Pronouns are the point of this analysis;
        punctuation and whitespace tokens are skipped because they are not words.

    Returns
    -------
    collections.Counter
        Token (lower-cased) -> number of occurrences.

    Raises
    ------
    ValueError
        If a review tokenizes to a different number of tokens than it has tags.
    """
    excluded = frozenset(excluded_pos)
    texts = reviews.fillna('').astype(str)
    counts = Counter()
    for pos_seq, doc in zip(pos_seqs, nlp.tokenizer.pipe(texts)):
        if len(doc) != len(pos_seq):
            raise ValueError(
                "POS tags and tokens are out of sync; re-run the POS tagging cell "
                "on the current reviews before running this cell."
            )
        counts.update(
            token.lower_ for pos, token in zip(pos_seq, doc) if pos not in excluded
        )
    return counts


legit_top_5_words_no_pron_spacy = _word_counts_excluding_pos(legit_pos, legit_reviews)
fake_top_5_words_no_pron_spacy = _word_counts_excluding_pos(fake_pos, fake_reviews)
print("Top 5 words in legit reviews (excluding pronouns):")
print(legit_top_5_words_no_pron_spacy.most_common(5))
print("Top 5 words in fake reviews (excluding pronouns):")
print(fake_top_5_words_no_pron_spacy.most_common(5))

In [None]:
# Locate legit reviews whose text is repeated verbatim (copied & pasted).
legit_df = dataset.loc[dataset['legit'] == True].copy()

# Number of occurrences of every distinct review text among legit reviews.
dup_counts_legit = legit_df['review'].value_counts()

# Restrict to the texts that occur at least twice.
repeated_legit_counts = dup_counts_legit[dup_counts_legit > 1]
dup_texts_legit = repeated_legit_counts.index

# Every legit row whose text is repeated, annotated with how often that text occurs.
is_repeated_legit = legit_df['review'].isin(dup_texts_legit)
copied_and_pasted_legit_reviews = legit_df[is_repeated_legit].copy()
copied_and_pasted_legit_reviews['duplicate_count'] = (
    copied_and_pasted_legit_reviews['review'].map(dup_counts_legit)
)

# Headline numbers.
print(f"Unique duplicated legit review texts: {len(dup_texts_legit)}")
print(f"Total legit review rows that are duplicates: {len(copied_and_pasted_legit_reviews)}")

# The ten most repeated texts, then a preview of the affected rows.
print("\nTop duplicated legit review texts (text -> count):")
print(repeated_legit_counts.head(10))

copied_and_pasted_legit_reviews.sort_values(by='duplicate_count', ascending=False).head(20)

In [None]:
# Locate fake reviews whose text is repeated verbatim (copied & pasted).
fake_df = dataset.loc[dataset['legit'] == False].copy()

# Number of occurrences of every distinct review text among fake reviews.
fake_dup_counts = fake_df['review'].value_counts()

# Restrict to the texts that occur at least twice.
repeated_fake_counts = fake_dup_counts[fake_dup_counts > 1]
fake_dup_texts = repeated_fake_counts.index

# Every fake row whose text is repeated, annotated with how often that text occurs.
is_repeated_fake = fake_df['review'].isin(fake_dup_texts)
copied_and_pasted_fake_reviews = fake_df[is_repeated_fake].copy()
copied_and_pasted_fake_reviews['duplicate_count'] = (
    copied_and_pasted_fake_reviews['review'].map(fake_dup_counts)
)

print(f"Unique duplicated fake review texts: {len(fake_dup_texts)}")
print(f"Total fake review rows that are duplicates: {len(copied_and_pasted_fake_reviews)}")
print("\nTop duplicated fake review texts (text -> count):")
print(repeated_fake_counts.head(10))
copied_and_pasted_fake_reviews.sort_values(by='duplicate_count', ascending=False).head(20)

In [None]:
import textdescriptives as td

# Readability metrics for the fake reviews.
# extract_metrics expects a string or a list of strings, so missing reviews
# are turned into '' and the Series is handed over as a plain list.
fake_texts = fake_reviews.fillna('').astype(str)
readability_df = td.extract_metrics(
    text=fake_texts.tolist(),
    spacy_model="en_core_web_sm",
    metrics=["readability"]
)

# Assumes extract_metrics returns one row per input text, in input order
# (TODO confirm against the installed textdescriptives version). Re-using the
# reviews' own row labels makes the join below line up; the index assignment
# raises if the row counts differ.
fake_readability = readability_df.drop(columns=["text"])
fake_readability.index = fake_texts.index

# `fake_reviews` is a Series, which has no .join method, so the text is first
# turned into a one-column frame. The result gets its own name so that
# `fake_reviews` stays a Series and this cell can be re-run safely.
fake_reviews_with_readability = fake_texts.to_frame().join(fake_readability)

# The "readability" metric group is not expected to yield a column of that
# name, so report the mean of every numeric metric column instead.
print("Fake reviews readability mean from textdescriptives:")
print(fake_readability.mean(numeric_only=True))


print(fake_reviews_with_readability.head())

In [None]:
import textdescriptives as td

# Readability metrics for the legit reviews.
# extract_metrics expects a string or a list of strings, so missing reviews
# are turned into '' and the Series is handed over as a plain list.
legit_texts = legit_reviews.fillna('').astype(str)
readability_df = td.extract_metrics(
    text=legit_texts.tolist(),
    spacy_model="en_core_web_sm",
    metrics=["readability"]
)

# Assumes extract_metrics returns one row per input text, in input order
# (TODO confirm against the installed textdescriptives version). Re-using the
# reviews' own row labels makes the join below line up; the index assignment
# raises if the row counts differ.
legit_readability = readability_df.drop(columns=["text"])
legit_readability.index = legit_texts.index

# `legit_reviews` is a Series, which has no .join method, so the text is first
# turned into a one-column frame. The result gets its own name so that
# `legit_reviews` stays a Series and this cell can be re-run safely.
legit_reviews_with_readability = legit_texts.to_frame().join(legit_readability)

# The "readability" metric group is not expected to yield a column of that
# name, so report the mean of every numeric metric column instead.
print("Legit reviews readability mean from textdescriptives:")
print(legit_readability.mean(numeric_only=True))

print(legit_reviews_with_readability.head())