In [None]:
import pandas as pd
import spacy
# Load the processed review table; later cells read its 'review', 'legit'
# and 'id_review' columns.
dataset = pd.read_csv(filepath_or_buffer='./data/yelp_dataset_processed.csv')

In [None]:
# Split the review texts by label.
# pd.read_csv loads an empty CSV field as NaN (a float), which spaCy's
# nlp.pipe rejects, so missing texts are normalised to '' here: every
# downstream consumer then sees a plain string.
is_fake = dataset['legit'] == False
is_legit = dataset['legit'] == True
fake_reviews = dataset.loc[is_fake, 'review'].fillna('')
legit_reviews = dataset.loc[is_legit, 'review'].fillna('')

In [None]:
# A review counts as empty when it is missing (NaN is how pd.read_csv loads
# an empty field) or when it holds nothing but whitespace.  A bare `== ''`
# comparison never matches NaN, so it would always report zero for missing
# reviews.
fake_is_blank = fake_reviews.fillna('').astype(str).str.strip() == ''
legit_is_blank = legit_reviews.fillna('').astype(str).str.strip() == ''
fake_empty_reviews = fake_reviews[fake_is_blank]
legit_empty_reviews = legit_reviews[legit_is_blank]
print(f"Number of fake empty reviews: {len(fake_empty_reviews)}")
print(f"Number of legit empty reviews: {len(legit_empty_reviews)}")

In [None]:
nlp = spacy.load("en_core_web_sm")

# Emit a progress line only every this many reviews (and for the last one),
# so the cell output is not flooded with one line per review.
PROGRESS_EVERY = 1000


def _tag_reviews(texts, label):
    """Run the spaCy pipeline over `texts` and collect its results.

    Parameters
    ----------
    texts : sized iterable of str
        Review texts, processed in order through `nlp.pipe`.
    label : str
        Word inserted into the progress message (e.g. "fake").

    Returns
    -------
    (pos_sequences, docs)
        `pos_sequences[i]` is the list of coarse POS tags of review i and
        `docs[i]` is the corresponding spaCy Doc.  The Docs are kept whole
        because later cells read token-level attributes from them.
    """
    total = len(texts)
    pos_sequences = []
    docs = []
    for position, doc in enumerate(nlp.pipe(texts), start=1):
        if position % PROGRESS_EVERY == 0 or position == total:
            print(f"Processing {label} review {position}/{total}")
        pos_sequences.append([token.pos_ for token in doc])
        docs.append(doc)
    return pos_sequences, docs


fake_pos, fake_info_row = _tag_reviews(fake_reviews, "fake")
legit_pos, legit_info_row = _tag_reviews(legit_reviews, "legit")

In [None]:
from collections import Counter
import seaborn as sns

import matplotlib.pyplot as plt

# Tally every POS tag over all fake reviews (fake_pos holds one tag list per review).
pos_counts = Counter(tag for tags in fake_pos for tag in tags)

# Tabulate the tallies, most frequent tag first, so the bars come out ordered.
pos_df = pd.DataFrame(list(pos_counts.items()), columns=['pos', 'count'])
pos_df = pos_df.sort_values(by='count', ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=pos_df, x='pos', y='count', order=pos_df['pos'], ax=ax)
ax.set_title('POS tag counts in fake reviews')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Tally every POS tag over all legit reviews (legit_pos holds one tag list per review).
pos_counts_legit = Counter(tag for tags in legit_pos for tag in tags)

# Tabulate the tallies, most frequent tag first, so the bars come out ordered.
pos_df_legit = pd.DataFrame(list(pos_counts_legit.items()), columns=['pos', 'count'])
pos_df_legit = pos_df_legit.sort_values(by='count', ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=pos_df_legit, x='pos', y='count', order=pos_df_legit['pos'], ax=ax)
ax.set_title('POS tag counts in legit reviews')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Express each tag's tally as a share (0-100) of all tagged tokens in fake reviews.
total_fake_pos = pos_df['count'].sum()
fake_pos_pie_plot_percentages = pos_df.assign(
    percentage=pos_df['count'] / total_fake_pos * 100
)

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    fake_pos_pie_plot_percentages['percentage'],
    labels=fake_pos_pie_plot_percentages['pos'],
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('POS tag distribution in fake reviews')
plt.show()

In [None]:
# Express each tag's tally as a share (0-100) of all tagged tokens in legit reviews.
total_legit_pos = pos_df_legit['count'].sum()
legit_pos_pie_plot_percentages = pos_df_legit.assign(
    percentage=pos_df_legit['count'] / total_legit_pos * 100
)

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    legit_pos_pie_plot_percentages['percentage'],
    labels=legit_pos_pie_plot_percentages['pos'],
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('POS tag distribution in legit reviews')
plt.show()

In [None]:
# Number of PRON-tagged tokens in each review, one integer per review.
legit_pronouns = [sum(tag == 'PRON' for tag in tags) for tags in legit_pos]
fake_pronouns = [sum(tag == 'PRON' for tag in tags) for tags in fake_pos]

# Overlay both density estimates on one axis so the classes can be compared.
fig, ax = plt.subplots(figsize=(10, 5))
shared_kde_style = dict(fill=True, alpha=0.5)
sns.kdeplot(legit_pronouns, label='Legit Reviews', color='blue', ax=ax, **shared_kde_style)
sns.kdeplot(fake_pronouns, label='Fake Reviews', color='red', ax=ax, **shared_kde_style)
ax.set_title('Distribution of Pronoun Counts in Reviews')
ax.set_xlabel('Number of Pronouns')
ax.set_ylabel('Density')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Frequency of each lower-cased pronoun surface form across the fake reviews.
fake_pronoun_counts = Counter(
    token.text.lower()
    for doc in fake_info_row
    for token in doc
    if token.pos_ == 'PRON'
)

print("Most common pronouns in fake reviews:")
print(fake_pronoun_counts.most_common(10))

In [None]:
# Frequency of each lower-cased pronoun surface form across the legit reviews.
real_pronoun_counts = Counter(
    token.text.lower()
    for doc in legit_info_row
    for token in doc
    if token.pos_ == 'PRON'
)
print("Most common pronouns in legit reviews:")
print(real_pronoun_counts.most_common(10))

In [None]:
# Lower-cased text of every NOUN token in the legit reviews, in document order.
legit_top_5_words_no_pron_only_nouns_from_legit_reviews = [
    token.text.lower()
    for doc in legit_info_row
    for token in doc
    if token.pos_ == 'NOUN'
]
legit_noun_counts = Counter(legit_top_5_words_no_pron_only_nouns_from_legit_reviews)
print("Most common nouns in legit reviews:")
print(legit_noun_counts.most_common(10))

# Same extraction for the fake reviews.
fake_top_5_words_no_pron_only_nouns_from_fake_reviews = [
    token.text.lower()
    for doc in fake_info_row
    for token in doc
    if token.pos_ == 'NOUN'
]
fake_noun_counts = Counter(fake_top_5_words_no_pron_only_nouns_from_fake_reviews)
print("Most common nouns in fake reviews:")
print(fake_noun_counts.most_common(10))

In [None]:
# Look for legit reviews whose text occurs more than once (verbatim copies).
legit_df = dataset.loc[dataset['legit'] == True].copy()

# How often each distinct text occurs among the legit reviews.
dup_counts_legit = legit_df['review'].value_counts()

# Keep only the texts seen at least twice.
repeated_legit_counts = dup_counts_legit[dup_counts_legit > 1]
dup_texts_legit = repeated_legit_counts.index

# Every row carrying a repeated text, annotated with how often that text occurs.
is_repeated_row = legit_df['review'].isin(dup_texts_legit)
copied_and_pasted_legit_reviews = legit_df[is_repeated_row].copy()
copied_and_pasted_legit_reviews['duplicate_count'] = (
    copied_and_pasted_legit_reviews['review'].map(dup_counts_legit)
)

# Summary figures.
print(f"Unique duplicated legit review texts: {len(dup_texts_legit)}")
print(f"Total legit review rows that are duplicates: {len(copied_and_pasted_legit_reviews)}")

# The ten most repeated texts, then a preview of the affected rows.
print("\nTop duplicated legit review texts (text -> count):")
print(repeated_legit_counts.head(10))

copied_and_pasted_legit_reviews.sort_values(by='duplicate_count', ascending=False).head(20)

In [None]:
# Look for fake reviews whose text occurs more than once (verbatim copies).
fake_df = dataset.loc[dataset['legit'] == False].copy()

# How often each distinct text occurs, restricted to texts seen at least twice.
fake_dup_counts = fake_df['review'].value_counts()
repeated_fake_counts = fake_dup_counts[fake_dup_counts > 1]
fake_dup_texts = repeated_fake_counts.index

# Every row carrying a repeated text, annotated with how often that text occurs.
is_repeated_row = fake_df['review'].isin(fake_dup_texts)
copied_and_pasted_fake_reviews = fake_df[is_repeated_row].copy()
copied_and_pasted_fake_reviews['duplicate_count'] = (
    copied_and_pasted_fake_reviews['review'].map(fake_dup_counts)
)

print(f"Unique duplicated fake review texts: {len(fake_dup_texts)}")
print(f"Total fake review rows that are duplicates: {len(copied_and_pasted_fake_reviews)}")
print("\nTop duplicated fake review texts (text -> count):")
print(repeated_fake_counts.head(10))
copied_and_pasted_fake_reviews.sort_values(by='duplicate_count', ascending=False).head(20)

In [None]:
import textdescriptives as td

# Readability metrics for the fake reviews.  The result gets its own name so
# that a later cell assigning `readability_df` cannot silently replace it.
fake_readability_df = td.extract_metrics(
    text=fake_reviews,
    spacy_model="en_core_web_sm",
    metrics=["readability"],
)
# Keep the generic name bound for any cell that still reads it.
readability_df = fake_readability_df

print(fake_readability_df.describe())

In [None]:
import textdescriptives as td

# Readability metrics for the legit reviews.  The result gets its own name so
# it stays distinguishable from readability results computed for other subsets.
legit_readability_df = td.extract_metrics(
    text=legit_reviews,
    spacy_model="en_core_web_sm",
    metrics=["readability"],
)
# Keep the generic name bound for any cell that still reads it.
readability_df = legit_readability_df

print(legit_readability_df.describe())

In [None]:
def _verbs_per_sentence_x100(doc):
    """Return (VERB tokens in `doc`) / (sentences in `doc`) * 100.

    Returns 0 when the doc has no sentences.
    NOTE(review): the variables and plot labels call this a "percentage", but
    it is a verbs-per-sentence rate scaled by 100 and can exceed 100 —
    confirm this is the intended metric.
    """
    sentence_total = sum(1 for _ in doc.sents)
    if not sentence_total:
        return 0
    verb_total = len([token for token in doc if token.pos_ == 'VERB'])
    return verb_total / sentence_total * 100


# One value per review, for each class.
legit_review_verb_all_tense_percentages_per_sentence = [
    _verbs_per_sentence_x100(doc) for doc in legit_info_row
]
fake_review_verb_all_tense_percentages_per_sentence = [
    _verbs_per_sentence_x100(doc) for doc in fake_info_row
]

# Overlay both density estimates on one axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(legit_review_verb_all_tense_percentages_per_sentence, ax=ax,
            label='Legit Reviews', color='blue', fill=True, alpha=0.5)
sns.kdeplot(fake_review_verb_all_tense_percentages_per_sentence, ax=ax,
            label='Fake Reviews', color='red', fill=True, alpha=0.5)
ax.set_title('Distribution of Verb Counts per Sentence in Reviews')
ax.set_xlabel('Percentage of Verbs per Sentence')
ax.set_ylabel('Density')
ax.legend()
fig.tight_layout()
plt.show()

# Plain arithmetic means over the reviews of each class.
mean_legit_verbs_per_sentence = (
    sum(legit_review_verb_all_tense_percentages_per_sentence)
    / len(legit_review_verb_all_tense_percentages_per_sentence)
)
mean_fake_verbs_per_sentence = (
    sum(fake_review_verb_all_tense_percentages_per_sentence)
    / len(fake_review_verb_all_tense_percentages_per_sentence)
)
print(f"Mean percentage of verbs per sentence in legit reviews: {mean_legit_verbs_per_sentence}")
print(f"Mean percentage of verbs per sentence in fake reviews: {mean_fake_verbs_per_sentence}")


In [None]:
# Text of every punctuation token, one list per review.  Despite the
# "_per_sentence" suffix in the names, the grouping is per review (per Doc).
legit_review_punctuaction_marks_type_per_sentence = [
    [token.text for token in doc if token.is_punct] for doc in legit_info_row
]
fake_review_punctuaction_marks_type_per_sentence = [
    [token.text for token in doc if token.is_punct] for doc in fake_info_row
]

# Number of punctuation tokens in each review.
legit_punct_totals = [len(marks) for marks in legit_review_punctuaction_marks_type_per_sentence]
fake_punct_totals = [len(marks) for marks in fake_review_punctuaction_marks_type_per_sentence]

# Overlay both density estimates on one axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(legit_punct_totals, ax=ax, label='Legit Reviews', color='blue', fill=True, alpha=0.5)
sns.kdeplot(fake_punct_totals, ax=ax, label='Fake Reviews', color='red', fill=True, alpha=0.5)
ax.set_title('Distribution of Punctuation Marks per Review')
ax.set_xlabel('Number of Punctuation Marks')
ax.set_ylabel('Density')
ax.legend()
fig.tight_layout()
plt.show()

# Plain arithmetic means over the reviews of each class.
mean_legit_punctuation_marks = sum(legit_punct_totals) / len(legit_review_punctuaction_marks_type_per_sentence)
mean_fake_punctuation_marks = sum(fake_punct_totals) / len(fake_review_punctuaction_marks_type_per_sentence)
print(f"Mean number of punctuation marks in legit reviews: {mean_legit_punctuation_marks}")
print(f"Mean number of punctuation marks in fake reviews: {mean_fake_punctuation_marks}")

In [None]:
def _money_mention_pct(doc):
    """Return the share (0-100) of tokens in `doc` that mention money.

    A token counts when its text contains '$' or, case-insensitively, the
    substring 'dollar'.  Returns 0 for a doc with no tokens.

    No part-of-speech filter is applied.  The previous `pos_ == 'NUM'`
    requirement contradicted the text test: spaCy's English pipeline normally
    emits '$' as its own SYM token (split off the amount) and tags
    'dollar(s)' as NOUN, so the combined condition matched essentially nothing.
    """
    total_tokens = len(doc)
    if total_tokens == 0:
        return 0
    money_count = sum(
        1 for token in doc
        if '$' in token.text or 'dollar' in token.text.lower()
    )
    return money_count / total_tokens * 100


# One value per legit review, plotted on its own figure.
legit_review_money_mentions_percentages = [_money_mention_pct(doc) for doc in legit_info_row]
plt.figure(figsize=(10, 5))
sns.kdeplot(legit_review_money_mentions_percentages, label='Legit Reviews', fill=True,
            color='blue', alpha=0.5)
plt.title('Distribution of Money Mentions Percentage in Legit Reviews')
plt.xlabel('Percentage of Money Mentions')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()

# One value per fake review, plotted on its own figure.
fake_review_money_mentions_percentages = [_money_mention_pct(doc) for doc in fake_info_row]
plt.figure(figsize=(10, 5))
sns.kdeplot(fake_review_money_mentions_percentages, label='Fake Reviews', fill=True,
            color='red', alpha=0.5)
plt.title('Distribution of Money Mentions Percentage in Fake Reviews')
plt.xlabel('Percentage of Money Mentions')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()

# Plain arithmetic means over the reviews of each class.
mean_legit_money_mentions = sum(legit_review_money_mentions_percentages) / len(legit_review_money_mentions_percentages)
mean_fake_money_mentions = sum(fake_review_money_mentions_percentages) / len(fake_review_money_mentions_percentages)
print(f"Mean percentage of money mentions in legit reviews: {mean_legit_money_mentions}")
print(f"Mean percentage of money mentions in fake reviews: {mean_fake_money_mentions}")

In [None]:
# Surface forms counted as "agreement" words.  Built once as a set rather
# than re-creating a list for every token.
AGREEMENT_WORDS = frozenset({'agree', 'agreed', 'agreeing', 'agreement', 'agreeable', 'agrees'})


def _count_agreement_words(doc):
    """Return how many tokens of `doc` are, lower-cased, in AGREEMENT_WORDS."""
    return sum(1 for token in doc if token.text.lower() in AGREEMENT_WORDS)


# One count per fake review, plotted on its own figure.
fake_review_agreement_words_counts = [_count_agreement_words(doc) for doc in fake_info_row]
plt.figure(figsize=(10, 5))
sns.kdeplot(fake_review_agreement_words_counts, label='Fake Reviews', fill=True,
            color='red', alpha=0.5)
plt.title('Distribution of Agreement Words Counts in Fake Reviews')
plt.xlabel('Number of Agreement Words')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()

# One count per legit review.  fill=True is passed here as well so both
# figures are drawn the same way (it was missing for the legit plot only).
legit_review_agreement_words_counts = [_count_agreement_words(doc) for doc in legit_info_row]
plt.figure(figsize=(10, 5))
sns.kdeplot(legit_review_agreement_words_counts, label='Legit Reviews', fill=True,
            color='blue', alpha=0.5)
plt.title('Distribution of Agreement Words Counts in Legit Reviews')
plt.xlabel('Number of Agreement Words')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()

# Plain arithmetic means over the reviews of each class.
mean_fake_agreement_words = sum(fake_review_agreement_words_counts) / len(fake_review_agreement_words_counts)
mean_legit_agreement_words = sum(legit_review_agreement_words_counts) / len(legit_review_agreement_words_counts)
print(f"Mean number of agreement words in fake reviews: {mean_fake_agreement_words}")
print(f"Mean number of agreement words in legit reviews: {mean_legit_agreement_words}")

In [None]:
from collections import defaultdict


# Every parsed review, legit ones first.
all_docs = [*legit_info_row, *fake_info_row]

# Walk all tokens once, collecting the vocabularies that define the columns:
# POS tags, punctuation token texts, and the Tense values found on VERB tokens.
seen_pos = set()
seen_punct = set()
seen_tenses = set()
for doc in all_docs:
    for token in doc:
        seen_pos.add(token.pos_)
        if token.is_punct:
            seen_punct.add(token.text)
        if token.pos_ == 'VERB':
            seen_tenses.update(token.morph.get('Tense'))

pos_tags = sorted(seen_pos)
punct_types = sorted(seen_punct)
all_tenses = sorted(seen_tenses)

# Set versions for constant-time membership tests during extraction.
punct_types_set = set(punct_types)
all_tenses_set = set(all_tenses)


def _count_and_ratio(stem):
    """Return the two column names derived from one feature stem."""
    return [f'{stem}_count', f'{stem}_ratio_sent']


# Column order: money, pronoun, verb, then tenses, POS tags and punctuation
# (each group in sorted order).  Spaces in a punctuation text become '_'.
feature_stems = ['money', 'pronoun', 'verb']
feature_stems += [f'verb_tense_{tense}' for tense in all_tenses]
feature_stems += [f'pos_{tag}' for tag in pos_tags]
feature_stems += ['punct_' + sym.replace(' ', '_') for sym in punct_types]

feature_cols = [col for stem in feature_stems for col in _count_and_ratio(stem)]


def extract_doc_features_fast(doc):
    """Compute per-review count and per-sentence ratio features for one Doc.

    For every feature key (``pos_<TAG>``, ``punct_<symbol>``, ``verb``,
    ``verb_tense_<Tense>``, ``money``, ``pronoun``) seen in the doc, two
    entries are produced:

    * ``<key>_count``      - total occurrences over the whole doc;
    * ``<key>_ratio_sent`` - mean, over the doc's non-empty sentences, of
      (occurrences in the sentence / non-space tokens in the sentence).

    Parameters
    ----------
    doc : spaCy Doc (anything exposing ``.sents`` whose tokens provide
        ``text``, ``pos_``, ``is_space``, ``is_punct`` and ``morph``).

    Returns
    -------
    dict
        Feature name -> value.  Only features that occur in the doc are
        present; the dict is empty when the doc has no sentence containing a
        non-space token.

    Notes
    -----
    Reads the module-level ``punct_types_set`` and ``all_tenses_set``.
    Money mentions are detected from the token text alone ('$' or 'dollar',
    case-insensitive).  The earlier extra requirement ``pos_ == 'NUM'``
    excluded the very tokens the text test targets, because spaCy's English
    pipeline normally emits '$' as a separate SYM token and tags 'dollar(s)'
    as NOUN.
    """
    total_counts = defaultdict(int)
    ratio_sums = defaultdict(float)
    valid_sent_count = 0

    for sent in doc.sents:
        # Whitespace tokens are ignored both as features and in the sentence length.
        tokens = [t for t in sent if not t.is_space]
        if not tokens:
            continue

        valid_sent_count += 1
        inv_sent_len = 1.0 / len(tokens)

        # Counters local to this sentence; folded into the doc totals below.
        sent_counts = defaultdict(int)

        for t in tokens:
            # POS tag of every token.
            sent_counts[f'pos_{t.pos_}'] += 1

            # Punctuation, keyed by the token text with spaces replaced by '_'.
            if t.is_punct and t.text in punct_types_set:
                safe_sym = t.text.replace(' ', '_')
                sent_counts[f'punct_{safe_sym}'] += 1

            # Verbs, plus one counter per Tense value on the verb.
            if t.pos_ == 'VERB':
                sent_counts['verb'] += 1
                for tense in t.morph.get('Tense'):
                    if tense in all_tenses_set:
                        sent_counts[f'verb_tense_{tense}'] += 1

            # Money mentions (text-based; see Notes).
            if '$' in t.text or 'dollar' in t.text.lower():
                sent_counts['money'] += 1

            # Pronouns.
            if t.pos_ == 'PRON':
                sent_counts['pronoun'] += 1

        # Fold the sentence into the doc-level accumulators.
        for key, count in sent_counts.items():
            total_counts[key] += count
            ratio_sums[key] += count * inv_sent_len

    if valid_sent_count == 0:
        return {}

    # Emit the total and the sentence-averaged ratio for every observed key.
    features = {}
    for key, count in total_counts.items():
        features[f'{key}_count'] = count
        features[f'{key}_ratio_sent'] = ratio_sums[key] / valid_sent_count

    return features

# 5. Build DataFrames
print("Extracting features...")
legit_features = [extract_doc_features_fast(doc) for doc in legit_info_row]
fake_features = [extract_doc_features_fast(doc) for doc in fake_info_row]


def _to_feature_frame(records, index):
    """Turn per-review feature dicts into a frame with exactly `feature_cols`.

    Each dict only holds the features that occurred in its review, so building
    the frame leaves NaN wherever a feature is absent.  `reindex(fill_value=...)`
    fills only columns it newly creates, not those NaN cells, so an explicit
    `fillna(0.0)` is needed to make "feature absent" read as 0.
    """
    return (
        pd.DataFrame(records, index=index)
        .reindex(columns=feature_cols)
        .fillna(0.0)
    )


# Index by the original dataset index so the rows line up again in the join below.
legit_features_df = _to_feature_frame(legit_features, legit_reviews.index)
fake_features_df = _to_feature_frame(fake_features, fake_reviews.index)

features_df = pd.concat([legit_features_df, fake_features_df]).sort_index()

# Attach review id, text and label, then save.
final_df = dataset[['id_review', 'review', 'legit']].join(features_df)

output_path = './data/yelp_reviews_with_features.csv'
final_df.to_csv(output_path, index=False)

print(f"Saved {len(final_df)} rows with {len(final_df.columns)} columns to {output_path}")