In [None]:
import pandas as pd
# Location of the processed Yelp dataset CSV, relative to the notebook's working directory.
DATASET_CSV = './data/yelp_dataset_processed.csv'

# Load the full table; later cells read its 'legit' and 'review' columns.
dataset = pd.read_csv(DATASET_CSV)

In [None]:
# Split the review text into two Series according to the 'legit' flag.
# The explicit `== False` / `== True` comparisons are intentional: a row whose
# flag is neither value (e.g. missing) ends up in neither subset.
legit_flag = dataset['legit']
fake_reviews = dataset.loc[legit_flag == False, 'review']
legit_reviews = dataset.loc[legit_flag == True, 'review']

In [None]:
import spacy
def pos_tag_sequences(texts, pipeline, disable=("ner",)):
    """Return one list of coarse POS tags (``token.pos_``) per input text.

    Args:
        texts: Iterable of strings to run through the pipeline.
        pipeline: Loaded spaCy ``Language`` object whose ``pipe`` method is used.
        disable: Names of pipeline components to skip while processing.

    Returns:
        A list with one entry per processed text, each entry being the list of
        ``token.pos_`` strings in token order.
    """
    return [
        [token.pos_ for token in doc]
        for doc in pipeline.pipe(texts, disable=list(disable))
    ]


nlp = spacy.load("en_core_web_trf")

# Only token.pos_ is read from the parsed docs, so the NER component is skipped.
# Both corpora go through the same helper so they are tagged identically.
fake_pos = pos_tag_sequences(fake_reviews, nlp)
legit_pos = pos_tag_sequences(legit_reviews, nlp)

In [None]:
from collections import Counter
import seaborn as sns

import matplotlib.pyplot as plt

# Tally every POS tag across all fake-review tag sequences (fake_pos is a list of lists).
pos_counts = Counter(tag for tags in fake_pos for tag in tags)

# One row per tag, most frequent first, so the bars are drawn in descending order.
pos_df = (
    pd.DataFrame(list(pos_counts.items()), columns=['pos', 'count'])
    .sort_values('count', ascending=False)
)

# Bar chart of tag frequencies; `order` pins the x-axis to the sorted tag order.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=pos_df, x='pos', y='count', order=pos_df['pos'], ax=ax)
ax.set_title('POS tag counts in fake reviews')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Tally every POS tag across all legit-review tag sequences (legit_pos is a list of lists).
pos_counts_legit = Counter(tag for tags in legit_pos for tag in tags)

# One row per tag, most frequent first, so the bars are drawn in descending order.
pos_df_legit = (
    pd.DataFrame(list(pos_counts_legit.items()), columns=['pos', 'count'])
    .sort_values('count', ascending=False)
)

# Bar chart of tag frequencies; `order` pins the x-axis to the sorted tag order.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=pos_df_legit, x='pos', y='count', order=pos_df_legit['pos'], ax=ax)
ax.set_title('POS tag counts in legit reviews')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()