In [None]:
import pandas as pd
# Load the processed Yelp review data; every later cell reads from `dataset`.
dataset_path = './data/yelp_dataset_processed.csv'
dataset = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
dataset.head(n=5)

In [None]:
# Class balance: how many rows carry each value of the `legit` flag.
dataset.loc[:, 'legit'].value_counts()

In [None]:
# Mean sentiment scores, computed separately for fake and legit reviews.
# The row masks are built once and reused for all three score columns.
is_fake = dataset['legit'] == False
is_legit = dataset['legit'] == True
sentiment_cols = ('positive', 'negative', 'neutral')

positive_in_fake, negative_in_fake, neutral_in_fake = (
    dataset.loc[is_fake, col].mean() for col in sentiment_cols
)
print(f"Average sentiment scores in fake reviews - Positive: {positive_in_fake}, Negative: {negative_in_fake}, Neutral: {neutral_in_fake}")

positive_in_legit, negative_in_legit, neutral_in_legit = (
    dataset.loc[is_legit, col].mean() for col in sentiment_cols
)
print(f"Average sentiment scores in legit reviews - Positive: {positive_in_legit}, Negative: {negative_in_legit}, Neutral: {neutral_in_legit}")


In [None]:
import matplotlib.pyplot as plt

# Pie charts of the mean sentiment scores, one figure per review class.
# The slice order below must match the order of `labels`.
labels = ('Positive', 'Neutral', 'Negative')
sizes_fake = [positive_in_fake, neutral_in_fake, negative_in_fake]
sizes_legit = [positive_in_legit, neutral_in_legit, negative_in_legit]

# Fake reviews
fig, ax = plt.subplots()
ax.pie(sizes_fake, autopct='%1.1f%%', labels=labels)
ax.set_title('Sentiment Distribution in Fake Reviews')

# Legit reviews
fig, ax = plt.subplots()
ax.pie(sizes_legit, autopct='%1.1f%%', labels=labels)
ax.set_title('Sentiment Distribution in Legit Reviews')


In [None]:
# Bar charts of how many reviews received each rating, one figure per class.
# Counts are sorted by rating value so the bars appear in rating order.

# Fake reviews
ratings_fake = dataset.loc[dataset['legit'] == False, 'rating'].value_counts().sort_index()
fig, ax = plt.subplots()
ax.bar(x=ratings_fake.index, height=ratings_fake.values)
ax.set(xlabel='Rating Fake', ylabel='Number of Reviews')

# Legit reviews
ratings_legit = dataset.loc[dataset['legit'] == True, 'rating'].value_counts().sort_index()
fig, ax = plt.subplots()
ax.bar(x=ratings_legit.index, height=ratings_legit.values)
ax.set(xlabel='Rating Legit')
ax.set_ylabel('Number of Reviews')

In [None]:
# Summary statistics of the `bot` / `no_bot` scores for fake reviews.
is_fake = dataset['legit'] == False

print("Statistics for bot scores in fake reviews:")
print(dataset.loc[is_fake, 'bot'].describe())
print("Statistics for no_bot scores in fake reviews:")
# Left as the cell's final expression so the notebook displays it.
dataset.loc[is_fake, 'no_bot'].describe()

In [None]:
# Summary statistics of the `bot` / `no_bot` scores for legit reviews.
is_legit = dataset['legit'] == True

print("Statistics for bot scores in legit reviews:")
print(dataset.loc[is_legit, 'bot'].describe())
print("Statistics for no_bot scores in legit reviews:")
# Left as the cell's final expression so the notebook displays it.
dataset.loc[is_legit, 'no_bot'].describe()

In [None]:
# Summary statistics of the `spam` / `no_spam` scores for fake reviews.
is_fake = dataset['legit'] == False

print("Statistics for spam scores in fake reviews:")
print(dataset.loc[is_fake, 'spam'].describe())
print("Statistics for no_spam scores in fake reviews:")
# Left as the cell's final expression so the notebook displays it.
dataset.loc[is_fake, 'no_spam'].describe()

In [None]:
# Summary statistics of the `spam` / `no_spam` scores for legit reviews.
is_legit = dataset['legit'] == True

print("Statistics for spam scores in legit reviews:")
print(dataset.loc[is_legit, 'spam'].describe())
print("Statistics for no_spam scores in legit reviews:")
# Left as the cell's final expression so the notebook displays it.
dataset.loc[is_legit, 'no_spam'].describe()

In [None]:
# Summary statistics of the `subjectivity` score, fake vs. legit reviews.
subjectivity = dataset['subjectivity']

print("Statistics for subjectivity in fake reviews:")
print(subjectivity[dataset['legit'] == False].describe())
print("Statistics for subjectivity in legit reviews:")
# Left as the cell's final expression so the notebook displays it.
subjectivity[dataset['legit'] == True].describe()

In [None]:
# Derive two length features per review and compare their means by class:
#   char_len   - number of characters in the review text
#   word_count - number of whitespace-separated tokens in the review text
# Both columns are added to `dataset` because later cells read them.
review_text = dataset['review']
dataset['char_len'] = review_text.str.len()
dataset['word_count'] = review_text.str.split().str.len()

is_fake = dataset['legit'] == False
is_legit = dataset['legit'] == True

mean_char_fake = dataset['char_len'][is_fake].mean()
mean_char_legit = dataset['char_len'][is_legit].mean()
mean_words_fake = dataset['word_count'][is_fake].mean()
mean_words_legit = dataset['word_count'][is_legit].mean()

print(f"Mean review length (chars) - Fake: {mean_char_fake:.1f}, Legit: {mean_char_legit:.1f}")
print(f"Mean review length (words) - Fake: {mean_words_fake:.1f}, Legit: {mean_words_legit:.1f}")

In [None]:
# Box plot comparing the word-count distribution of legit vs. fake reviews.
box_plot_with_outliers_legit_review_length = dataset.loc[dataset['legit'] == True, 'word_count']
box_plot_with_outliers_fake_review_length = dataset.loc[dataset['legit'] == False, 'word_count']

# Rows with missing review text have a NaN word count. Matplotlib's box
# statistics are percentile-based, so a single NaN in a group would leave
# that group's box empty; drop missing values before plotting.
data_to_plot = [
    box_plot_with_outliers_legit_review_length.dropna(),
    box_plot_with_outliers_fake_review_length.dropna(),
]

fig, ax = plt.subplots()
# NOTE: `tick_labels` needs Matplotlib 3.9+; earlier releases call it `labels`.
ax.boxplot(data_to_plot, tick_labels=['Legit Reviews', 'Fake Reviews'])
ax.set_title('Review Lengths (Word Count) Distribution')
ax.set_ylabel('Word Count')
plt.show()

In [None]:
# Box plot comparing the character-count distribution of legit vs. fake reviews.
fake_review_length_char = dataset.loc[dataset['legit'] == False, 'char_len']
legit_review_length_char = dataset.loc[dataset['legit'] == True, 'char_len']

# Rows with missing review text have a NaN character count. Matplotlib's box
# statistics are percentile-based, so a single NaN in a group would leave
# that group's box empty; drop missing values before plotting.
# Order (legit first, fake second) must match the tick labels below.
data_to_plot_char = [
    legit_review_length_char.dropna(),
    fake_review_length_char.dropna(),
]

fig, ax = plt.subplots()
# NOTE: `tick_labels` needs Matplotlib 3.9+; earlier releases call it `labels`.
ax.boxplot(data_to_plot_char, tick_labels=['Legit Reviews', 'Fake Reviews'])
ax.set_title('Review Lengths (Character Count) Distribution')
ax.set_ylabel('Character Count')
plt.show()

In [None]:
# Add the average word length per review (characters per word).
# Zero word counts are masked to NaN with .where() so the division yields NaN
# rather than inf, and the result stays a float column. Replacing 0 with pd.NA
# instead upcasts the divisor to object dtype whenever a zero is present,
# which turns `avg_word_length` into an object column.
word_counts = dataset['word_count']
dataset['avg_word_length'] = dataset['char_len'] / word_counts.where(word_counts != 0)

# Column order for the export: the base columns first (any that are absent
# from the dataset are skipped), then every remaining feature column.
base_cols = ['id_review', 'review', 'legit', 'char_len', 'word_count', 'avg_word_length']
base_cols = [c for c in base_cols if c in dataset.columns]
other_cols = [c for c in dataset.columns if c not in base_cols]
features_df = dataset[base_cols + other_cols]

# Save the extracted features to a new CSV.
features_path = './data/yelp_reviews_extracted_features.csv'
features_df.to_csv(features_path, index=False)

# Merge with the existing features file from the other notebook on id_review.
other_path = './data/yelp_reviews_with_features.csv'
other_df = pd.read_csv(other_path)

# NOTE(review): validate="many_to_many" performs no key-uniqueness check, so
# duplicated id_review values on either side silently multiply rows. If
# id_review is meant to be unique, "one_to_one" would enforce it - confirm
# against the data before tightening.
# NOTE(review): columns present in both frames (other than id_review) come out
# with pandas' default _x/_y suffixes - confirm that is intended.
merged_df = pd.merge(other_df, features_df, on='id_review', how='inner', validate="many_to_many")
merged_path = './data/yelp_reviews_merged_features.csv'
merged_df.to_csv(merged_path, index=False)

# Display the two output paths as the cell result.
features_path, merged_path