### Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 


### Load Dataset

In [None]:
# Read the tab-separated reviews file into a DataFrame and preview the first rows.
dataset_path = "amazon_reviews.txt"
reviews_df = pd.read_csv(dataset_path, sep="\t")
reviews_df.head()

tar: Old option 'g' requires an argument.
Try 'tar --help' or 'tar --usage' for more information.


In [None]:
# Non-null value count of every column, broken down by the raw LABEL value.
rows_by_label = reviews_df.groupby("LABEL")
rows_by_label.count()

### Convert Label to 0/1

In [None]:
def label_to_int(label):
    """Encode a raw dataset label as a binary fraud flag.

    Returns 0 when the label is "__label2__" (a real review) and 1 for
    every other value (treated as a fake review).
    """
    is_real_review = label == "__label2__"
    return 0 if is_real_review else 1

# Derive the 0/1 FRAUD_LABEL column from the raw LABEL strings, then preview.
raw_labels = reviews_df["LABEL"]
reviews_df["FRAUD_LABEL"] = raw_labels.apply(label_to_int)
reviews_df.head()

### EDA: Review Distribution across Categories

In [None]:
# Reviews per product category, with one column per FRAUD_LABEL value.
category_counts = (
    reviews_df.groupby("FRAUD_LABEL")
    .PRODUCT_CATEGORY.value_counts()
    .unstack(0)
)

# Horizontal bars: categories run along the y-axis, counts along the x-axis.
axes = category_counts.plot.barh()
axes.set_xlabel("# Reviews")
axes.set_ylabel("Product Category")

In [None]:
# Reviews per star rating, with one column per FRAUD_LABEL value.
rating_counts = reviews_df.groupby("FRAUD_LABEL").RATING.value_counts().unstack(0)

# plot.bar() draws vertical bars: the ratings are the x-axis categories and the
# bar heights (y-axis) are review counts, so label the axes accordingly.
axes = rating_counts.plot.bar()
axes.set_xlabel("Rating")
axes.set_ylabel("# Reviews")

In [None]:
# Reviews per VERIFIED_PURCHASE value, with one column per FRAUD_LABEL value.
verified_counts = (
    reviews_df.groupby("FRAUD_LABEL").VERIFIED_PURCHASE.value_counts().unstack(0)
)

# Vertical bars: x-axis is the verified-purchase flag, y-axis is the number of
# reviews (not a rating).
axes = verified_counts.plot.bar()
axes.set_xlabel("Purchase Verified")
axes.set_ylabel("# Reviews")

### Feature Extraction

In [None]:

def review_length(text):
    """Return the number of whitespace-separated words in a review.

    Parameters
    ----------
    text : str or missing value
        The review body. Anything that is not a string (None, or the float
        NaN pandas uses for a missing cell) is treated as an empty review.

    Returns
    -------
    int
        Word count; 0 for missing, empty, or whitespace-only text.
    """
    # A missing cell arrives as float NaN rather than None, and NaN has no
    # .split(), so guard on the type instead of on `is None`.
    if not isinstance(text, str):
        return 0
    # split() with no argument collapses runs of whitespace, so repeated
    # spaces, tabs, and newlines do not produce empty "words".
    return len(text.split())
    


In [None]:
def average_word_length(text):
    """Return the mean number of characters per word in a review.

    Parameters
    ----------
    text : str or missing value
        The review body. Anything that is not a string (None, or the float
        NaN pandas uses for a missing cell) is treated as an empty review.

    Returns
    -------
    float or int
        Average word length; 0 when there are no words.
    """
    # A missing cell arrives as float NaN rather than None, so check the type.
    if not isinstance(text, str):
        return 0

    # split() with no argument drops empty tokens, which would otherwise
    # count as zero-length words and pull the average down.
    words = text.split()
    if not words:
        # Empty or whitespace-only text: avoid dividing by zero.
        return 0

    return sum(len(word) for word in words) / len(words)


In [None]:

import enchant
# Spell-checking dictionary created with the "en_US" tag. It is built once at
# module level and read by count_misspellings through its .check() method.
english_dict = enchant.Dict("en_US")
def count_misspellings(text):
    """Count the words in a review that the spell-checker does not recognise.

    Each whitespace-separated token is stripped of leading and trailing
    punctuation, lower-cased, and looked up with ``english_dict.check``.

    Parameters
    ----------
    text : str or missing value
        The review body. Anything that is not a string (None, or the float
        NaN pandas uses for a missing cell) is treated as an empty review.

    Returns
    -------
    int
        Number of tokens rejected by the dictionary; 0 for missing or empty
        text.
    """
    import string  # local import: only the punctuation table is needed

    # A missing cell arrives as float NaN rather than None, so check the type.
    if not isinstance(text, str):
        return 0

    misspelling = 0
    for token in text.split():
        # Drop punctuation attached to the word ("great!" -> "great") so that
        # it is not reported as a spelling mistake.
        word = token.strip(string.punctuation).lower()
        # Skip tokens that were pure punctuation; the original also never
        # passed an empty string to check().
        if word and not english_dict.check(word):
            misspelling += 1
    return misspelling
    



In [None]:
# Add one numeric feature column per extractor, each computed from REVIEW_TEXT.
feature_extractors = {
    "Review_Text_Length": review_length,
    "Avg_Word_Len": average_word_length,
    "Num_Misspelling": count_misspellings,
}
for feature_name, extractor in feature_extractors.items():
    reviews_df[feature_name] = reviews_df["REVIEW_TEXT"].apply(extractor)

### Visualizing Feature Differences & T-Test

In [None]:
import matplotlib.pyplot as plt

# Separate real and fake reviews
fake_reviews = reviews_df[reviews_df['FRAUD_LABEL'] == 1]['Review_Text_Length'].values
real_reviews = reviews_df[reviews_df['FRAUD_LABEL'] == 0]['Review_Text_Length'].values

# Review length is an integer count, so centre one unit-wide bin on every
# value from 0 to 500 (edges at -0.5, 0.5, ..., 500.5). Evenly spaced
# non-integer edges would put 0 and 1 into the same bin. Reviews longer than
# 500 words fall outside the bins and are not drawn.
max_length = 500
bins = np.arange(max_length + 2) - 0.5

# Plot the two histograms on the same axes
plt.hist(fake_reviews, bins, alpha=0.5, label='Fake')
plt.hist(real_reviews, bins, alpha=0.5, label='Real')

# Label the plot
plt.xlabel("Review Length")
plt.ylabel("# Reviews")
plt.legend()

# Display the plot
plt.show()

In [None]:
from scipy.stats import ttest_ind

# Select the feature here rather than reading fake_reviews / real_reviews:
# those names are rebound to a different feature by another cell, so relying
# on them would make this result depend on the order cells were executed in.
feature = "Review_Text_Length"
fake_values = reviews_df.loc[reviews_df["FRAUD_LABEL"] == 1, feature].values
real_values = reviews_df.loc[reviews_df["FRAUD_LABEL"] == 0, feature].values

# Conduct a two-sample t-test on the fake vs. real groups
t_stat, p_value = ttest_ind(fake_values, real_values)

# Print group means
print("Mean in Fake Reviews: ", np.mean(fake_values))
print("Mean in Real Reviews: ", np.mean(real_values))

# Print t-test statistics
print("T-statistic value: ", t_stat)
print("P-Value: ", p_value)

In [None]:
import matplotlib.pyplot as plt

# Separate real and fake reviews
fake_reviews = reviews_df[reviews_df['FRAUD_LABEL'] == 1]['Num_Misspelling'].values
real_reviews = reviews_df[reviews_df['FRAUD_LABEL'] == 0]['Num_Misspelling'].values

# The misspelling count is an integer, so centre one unit-wide bin on every
# value from 0 to 50 (edges at -0.5, 0.5, ..., 50.5). Evenly spaced
# non-integer edges would merge 0 with 1 and 49 with 50. Reviews with more
# than 50 misspellings fall outside the bins and are not drawn.
max_misspellings = 50
bins = np.arange(max_misspellings + 2) - 0.5

# Plot the two histograms on the same axes
plt.hist(fake_reviews, bins, alpha=0.5, label='Fake')
plt.hist(real_reviews, bins, alpha=0.5, label='Real')

# Label the plot
plt.xlabel("# Misspelt Words")
plt.ylabel("# Reviews")
plt.legend()

# Display the plot
plt.show()

In [None]:
from scipy.stats import ttest_ind

# Select the feature here rather than reading fake_reviews / real_reviews:
# those names are also bound to a different feature by another cell, so
# relying on them would make this result depend on the order cells were
# executed in.
feature = "Num_Misspelling"
fake_values = reviews_df.loc[reviews_df["FRAUD_LABEL"] == 1, feature].values
real_values = reviews_df.loc[reviews_df["FRAUD_LABEL"] == 0, feature].values

# Conduct a two-sample t-test on the fake vs. real groups
t_stat, p_value = ttest_ind(fake_values, real_values)

# Print group means
print("Mean in Fake Reviews: ", np.mean(fake_values))
print("Mean in Real Reviews: ", np.mean(real_values))

# Print t-test statistics
print("T-statistic value: ", t_stat)
print("P-Value: ", p_value)

### OLS Regression

In [None]:
import statsmodels.formula.api as smf

# Regress the 0/1 fraud label on the three text-derived features with
# ordinary least squares, then show the fitted summary table.
text_features_formula = """FRAUD_LABEL ~ Review_Text_Length 
                                          + Num_Misspelling 
                                          + Avg_Word_Len """
text_features_spec = smf.ols(formula=text_features_formula, data=reviews_df)
model = text_features_spec.fit()

print(model.summary())

In [None]:
import statsmodels.formula.api as smf

# Same least-squares regression as the text-feature model, extended with
# RATING and VERIFIED_PURCHASE entered as categorical terms via C(...).
all_features_formula = """FRAUD_LABEL ~ Review_Text_Length 
                                          + Num_Misspelling 
                                          + Avg_Word_Len 
                                          + C(RATING) 
                                          + C(VERIFIED_PURCHASE)"""
all_features_spec = smf.ols(formula=all_features_formula, data=reviews_df)
model = all_features_spec.fit()

print(model.summary())