In [1]:
# Importing Packages
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns


In [16]:
# Importing Data with Features
# The project root defaults to the original author's checkout. Set the
# ONLINE_REVIEW_ROOT environment variable to run this notebook on another
# machine without editing the cell.
PROJECT_ROOT = Path(os.environ.get('ONLINE_REVIEW_ROOT', '/Users/paulahofmann/Documents/Coding/Online-Review'))
FINAL_DATA_DIR = PROJECT_ROOT / 'FeaturePreperation' / 'Data_with_Features' / 'Final Data'

# One CSV per product type; both are read into DataFrames
data_hedonic = pd.read_csv(FINAL_DATA_DIR / 'Hedonic_Final.csv')
data_utilitarian = pd.read_csv(FINAL_DATA_DIR / 'Utilitarian_Final.csv')

In [17]:
# Importing statistical data
# The project root defaults to the original author's checkout. Set the
# ONLINE_REVIEW_ROOT environment variable to run this notebook on another
# machine without editing the cell.
PROJECT_ROOT = Path(os.environ.get('ONLINE_REVIEW_ROOT', '/Users/paulahofmann/Documents/Coding/Online-Review'))

# Summary statistics CSV stored under the project's analysis results folder
data_prod_stat = pd.read_csv(PROJECT_ROOT / 'Data Analysis' / 'Results' / 'Helpful' / 'Summary_Stats_ProdType_mean.csv')

In [18]:
# Stack the hedonic and utilitarian frames row-wise into a single frame.
# ignore_index is left at its default, so each input keeps its own row labels.
data = pd.concat(objs=[data_hedonic, data_utilitarian], axis=0)

In [19]:
# Drop every review whose helpful_vote is exactly 0.
# .copy() turns the filtered result into an independent frame, so the column
# that is assigned onto `data` further down in this notebook is written to a
# real frame rather than to a slice of the concatenated one (the situation
# pandas flags with SettingWithCopyWarning).
data = data[data['helpful_vote'] != 0].copy()


In [20]:
# Show the column labels of the filtered frame
column_labels = data.columns
print(column_labels)

Index(['rating', 'title_x', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'text_cleaned',
       'text_cleaned1', 'sentiment', 'main_category', 'prod_title',
       'average_rating', 'rat_count', 'features', 'price', 'helpful_ratio',
       'noun_count', 'adj_count', 'adv_count', 'word_count', 'sent_count',
       'sent_length', 'title_length', 'FRE', 'review_ext', 'elap_days',
       'image', 'year', 'month', 'hour', 'day_of_week', 'is_weekend',
       'product', 'ver_purch', '#nouns', '#adj', '#adv', 'subjective_score',
       'neutral_score', 'prod_type', 'Sentiment_Classification',
       'total_helpful_votes'],
      dtype='object')


In [21]:
# Distinct main_category values found in the filtered frame
unique_products = data['main_category'].unique()

# Heading first, then one category per line (nothing more when there are none)
print("Unique Products:")
if len(unique_products) > 0:
    print(*unique_products, sep="\n")

Unique Products:
Video Games
Beauty
Music
Household
Electronics
Appliances


In [22]:
# Per-category subsets of `data`, keyed by the main_category label.
# NOTE(review): 'Grocery' and 'Personal Care' are not among the unique
# main_category values printed earlier in this notebook, so those two entries
# are presumably empty frames; confirm whether they are still needed.
product_names = (
    'Video Games', 'Beauty', 'Grocery', 'Music',
    'Electronics', 'Appliances', 'Personal Care', 'Household',
)
products = {
    name: data[data['main_category'] == name]
    for name in product_names
}

In [23]:
# Numeric codes for the three sentiment labels (negative=0, neutral=1, positive=2)
sentiment_mapping = dict(positive=2, neutral=1, negative=0)

# Add a numeric 'Sentiment' column derived from Sentiment_Classification;
# Series.map leaves NaN for any label that is not a key of the mapping.
sentiment_labels = data['Sentiment_Classification']
data['Sentiment'] = sentiment_labels.map(sentiment_mapping)


In [24]:
# Columns of `data` that are compared across product categories below
features = [
    'rating',
    'review_ext',
    'Sentiment',
    'subjective_score',
    'word_count',
    'sent_count',
    'sent_length',
    'title_length',
    'FRE',
    '#adj',
    '#adv',
    '#nouns',
    'elap_days',
]

In [25]:
from scipy.stats import f_oneway


# Perform a one-way ANOVA for each feature, with the main_category values as groups.
# The category list and the per-category row masks do not depend on the
# feature, so they are built once here instead of once per feature.
categories = data['main_category'].unique()
category_masks = [data['main_category'] == category for category in categories]

for feature in features:
    # Drop missing values inside each group: a single NaN would otherwise make
    # f_oneway return NaN, the `< 0.05` comparison would be False, and the
    # feature would disappear from the report without any hint.
    groups = [data.loc[mask, feature].dropna() for mask in category_masks]

    f_stat, p_value = f_oneway(*groups)
    if p_value < 0.05:  # Considering statistical significance at alpha = 0.05
        print(f"Feature: {feature}")
        print(f"F-statistic: {f_stat}, p-value: {p_value}")
        print()

Feature: rating
F-statistic: 180.00500009033212, p-value: 2.3640081433353958e-179

Feature: review_ext
F-statistic: 204.14596755927553, p-value: 5.933902039652506e-202

Feature: Sentiment
F-statistic: 199.01049755633738, p-value: 3.546248557786319e-197

Feature: subjective_score
F-statistic: 18.090711209507568, p-value: 7.342293778216329e-18

Feature: word_count
F-statistic: 42.043010770168486, p-value: 1.0199055479691973e-42

Feature: sent_count
F-statistic: 26.806001568854164, p-value: 6.605175252233679e-27

Feature: sent_length
F-statistic: 20.07874170682117, p-value: 6.39452212618974e-20

Feature: title_length
F-statistic: 10.492299110424309, p-value: 4.790028017735516e-10

Feature: FRE
F-statistic: 42.74880435904783, p-value: 1.8974462352129058e-43

Feature: #adj
F-statistic: 35.257800438093405, p-value: 1.0992540863535641e-35

Feature: #adv
F-statistic: 11.97657901668187, p-value: 1.464813590320785e-11

Feature: #nouns
F-statistic: 10.999770828447259, p-value: 1.457167975548254e-

In [27]:
# Collect one record per significant feature: group means, F-statistic, p-value
results = []

# The category list and the per-category row masks do not depend on the
# feature, so they are built once here instead of several times per feature.
categories = data['main_category'].unique()
category_masks = [data['main_category'] == category for category in categories]

# Perform ANOVA for each feature
for feature in features:
    # Drop missing values inside each group: a single NaN would otherwise make
    # f_oneway return NaN and the feature would silently be left out of the
    # table. The group means are unaffected because Series.mean skips NaN.
    groups = [data.loc[mask, feature].dropna() for mask in category_masks]

    f_stat, p_value = f_oneway(*groups)

    if p_value < 0.05:  # Considering statistical significance at alpha = 0.05
        result = {'Feature': feature}

        # `groups` was built in the same order as `categories`, so zip pairs
        # every mean with its own category label.
        for category, group in zip(categories, groups):
            result[f'Mean_{category}'] = group.mean()

        result['F-statistic'] = f_stat
        result['p-value'] = p_value

        results.append(result)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

             Feature  Mean_Video Games  Mean_Beauty  Mean_Music  \
0             rating          3.758315     4.029299    2.788835   
1         review_ext         -1.041685    -0.511635   -1.854369   
2          Sentiment          1.294900     1.485775    0.936893   
3   subjective_score          0.807833     0.822641    0.834365   
4         word_count         69.121951    50.067516   59.092233   
5         sent_count          4.075388     3.705732    4.555825   
6        sent_length         17.138913    15.415137   14.342856   
7       title_length          1.492239     1.264544    1.771845   
8                FRE         82.037450    83.340144   83.102015   
9               #adj          0.083032     0.101336    0.102463   
10              #adv          0.063625     0.073082    0.068645   
11            #nouns          0.202094     0.181166    0.168750   
12         elap_days       1012.689579  1928.230573  805.851942   

    Mean_Household  Mean_Electronics  Mean_Appliances  F-stat