# Data Analysis

In this section we will have a look at the differences between hedonic- and utilitarian-driven products. We will first see whether we can find any patterns in the data at a descriptive level and then move on to a more inferential level.

## Packages and Importing Data

In [1]:
# Importing packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nltk import bigrams
from wordcloud import WordCloud


In [3]:
# Read the two 'Total_Features' CSV files, one per product type
# (utilitarian / hedonic), into DataFrames.
data_utilitarian = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Total_Features_utilitarian.csv'
)
data_hedonic = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Total_Features_hedonic.csv'
)


In [4]:
# Read the per-product feature CSV files (filter, razor, mouse, chocolate)
# into one DataFrame each.
data_utilitarian_filter = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Features_utilitarian_filter.csv'
)
data_utilitarian_razor = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Features_utilitarian_razor.csv'
)
data_utilitarian_mouse = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Features_utilitarian_mouse.csv'
)
data_hedonic_chocolate = pd.read_csv(
    '/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Features_hedonic_choclate.csv'
)


# Functions for data analysis

In [None]:

# All loaded DataFrames, gathered in one list so they can be processed in a loop
data_frames = [
    data_hedonic,
    data_utilitarian,
    data_utilitarian_mouse,
    data_utilitarian_filter,
    data_utilitarian_razor,
    data_hedonic_chocolate,
]

# Parse each frame's 'timestamp' column into pandas datetimes (written back in place)
for df in data_frames:
    df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sanity check: print the resulting dtype of 'timestamp' for every frame
for i, df in enumerate(data_frames):
    print(f"DataFrame {i+1}:\n{df['timestamp'].dtype}\n")


In [None]:
def plot_reviews_by_year(df, title_suffix=None):
    """Plot a histogram of the number of reviews per calendar year.

    Args:
        df: DataFrame whose ``timestamp`` column is datetime-typed (the
            ``.dt`` accessor is used on it). The frame is not modified.
        title_suffix: Optional text appended to the plot title.

    Raises:
        ValueError: If ``df`` has no non-missing timestamps to plot.
    """
    # Keep the years in a local Series instead of writing a 'year' column into
    # the caller's DataFrame; rows without a timestamp have no year and are dropped.
    years = df['timestamp'].dt.year.dropna().astype(int)
    if years.empty:
        raise ValueError('No valid timestamps to plot.')

    # One unit-wide bin centred on each calendar year between the first and
    # last year, so every bar corresponds to exactly one year even when some
    # years in between have no reviews.
    bin_edges = np.arange(years.min() - 0.5, years.max() + 1.5, 1.0)

    # Plot the histogram
    plt.figure(figsize=(10, 6))
    years.hist(bins=bin_edges, color='blue', alpha=0.5)
    if title_suffix:
        plt.title(f'Distribution of Reviews by Year - {title_suffix}')
    else:
        plt.title('Distribution of Reviews by Year')
    plt.xlabel('Year')
    plt.ylabel('Number of Reviews')
    plt.grid(True)
    plt.show()

In [None]:

def avg_rating_by_year(df):
    """Plot the mean review rating per calendar year as a line chart.

    Args:
        df: DataFrame with a datetime-typed ``timestamp`` column (the ``.dt``
            accessor is used on it) and a ``rating`` column. The frame is not
            modified.
    """
    # Derive the year as a separate Series rather than adding a 'year' column
    # to the caller's DataFrame.
    review_year = df['timestamp'].dt.year

    # Group the ratings by year and calculate the average rating
    yearly_mean_rating = df['rating'].groupby(review_year).mean()

    # Plot the line chart
    plt.figure(figsize=(10, 6))
    plt.plot(yearly_mean_rating.index, yearly_mean_rating.values, color='blue', marker='o', linestyle='-')
    plt.title('Average Rating of Reviews by Year')
    plt.xlabel('Year')
    plt.ylabel('Average Rating')
    plt.grid(True)
    plt.show()

# Example usage:
# avg_rating_by_year(your_dataframe)


In [None]:
def avg_rating_and_sentiment_over_time(df, product_name):
    """Plot the average rating and average sentiment over time on twin y-axes.

    Rows are bucketed on the ``timestamp`` column with ``pd.Grouper(freq='M')``
    and the mean of ``rating`` and of ``sentiment`` is taken per bucket. The
    rating is drawn in blue against the left axis, the sentiment in red
    against the right axis.

    Args:
        df: DataFrame with ``timestamp``, ``rating`` and ``sentiment`` columns.
        product_name: Name inserted into the figure title.
    """
    # Bucket the rows once and reuse the grouping for both averages
    per_period = df.groupby(pd.Grouper(key='timestamp', freq='M'))
    mean_rating = per_period['rating'].mean()
    mean_sentiment = per_period['sentiment'].mean()

    rating_color = 'tab:blue'
    sentiment_color = 'tab:red'

    # Left axis: average rating
    figure, rating_axis = plt.subplots(figsize=(10, 6))
    rating_axis.plot(mean_rating.index, mean_rating.values, label='Average Rating', color=rating_color)
    rating_axis.set_xlabel('Date')
    rating_axis.set_ylabel('Average Rating', color=rating_color)
    rating_axis.tick_params(axis='y', labelcolor=rating_color)
    rating_axis.grid(True)

    # Right axis (sharing the x-axis): average sentiment score
    sentiment_axis = rating_axis.twinx()
    sentiment_axis.plot(
        mean_sentiment.index,
        mean_sentiment.values,
        label='Average Sentiment Score',
        color=sentiment_color,
    )
    sentiment_axis.set_ylabel('Average Sentiment Score', color=sentiment_color)
    sentiment_axis.tick_params(axis='y', labelcolor=sentiment_color)

    figure.suptitle(f'Average Rating and Sentiment Over Time for {product_name}')
    figure.tight_layout()

    plt.show()



In [None]:
# Function for showing the number of reviews per year, split by rating category

def rating_counts_by_year(df):
    """Plot a stacked bar chart of review counts per year and rating category.

    Ratings 1 and 2 count as 'Negative', 3 as 'Neutral', 4 and 5 as
    'Positive'. Ratings outside this mapping become missing and are left out
    of the counts.

    Args:
        df: DataFrame with a datetime-typed ``timestamp`` column (the ``.dt``
            accessor is used on it) and a ``rating`` column. The frame is not
            modified.
    """
    # Fixed category order with a matching colour per category
    category_order = ['Negative', 'Neutral', 'Positive']
    category_colors = ['red', 'grey', 'green']

    # Map ratings to categories (1 and 2 as negative, 3 as neutral, 4 and 5 as positive)
    rating_categories = {1: 'Negative', 2: 'Negative', 3: 'Neutral', 4: 'Positive', 5: 'Positive'}

    # Derive both grouping keys as standalone Series so no helper columns are
    # written into the caller's DataFrame.
    review_year = df['timestamp'].dt.year.rename('year')
    rating_category = df['rating'].map(rating_categories).rename('rating_category')

    # Count reviews per (year, category). Re-indexing the columns guarantees
    # that all three categories are present and in a fixed order, so the
    # colour list below always lines up even if a category has no reviews.
    counts = (
        rating_category
        .groupby([review_year, rating_category])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=category_order, fill_value=0)
    )

    # DataFrame.plot opens its own figure unless it is handed an Axes, which
    # would leave the sized figure blank; draw onto the Axes created here.
    fig, ax = plt.subplots(figsize=(10, 6))
    counts.plot(kind='bar', stacked=True, color=category_colors, alpha=0.7, ax=ax)
    ax.set_title('Rating Distribution by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Ratings')
    ax.legend(title='Rating Category', loc='upper left')
    ax.grid(True)
    plt.show()


In [None]:
# Showing sentiment distribution by year

def sentiment_by_year(df):
    """Draw one box plot of the ``sentiment`` values per calendar year.

    Args:
        df: DataFrame with a datetime-typed ``timestamp`` column (the ``.dt``
            accessor is used on it) and a ``sentiment`` column. The frame is
            not modified.
    """
    # Build a small two-column frame for plotting instead of adding a 'year'
    # column to the caller's DataFrame.
    plot_data = df[['sentiment']].assign(year=df['timestamp'].dt.year)

    # Plot the box plot
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='year', y='sentiment', data=plot_data, palette='Set2')
    plt.title('Distribution of Text Sentiment by Year')
    plt.xlabel('Year')
    plt.ylabel('Text Sentiment')
    plt.grid(True)
    plt.show()


# 1. Basic Statistics

In [None]:
# Quick overview: summary statistics of the mouse product data
print(data_utilitarian_mouse.describe())


In [None]:
# Summary statistics of the filter product data
print(data_utilitarian_filter.describe())


In [None]:
# Summary statistics of the razor product data
print(data_utilitarian_razor.describe())

In [None]:
# Draw the rating-distribution chart for each product, in the same order as before
for product_reviews in (
    data_utilitarian_mouse,
    data_utilitarian_filter,
    data_utilitarian_razor,
    data_hedonic_chocolate,
):
    rating_counts_by_year(product_reviews)

In [None]:
# Plot each product from its own DataFrame so the chart matches its title.
# Passing the combined hedonic/utilitarian frames drew identical charts under
# different product names.
# NOTE(review): assumes the per-product frames carry the 'rating' and
# 'sentiment' columns the function reads — confirm.
avg_rating_and_sentiment_over_time(data_hedonic_chocolate, 'Chocolate')
# NOTE(review): no facial-spray-specific DataFrame is loaded in the visible
# cells, so the combined hedonic data is still used here — TODO load that
# product's file and pass it instead.
avg_rating_and_sentiment_over_time(data_hedonic, 'Facial Spray')
avg_rating_and_sentiment_over_time(data_utilitarian_mouse, 'Gaming Mouse')
avg_rating_and_sentiment_over_time(data_utilitarian_razor, 'Razor')



## 2. Analyzing Helpfulness

In [None]:
# Scatter plot of summed helpful votes per timestamp

def plot_helpfulness_votes_over_time(data_frame):
    """Scatter the total number of helpful votes per timestamp.

    The ``helpful_vote`` values are summed per distinct ``timestamp``; only
    timestamps whose total is greater than 1 are drawn.

    Args:
        data_frame: DataFrame with ``timestamp`` and ``helpful_vote`` columns.
    """
    # Total helpful votes for every distinct timestamp
    total_votes = data_frame.groupby('timestamp')['helpful_vote'].sum()

    # Keep only the timestamps with more than one helpful vote in total
    more_than_one_vote = total_votes > 1
    shown_votes = total_votes[more_than_one_vote]

    # Draw the scatter plot
    figure, axis = plt.subplots(figsize=(10, 6))
    axis.scatter(shown_votes.index, shown_votes, color='blue', alpha=0.5)
    axis.set_title('Helpfulness Votes over Time')
    axis.set_xlabel('Timestamp')
    axis.set_ylabel('Total Helpful Votes')
    plt.xticks(rotation=45)
    axis.grid(True)
    plt.show()


In [None]:
# Count the mouse reviews that received at least one helpful vote
helpful_reviews_count = int((data_utilitarian_mouse['helpful_vote'] > 0).sum())
print("Number of reviews with at least one helpful rating:", helpful_reviews_count)

In [None]:
# Count the filter reviews that received at least one helpful vote
helpful_reviews_count = int((data_utilitarian_filter['helpful_vote'] > 0).sum())
print("Number of reviews with at least one helpful rating:", helpful_reviews_count)

In [None]:
# Count the razor reviews that received at least one helpful vote
helpful_reviews_count = int((data_utilitarian_razor['helpful_vote'] > 0).sum())
print("Number of reviews with at least one helpful rating:", helpful_reviews_count)

In [None]:
# Count the chocolate reviews that received at least one helpful vote
helpful_reviews_count = int((data_hedonic_chocolate['helpful_vote'] > 0).sum())
print("Number of reviews with at least one helpful rating:", helpful_reviews_count)

## 3. Text Analysis: Word Clouds

In [None]:
# Generating word clouds

def generate_wordcloud_reviews_by_sentiment(df):
    """Draw one bigram word cloud per rating-based sentiment category.

    Ratings 1 and 2 count as 'Negative', 3 as 'Neutral', 4 and 5 as
    'Positive'. For every category the bigrams of each review's
    ``text_cleaned1`` text are counted, and the counts are rendered as a word
    cloud in one of three side-by-side subplots. A category without any
    bigrams gets a placeholder text instead of a word cloud.

    Args:
        df: DataFrame with a ``rating`` column and a ``text_cleaned1`` column.
            The frame is not modified.
    """
    # Imported locally so the function does not rely on an extra file-level import
    from collections import Counter

    # Define rating categories
    rating_categories = {1: 'Negative', 2: 'Negative', 3: 'Neutral', 4: 'Positive', 5: 'Positive'}
    review_sentiment = df['rating'].map(rating_categories)

    # Create subplots
    fig, axes = plt.subplots(1, 3, figsize=(20, 6))

    # Iterate over each sentiment category
    for ax, sentiment in zip(axes.flatten(), ['Negative', 'Neutral', 'Positive']):
        # Select the cleaned texts of this category; missing texts are dropped
        # on the selected Series, so no in-place edit of a DataFrame slice is needed.
        texts = df.loc[review_sentiment == sentiment, 'text_cleaned1'].dropna().astype(str)

        # Count bigrams review by review, so the last word of one review is
        # never paired with the first word of the next. Each bigram becomes a
        # single "word1 word2" key mapped to its number of occurrences, which
        # is the word -> frequency mapping generate_from_frequencies expects.
        bigram_counts = Counter()
        for review_text in texts:
            bigram_counts.update(' '.join(pair) for pair in bigrams(review_text.split()))

        if bigram_counts:
            # Create a WordCloud object from the bigram frequencies
            wordcloud = WordCloud(width=400, height=200, background_color='white').generate_from_frequencies(bigram_counts)
            ax.imshow(wordcloud, interpolation='bilinear')
        else:
            # Nothing to draw for this category; a word cloud cannot be built
            # from an empty frequency mapping.
            ax.text(0.5, 0.5, 'No reviews available', ha='center', va='center', transform=ax.transAxes)

        ax.set_title(f'Word Cloud of {sentiment} Reviews (Bigrams)')
        ax.axis('off')

    plt.tight_layout()
    plt.show()