# In [2]:
# Created with the help of ChatGPT (LLM) by OpenAI, 2024

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats

# Read the semicolon-separated data export.
file_path = 'final_all_data_cleaned_with_categories.csv'
df = pd.read_csv(file_path, sep=';')

# Normalise the header names: trim surrounding whitespace and lower-case them.
df = df.rename(columns=lambda name: name.strip().lower())

# Keep only rows that carry both a category and an average rating.
df = df.dropna(subset=['category', 'avg_rating'])

# Output-name suffix -> source column for the three metrics being summarised.
_metric_columns = {
    'rating': 'avg_rating',
    'price': 'price',
    'ratings_count': 'number_of_ratings',
}

# Per category, compute mean / non-missing count / standard deviation of each
# metric. The result columns are named "<stat>_<suffix>", e.g. "mean_rating".
grouped = df.groupby('category').agg(**{
    f'{stat}_{suffix}': (column, stat)
    for suffix, column in _metric_columns.items()
    for stat in ('mean', 'count', 'std')
})

# Standard error of each mean: std / sqrt(n), using that metric's own count.
grouped = grouped.assign(**{
    f'se_{suffix}': grouped[f'std_{suffix}'] / np.sqrt(grouped[f'count_{suffix}'])
    for suffix in _metric_columns
})

# Two-sided normal quantile for the chosen confidence level.
confidence_level = 0.95
z_score = stats.norm.ppf((1 + confidence_level) / 2)

# The "ci_*" columns hold the half-width of the interval (z * SE), which is
# what the plotting code uses as the error-bar length.
grouped = grouped.assign(**{
    f'ci_{suffix}': z_score * grouped[f'se_{suffix}']
    for suffix in _metric_columns
})

# Function to plot bar graphs with confidence intervals and save them
def plot_and_save(
    grouped: "pd.DataFrame",
    metric: str,
    ci: str,
    title: str,
    ylabel: str,
    filename: str,
    color: str,
    offset_positive: float,
    offset_negative: float,
) -> None:
    """Draw a bar chart with error bars for one metric and save it to a file.

    One bar is drawn per row of ``grouped``, labelled with the row's index
    value. Each bar gets a two-decimal value label; the label offset alternates
    between ``offset_positive`` (even-positioned bars) and ``offset_negative``
    (odd-positioned bars) so neighbouring labels are staggered vertically.

    Args:
        grouped: Table whose index supplies the x-axis labels and which
            contains the ``metric`` and ``ci`` columns.
        metric: Name of the column holding the bar heights.
        ci: Name of the column holding the symmetric error-bar lengths.
        title: Title shown above the chart.
        ylabel: Label of the y-axis.
        filename: Path the figure is written to.
        color: Fill colour of the bars.
        offset_positive: Vertical label offset for bars at even positions.
        offset_negative: Vertical label offset for bars at odd positions.
    """
    # Work on explicit figure/axes handles instead of pyplot's implicit
    # "current figure", so the function cannot draw into or close the wrong one.
    fig, ax = plt.subplots(figsize=(12, 8))  # large canvas for label visibility
    try:
        bars = ax.bar(
            grouped.index,
            grouped[metric],
            yerr=grouped[ci],
            capsize=5,
            color=color,
            alpha=0.7,
        )
        for i, bar in enumerate(bars):
            height = bar.get_height()
            if not np.isfinite(height):
                # No data for this bar (NaN mean): there is nothing sensible
                # to print, and a label cannot be placed at a non-finite y.
                continue
            offset = offset_positive if i % 2 == 0 else offset_negative
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                height + offset,
                f"{height:.2f}",
                ha='center',
                # Anchor the text so it grows away from the bar top.
                va='bottom' if offset > 0 else 'top',
                color='black',
                fontsize=10,
            )
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.subplots_adjust(bottom=0.2)  # room for the rotated tick labels
        fig.savefig(filename, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; release it even
        # when plotting or saving raised.
        plt.close(fig)

# Render one chart per metric. Each call writes its own PNG file; the two
# offsets control how far the value labels sit above / below the bar tops and
# are chosen to suit the scale of the respective metric.

# Average rating per category
plot_and_save(
    grouped,
    metric='mean_rating',
    ci='ci_rating',
    title='Average Rating by Category with 95% CI',
    ylabel='Average Rating',
    filename='average_rating_by_category.png',
    color='blue',
    offset_positive=0.1,
    offset_negative=-0.1,
)

# Average price per category
plot_and_save(
    grouped,
    metric='mean_price',
    ci='ci_price',
    title='Average Price by Category with 95% CI',
    ylabel='Average Price',
    filename='average_price_by_category.png',
    color='green',
    offset_positive=10,
    offset_negative=-10,
)

# Average number of ratings per category
plot_and_save(
    grouped,
    metric='mean_ratings_count',
    ci='ci_ratings_count',
    title='Average Number of Ratings by Category with 95% CI',
    ylabel='Average Number of Ratings',
    filename='average_number_of_ratings_by_category.png',
    color='orange',
    offset_positive=1,
    offset_negative=-1,
)

