In [1]:
import pandas as pd
import json
from collections import Counter
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import numpy as np

# Function to load the dataset
def load_dataset(file_path):
    """Load a JSON Lines file into a DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Args:
        file_path: Path of the file to read.

    Returns:
        A pandas DataFrame built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale default, which would garble non-ASCII review text on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines are skipped: json.loads rejects them
        # and they carry no record.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Function to calculate basic statistics
def calculate_basic_stats(df):
    """Summarise a review DataFrame as a dict of headline numbers.

    Reads the 'overall', 'reviewText' and 'summary' columns of ``df``.

    Returns:
        A dict with the row count, the mean of 'overall', and the mean
        character length of 'reviewText' and of 'summary'.
    """
    stats = {"Total Reviews": len(df)}

    mean_rating = df['overall'].mean()
    stats["Average Rating"] = mean_rating

    review_chars = df['reviewText'].str.len()
    stats["Average Review Length"] = review_chars.mean()

    summary_chars = df['summary'].str.len()
    stats["Average Summary Length"] = summary_chars.mean()

    return stats

# Function to find the most common words
def get_most_common_words(text_series, num_words=10):
    """Return the most frequent lowercase ASCII-letter words in a text Series.

    Missing entries are dropped. Each remaining entry is lower-cased and
    split into runs of the letters a-z delimited by word boundaries.

    Args:
        text_series: pandas Series of texts; missing values are ignored.
        num_words: How many (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as produced
        by ``Counter.most_common``.
    """
    word_pattern = re.compile(r'\b[a-z]+\b')
    counts = Counter()
    # Count one text at a time rather than joining the whole corpus into a
    # single string (plus a lower-cased copy); results are identical because
    # words cannot span two entries.
    for text in text_series.dropna():
        # str() keeps a stray non-string entry (e.g. a numeric summary) from
        # aborting the whole count, matching how analyze_ratings coerces text.
        counts.update(word_pattern.findall(str(text).lower()))
    return counts.most_common(num_words)

def _label_and_show(title, x_label, y_label):
    """Set the title and axis labels on the current figure, then display it."""
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


def analyze_category(data, category_name):
    """Run the exploratory analysis for one review category.

    Prints basic statistics and the most common words of the reviews and
    summaries, plots the rating distribution and the review-length
    distribution, then calls ``analyze_ratings`` once per distinct rating
    in ascending order.

    Note: a 'review_length' column is added to ``data`` in place.

    Args:
        data: DataFrame with 'overall', 'reviewText' and 'summary' columns.
        category_name: Label used in the printed header and plot titles.
    """
    print(f"Analyzing category: {category_name}")

    # Headline numbers
    print("Basic Statistics:", calculate_basic_stats(data))

    # Most frequent words; both lists are computed before either is printed
    top_review_words, top_summary_words = (
        get_most_common_words(data[column]) for column in ('reviewText', 'summary')
    )
    print("Common Words in Reviews:", top_review_words)
    print("Common Words in Summaries:", top_summary_words)

    # How many reviews each rating received
    per_rating = data['overall'].value_counts()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=per_rating.index, y=per_rating.values)
    _label_and_show(f"Distribution of Ratings in {category_name}", 'Ratings', 'Count')

    # Character length of every review (stored on the frame itself)
    data['review_length'] = data['reviewText'].str.len()
    plt.figure(figsize=(10, 6))
    sns.histplot(data['review_length'], bins=50, kde=True)
    _label_and_show(
        f"Distribution of Review Lengths in {category_name}",
        'Review Length (Characters)',
        'Frequency',
    )

    # Drill down into each rating separately
    distinct_ratings = data['overall'].unique()
    for rating in sorted(distinct_ratings):
        rating_rows = data[data['overall'] == rating]
        analyze_ratings(rating_rows, rating)

def analyze_ratings(data, rating):
    """Plot a word cloud and a sentiment histogram for one rating's reviews.

    Args:
        data: DataFrame with a 'reviewText' column. It is not modified.
        rating: Rating value; used only in the printed header and plot titles.
    """
    print(f"Analyzing Rating: {rating}")

    review_texts = data['reviewText']

    # Word Cloud for each rating
    corpus = ' '.join(review_texts.dropna())
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
    except ValueError:
        # WordCloud raises ValueError when the text yields no words to draw
        # (e.g. every review for this rating is missing); skip the cloud
        # instead of aborting the remaining analysis.
        print(f"No words available for a word cloud for rating {rating}; skipping.")
    else:
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f"Word Cloud for Rating {rating}")
        plt.axis('off')
        plt.show()

    # Sentiment Analysis
    # Keep the polarity scores in a local Series: `data` is typically a
    # filtered slice of a larger frame, and assigning a new column to it
    # triggers pandas' SettingWithCopyWarning without benefiting the caller.
    # str() lets non-string entries (such as NaN for a missing review) be
    # handed to TextBlob as text.
    sentiment = review_texts.apply(
        lambda x: TextBlob(str(x)).sentiment.polarity
    ).rename('sentiment')
    plt.figure(figsize=(10, 6))
    sns.histplot(sentiment, bins=50, kde=True)
    plt.title(f"Sentiment Distribution for Rating {rating}")
    plt.xlabel('Sentiment Polarity')
    plt.ylabel('Frequency')
    plt.show()

# Input files, parsed line by line by load_dataset (adjust the paths for your machine)
clothing_path = '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/Clothing_Shoes_and_Jewelry.json'
beauty_path = '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/All_Beauty.json'

# Both datasets are loaded before any analysis starts
df_clothing = load_dataset(clothing_path)
df_beauty = load_dataset(beauty_path)

# Perform EDA on each category, clothing first
for category_frame, category_label in (
    (df_clothing, "Clothing, Shoes and Jewelry"),
    (df_beauty, "All Beauty"),
):
    analyze_category(category_frame, category_label)



ModuleNotFoundError: No module named 'seaborn'