# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import json

def load_sample_dataset(file_path, sample_size, random_state=42):
    """Read a JSON-lines file and return a random subset of its rows.

    Args:
        file_path: Location of the file; parsed with ``lines=True``, i.e.
            one JSON record per line.
        sample_size: Maximum number of rows to draw. If the file holds
            fewer rows, all of them are returned (in shuffled order).
        random_state: Seed passed to ``DataFrame.sample`` so repeated calls
            pick the same rows.

    Returns:
        The sampled DataFrame, or an empty DataFrame if reading or sampling
        raised; in that case the error is printed rather than propagated.
    """
    try:
        records = pd.read_json(file_path, lines=True)
        row_count = min(sample_size, len(records))
        subset = records.sample(n=row_count, random_state=random_state)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller test
        # for an empty frame.
        print(f"Error loading file: {e}")
        return pd.DataFrame()
    else:
        return subset

def plot_rating_distribution(df, category_name):
    """Draw a bar chart of how often each rating value occurs.

    Args:
        df: DataFrame with an 'overall' column holding the ratings.
        category_name: Label inserted into the chart title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x='overall', ax=axes)
    # Set the labels after plotting so they replace seaborn's defaults.
    axes.set_title(f"Rating Distribution in {category_name}")
    axes.set_xlabel('Rating')
    axes.set_ylabel('Count')
    plt.show()

def plot_review_length_distribution(df, category_name, column_name):
    """Plot a histogram of word counts for one text column.

    Each value is converted to a string and split on whitespace; the number
    of resulting tokens is its length. Missing values (None/NaN) count as
    zero words.

    Args:
        df: DataFrame containing ``column_name``.
        category_name: Label inserted into the chart title.
        column_name: Name of the text column to measure.

    Side effects:
        Stores the word counts in ``df['review_length']`` (replacing any
        existing column of that name) and displays the figure with
        ``plt.show()``. Nothing is returned.
    """
    texts = df[column_name]
    word_counts = texts.apply(lambda value: len(str(value).split()))
    # str(NaN) is "nan", which would otherwise be counted as a one-word
    # review; a missing text has no words, so force those rows to zero.
    df['review_length'] = word_counts.mask(texts.isna(), 0)
    plt.figure(figsize=(10, 6))
    sns.histplot(df['review_length'], bins=50)
    plt.title(f"Review Length Distribution in {category_name}")
    plt.xlabel('Length of Review (Words)')
    plt.ylabel('Count')
    plt.show()

def display_word_cloud(df, column, title):
    """Render a word cloud built from every non-missing value of a column.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose values are joined into one text.
        title: Title shown above the image.

    Missing values are dropped and the remaining values are stringified and
    joined with spaces. If the resulting text contains no usable words, a
    message is printed and no figure is shown. Otherwise the image is
    displayed with ``plt.show()``. Nothing is returned.
    """
    text = ' '.join(df[column].dropna().astype(str))
    try:
        wordcloud = WordCloud(width=800, height=400).generate(text)
    except ValueError as e:
        # WordCloud raises ValueError when it finds no words to draw (e.g.
        # the column is entirely missing or empty). Skip this plot instead
        # of aborting the remaining plots.
        print(f"Skipping word cloud '{title}': {e}")
        return
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

# Map each product category to the path of its review file.
# Entries that are commented out are skipped; uncomment one to include that
# category in the run.
categories = {
    # 'AMAZON_FASHION': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/AMAZON_FASHION.json',
    'Clothing_Shoes_and_Jewelry': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/Clothing_Shoes_and_Jewelry.json',
    # 'All_Beauty': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/All_Beauty.json',
    # 'Appliances': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/Appliances.json',
    # 'Toys_and_Games': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/Toys_and_Games.json',
    # 'Arts_Crafts_and_Sewing': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/Arts_Crafts_and_Sewing.json',
    # 'Grocery_and_Gourmet_Food': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/Grocery_and_Gourmet_Food.json',
    # 'Tools_and_Home_Improvement': '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2/Tools_and_Home_Improvement.json'
}
# Upper bound on the number of reviews sampled per category.
sample_size = 50_000

# For every enabled category: sample the reviews, then show the rating
# histogram, the word-count histograms and the word clouds.
for category, path in categories.items():
    df = load_sample_dataset(path, sample_size)
    if df.empty:
        # Nothing to plot for this category.
        continue
    plot_rating_distribution(df, category)
    plot_review_length_distribution(df, category, 'reviewText')
    plot_review_length_distribution(df, category, 'summary')
    display_word_cloud(df, 'reviewText', f"Common Words in {category} Reviews")
    display_word_cloud(df, 'summary', f"Common Words in {category} Summaries")

