Se chunk 2; den første crashede ret sent for mig.

In [1]:
import pandas as pd
import json
from collections import Counter
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import numpy as np

# Function to load the dataset
def load_dataset(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, one column per JSON key.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. trailing newlines) are skipped
        # instead of being handed to json.loads, which would raise on them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Function to calculate basic statistics
def calculate_basic_stats(df):
    """Summarise a review DataFrame with four headline numbers.

    Reads the 'overall', 'reviewText' and 'summary' columns of ``df`` and
    returns a dict with the row count, the mean of 'overall', and the mean
    character length of 'reviewText' and of 'summary'.
    """
    review_chars = df['reviewText'].str.len()
    summary_chars = df['summary'].str.len()

    stats = {}
    stats["Total Reviews"] = len(df)
    stats["Average Rating"] = df['overall'].mean()
    stats["Average Review Length"] = review_chars.mean()
    stats["Average Summary Length"] = summary_chars.mean()
    return stats

# Function to find the most common words
def get_most_common_words(text_series, num_words=10):
    """Return the ``num_words`` most frequent lowercase a-z words.

    Parameters
    ----------
    text_series : pandas.Series
        Texts to scan. Missing values are dropped; any other value is
        converted with ``str()`` before tokenising.
    num_words : int, default 10
        How many (word, count) pairs to return.

    Returns
    -------
    list of (str, int)
        Pairs as produced by ``Counter.most_common``.
    """
    word_pattern = re.compile(r'\b[a-z]+\b')
    counts = Counter()
    # Count row by row instead of joining the whole corpus into one string:
    # the tokens are the same (rows were joined with a space, which is a word
    # boundary), but peak memory no longer scales with the corpus size.
    for text in text_series.dropna():
        # str() keeps a stray non-string entry from raising.
        counts.update(word_pattern.findall(str(text).lower()))
    return counts.most_common(num_words)

def analyze_category(data, category_name):
    """Run the exploratory analysis for one product category.

    Prints the basic statistics and the most common words of the
    'reviewText' and 'summary' columns, shows a bar plot of rating counts
    and a histogram of review lengths in characters, then calls
    ``analyze_ratings`` once per distinct value of 'overall'.

    Note: a 'review_length' column is written onto ``data`` itself.
    """
    print(f"Analyzing category: {category_name}")

    # Headline numbers
    print("Basic Statistics:", calculate_basic_stats(data))

    # Most frequent words, reviews first and then summaries
    review_top_words = get_most_common_words(data['reviewText'])
    summary_top_words = get_most_common_words(data['summary'])
    print("Common Words in Reviews:", review_top_words)
    print("Common Words in Summaries:", summary_top_words)

    # How many reviews each rating received
    counts_per_rating = data['overall'].value_counts()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts_per_rating.index, y=counts_per_rating.values)
    plt.title(f"Distribution of Ratings in {category_name}")
    plt.xlabel('Ratings')
    plt.ylabel('Count')
    plt.show()

    # Character length of every review, stored on the frame and plotted
    data['review_length'] = data['reviewText'].str.len()
    plt.figure(figsize=(10, 6))
    sns.histplot(data['review_length'], bins=50, kde=True)
    plt.title(f"Distribution of Review Lengths in {category_name}")
    plt.xlabel('Review Length (Characters)')
    plt.ylabel('Frequency')
    plt.show()

    # Drill down into each rating value, in ascending order
    distinct_ratings = sorted(data['overall'].unique())
    for rating in distinct_ratings:
        rating_subset = data[data['overall'] == rating]
        analyze_ratings(rating_subset, rating)

def analyze_ratings(data, rating):
    """Show a word cloud and a sentiment histogram for one rating value.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to analyse; only the 'reviewText' column is read. The frame is
        not modified.
    rating
        Rating value, used only in the printed header and the plot titles.

    Missing review texts are left out of both plots. If no words are
    available for the word cloud, that plot is skipped with a printed
    message rather than aborting the analysis.
    """
    print(f"Analyzing Rating: {rating}")

    # Drop missing reviews once; str() guards against stray non-string values.
    review_texts = data['reviewText'].dropna().astype(str)

    # Word cloud for this rating
    try:
        # WordCloud.generate is expected to raise ValueError when the text
        # yields no usable words (empty input or stop words only).
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(review_texts))
    except ValueError as error:
        print(f"Skipping word cloud for rating {rating}: {error}")
    else:
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f"Word Cloud for Rating {rating}")
        plt.axis('off')
        plt.show()

    # Sentiment analysis. The scores live in a local Series: `data` is
    # typically a filtered slice of the caller's frame, so writing a column
    # onto it would trigger SettingWithCopyWarning and be discarded anyway.
    if review_texts.empty:
        print(f"No review text available for rating {rating}; skipping sentiment plot.")
        return
    sentiment = review_texts.apply(lambda text: TextBlob(text).sentiment.polarity)
    plt.figure(figsize=(10, 6))
    sns.histplot(sentiment, bins=50, kde=True)
    plt.title(f"Sentiment Distribution for Rating {rating}")
    plt.xlabel('Sentiment Polarity')
    plt.ylabel('Frequency')
    plt.show()

# Input files (replace with the correct file paths)
CLOTHING_PATH = '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/Clothing_Shoes_and_Jewelry.json'
BEAUTY_PATH = '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/All_Beauty.json'

# Load both datasets up front
df_clothing = load_dataset(CLOTHING_PATH)
df_beauty = load_dataset(BEAUTY_PATH)

# Perform EDA on each category, in the same order as they were loaded
for category_frame, category_label in (
    (df_clothing, "Clothing, Shoes and Jewelry"),
    (df_beauty, "All Beauty"),
):
    analyze_category(category_frame, category_label)



Analyzing category: Clothing, Shoes and Jewelry
Basic Statistics: {'Total Reviews': 32292099, 'Average Rating': 4.189458449263394, 'Average Review Length': 158.12622289071717, 'Average Summary Length': 21.565853695260873}


: 

In [None]:
# Den her er den opdaterede EDA.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import json

def load_sample_dataset(file_path, sample_size, random_state=None):
    """Load a JSON Lines file and return a random sample of its rows.

    Parameters
    ----------
    file_path : str or path-like
        JSON Lines source, passed straight to ``pd.read_json(..., lines=True)``.
    sample_size : int
        Maximum number of rows to return; capped at the number of rows read.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value (e.g. an int)
        to get the same sample on every run; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, or an empty DataFrame if loading or sampling
        failed (the error is printed, not raised).

    Notes
    -----
    The whole file is parsed before sampling, so memory use grows with the
    file size, not with ``sample_size``.
    """
    try:
        df = pd.read_json(file_path, lines=True)
        row_count = min(sample_size, len(df))
        return df.sample(n=row_count, random_state=random_state)
    except Exception as e:
        # Deliberate best-effort: callers check `.empty` and skip the category.
        print(f"Error loading file: {e}")
        return pd.DataFrame()

def plot_rating_distribution(df, category_name):
    """Show a count plot of the 'overall' column of ``df``.

    ``category_name`` only appears in the plot title.
    """
    plt.figure(figsize=(10, 6))
    # countplot draws on the figure just created and returns its Axes,
    # so the labels are set directly on that Axes object.
    axes = sns.countplot(data=df, x='overall')
    axes.set_title(f"Rating Distribution in {category_name}")
    axes.set_xlabel('Rating')
    axes.set_ylabel('Count')
    plt.show()

def plot_review_length_distribution(df, category_name, column_name):
    """Show a histogram of word counts for one text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. As before, the word counts are also
        stored on ``df`` in a 'review_length' column, which is overwritten
        on every call.
    category_name : str
        Used in the plot title.
    column_name : str
        Column whose whitespace-separated word counts are plotted. The title
        and x-axis label name this column, so plots for different columns
        (e.g. 'reviewText' vs 'summary') can be told apart.
    """
    # 'reviewText' keeps the original "Review" wording; other columns are
    # labelled by their own name ('summary' -> 'Summary').
    if column_name == 'reviewText':
        label = 'Review'
    else:
        label = column_name.replace('_', ' ').title()

    texts = df[column_name]
    # A missing value has zero words; without the mask it would be counted
    # as one word because str(NaN) is the text "nan".
    word_counts = texts.astype(str).str.split().str.len().where(texts.notna(), 0)
    df['review_length'] = word_counts

    plt.figure(figsize=(10, 6))
    sns.histplot(word_counts, bins=50)
    plt.title(f"{label} Length Distribution in {category_name}")
    plt.xlabel(f"Length of {label} (Words)")
    plt.ylabel('Count')
    plt.show()

def display_word_cloud(df, column, title):
    """Show a word cloud built from the non-missing values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column : str
        Column whose values are joined (as strings) into the cloud's text.
    title : str
        Plot title.

    If the column yields no usable words, a message is printed and no
    figure is drawn, so a loop over several datasets can carry on.
    """
    text = ' '.join(df[column].dropna().astype(str))
    try:
        # WordCloud.generate is expected to raise ValueError when the text
        # yields no usable words (empty input or stop words only).
        wordcloud = WordCloud(width=800, height=400).generate(text)
    except ValueError as error:
        print(f"Skipping word cloud '{title}': {error}")
        return
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

# Directory holding one "<category>.json" file per category
DATA_DIR = '/Users/patrickbendorffschwebel/Desktop/Anvendt-kode/anvendt-exam/mappe uden navn 2'

# Categories to analyse, in processing order; each file is named after its category
CATEGORY_NAMES = [
    'AMAZON_FASHION',
    'Clothing_Shoes_and_Jewelry',
    'All_Beauty',
    'Appliances',
    'Toys_and_Games',
    'Arts_Crafts_and_Sewing',
    'Grocery_and_Gourmet_Food',
    'Tools_and_Home_Improvement',
]

# Map each category to the path of its file
categories = {name: f"{DATA_DIR}/{name}.json" for name in CATEGORY_NAMES}
sample_size = 5000  # Adjust as needed

for category, path in categories.items():
    df = load_sample_dataset(path, sample_size)
    if df.empty:
        # Loading failed or the file had no rows; move on to the next category.
        continue
    plot_rating_distribution(df, category)
    plot_review_length_distribution(df, category, 'reviewText')
    plot_review_length_distribution(df, category, 'summary')
    display_word_cloud(df, 'reviewText', f"Common Words in {category} Reviews")
    display_word_cloud(df, 'summary', f"Common Words in {category} Summaries")
