In [None]:
import pandas as pd
import json
from collections import Counter
import re

# Function to load the dataset
def load_dataset(file_path):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Args:
        file_path: Path to the file to read.

    Returns:
        A ``pandas.DataFrame`` built from the list of decoded lines, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification; relying on the platform default
    # encoding can mis-decode or fail on non-ASCII review text.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines (e.g. a trailing newline at the
        # end of the file) instead of letting json.loads fail on them.
        data = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(data)

# Function to calculate basic statistics
def calculate_basic_stats(df):
    """Summarise the dataset with a row count and the mean rating.

    Args:
        df: DataFrame that has an ``overall`` column.

    Returns:
        A dict with two entries: ``"Total Reviews"`` (the number of rows in
        ``df``) and ``"Average Rating"`` (the mean of the ``overall`` column).
    """
    review_count = len(df)
    mean_rating = df['overall'].mean()

    stats = {}
    stats["Total Reviews"] = review_count
    stats["Average Rating"] = mean_rating
    return stats

# Function to analyze review length
def analyze_review_length(df):
    """Measure the character length of review bodies and summaries.

    Side effect: two columns are written onto ``df`` itself --
    ``review_length`` (from ``reviewText``) and ``summary_length`` (from
    ``summary``).

    Args:
        df: DataFrame that has ``reviewText`` and ``summary`` columns.

    Returns:
        A dict with the mean of each new column, keyed
        ``"Average Review Length"`` and ``"Average Summary Length"``.
    """
    # (source column, derived length column), processed in this order.
    column_pairs = (
        ('reviewText', 'review_length'),
        ('summary', 'summary_length'),
    )
    for text_col, length_col in column_pairs:
        df[length_col] = df[text_col].str.len()

    avg_review = df['review_length'].mean()
    avg_summary = df['summary_length'].mean()
    return {
        "Average Review Length": avg_review,
        "Average Summary Length": avg_summary,
    }

# Function to find the most common words
def get_most_common_words(text_series, num_words=10):
    """Return the most frequent lowercase ASCII-letter words in a text Series.

    Each non-null entry is lowercased and scanned with the pattern
    ``\\b[a-z]+\\b``, so only runs of the letters a-z delimited by word
    boundaries count as words.

    Args:
        text_series: pandas Series of texts; null entries are dropped.
            Non-string entries are converted with ``str()`` before scanning.
        num_words: How many top entries to return (passed straight to
            ``Counter.most_common``).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least common.
    """
    # Compile once, outside the loop.
    word_pattern = re.compile(r'\b[a-z]+\b')
    word_counts = Counter()
    # Tokenize entry by entry rather than joining the whole corpus into one
    # huge string (plus a lowercased copy of it); the resulting counts and
    # first-seen ordering of ties are the same, with far lower peak memory.
    for text in text_series.dropna():
        # str() guards against stray non-string values (e.g. numbers), which
        # would otherwise make the tokenization fail with a TypeError.
        word_counts.update(word_pattern.findall(str(text).lower()))
    return word_counts.most_common(num_words)

# Read the Clothing_Shoes_and_Jewelry_5.json reviews into a DataFrame.
# NOTE: the path below is a placeholder and must point at the real file.
file_path_clothing = 'path_to_your_Clothing_Shoes_and_Jewelry_5_json_file'
df_clothing = load_dataset(file_path_clothing)

# Exploratory analysis: overall stats, text lengths, then word frequencies
# for the review bodies and the summaries (in that order).
basic_stats_clothing = calculate_basic_stats(df_clothing)
review_length_stats_clothing = analyze_review_length(df_clothing)
common_words_in_reviews_clothing, common_words_in_summaries_clothing = (
    get_most_common_words(df_clothing[text_column])
    for text_column in ('reviewText', 'summary')
)

# Report each result on its own line as "<label> <value>".
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("Basic Statistics:", basic_stats_clothing),
            ("Review Length Analysis:", review_length_stats_clothing),
            ("Common Words in Reviews:", common_words_in_reviews_clothing),
            ("Common Words in Summaries:", common_words_in_summaries_clothing),
        )
    ),
    sep="\n",
)
