# LEGO Amazon Analysis (Medium Notebook)
This notebook loads the uploaded data files, performs basic cleaning and exploratory data analysis (EDA), and saves a set of images you can use in your presentation. It is designed to be beginner-friendly while still providing useful charts and insights for the project.

**Files used in this notebook**:
- `/mnt/data/geoMap.csv` (uploaded)
- `/mnt/data/toy-products-on-amazon-metadata.json` (uploaded)

If additional CSV data (e.g., the full Amazon dataset) is available, place it in the `/mnt/data` folder and re-run the cells. To be picked up, the file name must contain `amazon` and end in `.csv` or `.txt`.

In [None]:
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Report the versions of the core libraries so a run can be reproduced later.
for label, version in (
    ('pandas:', pd.__version__),
    ('numpy:', np.__version__),
    ('matplotlib:', plt.matplotlib.__version__),
):
    print(label, version)

In [None]:
# Paths to uploaded files
geo_path = '/mnt/data/geoMap.csv'
meta_json_path = '/mnt/data/toy-products-on-amazon-metadata.json'

# List the upload folder. A missing folder is reported as an empty listing
# instead of crashing, because the per-file checks below already explain
# which individual files could not be found.
data_dir = Path('/mnt/data')
print('Files present in /mnt/data:')
print(sorted(p.name for p in data_dir.iterdir()) if data_dir.is_dir() else [])

# Load geoMap if present
geo_df = None
if Path(geo_path).exists():
    geo_df = pd.read_csv(geo_path)
    print('\nLoaded geoMap.csv:')
    display(geo_df.head())
else:
    print('\ngeoMap.csv not found at', geo_path)

# Load metadata JSON (it describes the Amazon toy dataset fields)
meta = None
if Path(meta_json_path).exists():
    try:
        with open(meta_json_path, 'r', encoding='utf-8') as f:
            meta = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (file is not UTF-8). Anything else, such as a
        # NameError from a missing import, is a programming error and must
        # surface instead of being reported as a bad file.
        print('Failed to load JSON metadata:', e)
    else:
        if isinstance(meta, dict):
            print('\nLoaded toy-products metadata (showing top-level keys):', list(meta.keys())[:10])
        else:
            # Valid JSON whose top level is not an object has no keys to list.
            print('\nLoaded toy-products metadata; top-level value is a', type(meta).__name__)
else:
    print('\nMetadata JSON not found at', meta_json_path)

In [None]:
# Structural overview of the geoMap data: shape, column dtypes, summary stats.
if geo_df is not None:
    print('\ngeoMap shape:', geo_df.shape)
    print('\ngeoMap info:')
    # DataFrame.info() prints its report itself and returns None, so passing
    # it to display() only added a stray "None" after the report.
    geo_df.info()
    print('\ngeoMap describe:')
    # include='all' summarises non-numeric columns too; transposed so each
    # original column becomes one row of the summary table.
    display(geo_df.describe(include='all').T)
else:
    print('No geo dataframe to inspect.')

In [None]:
# Build `df`, the working copy of the geoMap data, with tidied column names.
if geo_df is None:
    print('No geo dataframe to clean.')
else:
    # Work on a copy so the raw `geo_df` stays untouched.
    df = geo_df.copy()
    # Basic cleaning: drop whitespace surrounding each column name
    df.columns = [label.strip() for label in df.columns]
    # Count missing values per column and report only columns that have any
    nulls = df.isnull().sum()
    print('\nNulls per column:')
    display(nulls.loc[nulls > 0])
    # Preview the cleaned frame
    display(df.head())

In [None]:
# Save one histogram PNG per numeric column of `df` into `output_dir`.
output_dir = '/mnt/data/images'
Path(output_dir).mkdir(parents=True, exist_ok=True)

if 'df' in globals():
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print('Numeric columns detected:', numeric_cols)
    # Create one plot per numeric column
    for col in numeric_cols:
        fig, ax = plt.subplots()
        try:
            df[col].dropna().hist(ax=ax)
            ax.set_title(f'Histogram of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Count')
            # A path separator inside the column name would make savefig
            # target a sub-folder that does not exist, so replace it. Names
            # without separators keep exactly the file name they had before.
            safe_col = str(col).replace(os.sep, '_')
            if os.altsep:
                safe_col = safe_col.replace(os.altsep, '_')
            fname = Path(output_dir)/f'hist_{safe_col}.png'
            fig.savefig(fname, bbox_inches='tight')
            print('Saved', fname)
        except Exception as e:
            # Deliberately best-effort: one column that cannot be plotted
            # must not stop the remaining columns from being saved.
            print('Could not plot', col, e)
        finally:
            # Always release the figure, including when plotting or saving
            # failed; otherwise every failed column leaks an open figure.
            plt.close(fig)
else:
    print('No dataframe "df" available for plotting.')

In [None]:
# Try to find any amazon toy CSV in /mnt/data
# Only regular files qualify, and the matches are sorted: iterdir() yields
# entries in arbitrary order, so picking element [0] of the unsorted list
# could load a different file from one run to the next.
amazon_csv_candidates = sorted(
    p for p in Path('/mnt/data').iterdir()
    if p.is_file() and p.suffix.lower() in ('.csv', '.txt') and 'amazon' in p.name.lower()
)
if amazon_csv_candidates:
    print('Found candidate files:', amazon_csv_candidates)
    amazon_path = str(amazon_csv_candidates[0])
    amazon_df = pd.read_csv(amazon_path, low_memory=False)
    print('\nLoaded', amazon_path, 'shape=', amazon_df.shape)
    display(amazon_df.head())
    # Basic cleaning: price column may be string
    if 'price' in amazon_df.columns:
        # Keep only digits and dots (drops currency symbols, commas, spaces).
        # Raw string: the previous '[^0-9\.]' relied on an invalid string
        # escape, which newer Python versions warn about.
        amazon_df['price_clean'] = amazon_df['price'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        # Anything that still is not a single number (empty text, or values
        # with more than one dot) becomes NaN instead of raising.
        amazon_df['price_clean'] = pd.to_numeric(amazon_df['price_clean'], errors='coerce')
        print('\nprice_clean stats:')
        display(amazon_df['price_clean'].describe())
    # Top manufacturers
    if 'manufacturer' in amazon_df.columns:
        print('\nTop manufacturers:')
        display(amazon_df['manufacturer'].value_counts().head(10))
else:
    print('No Amazon CSV file with "amazon" in the filename was found in /mnt/data. If you have the full dataset, upload it to /mnt/data and re-run this notebook.')

In [None]:
# Correlation heatmap of the numeric geoMap columns, saved under `output_dir`.
if 'df' not in globals():
    print('Dataframe df not available for correlation.')
else:
    num = df.select_dtypes(include=[np.number])
    if num.shape[1] < 2:
        # A correlation matrix needs at least two numeric columns.
        print('Not enough numeric columns for correlation heatmap.')
    else:
        corr = num.corr()
        tick_positions = range(len(corr.columns))

        fig, ax = plt.subplots()
        image = ax.imshow(corr, interpolation='nearest')
        fig.colorbar(image)
        # Label both axes with the column names; x labels are rotated so
        # they do not overlap.
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(corr.columns, rotation=90)
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(corr.columns)
        ax.set_title('Correlation matrix (geoMap numeric columns)')
        fig.tight_layout()

        heatpath = Path(output_dir)/'correlation_matrix.png'
        fig.savefig(heatpath, bbox_inches='tight')
        plt.close(fig)
        print('Saved correlation matrix to', heatpath)

In [None]:
# Peek at the metadata JSON: list fields (name and description) found under
# its 'recordSet' -> 'field' entries.
if meta is None:
    print('No metadata JSON loaded.')
else:
    try:
        record_sets = meta.get('recordSet', [])
        print('Number of record sets in metadata:', len(record_sets))
        # Only the first record set is sampled, and at most 20 of its fields.
        for record_set in record_sets[:1]:
            fields = record_set.get('field', [])
            print('\nExample fields from metadata:')
            for field in fields[:20]:
                print(f"- {field.get('name')} : {field.get('description')}")
    except Exception as e:
        # The JSON layout is not validated beforehand, so any structural
        # mismatch is reported here rather than aborting the notebook.
        print('Error reading metadata JSON structure:', e)

## Quick findings

- This notebook loaded the available uploaded files and produced basic EDA charts saved into `/mnt/data/images/`.
- If you upload the full Amazon toy CSV (`amazon_co-ecommerce_sample.csv` or similar) into `/mnt/data`, re-run the notebook to get richer analysis (price distribution across toys, top manufacturers, ratings impact, etc.).

### Next steps
1. Upload the Amazon CSV dataset to `/mnt/data` (the file name must contain 'amazon', in any letter case).
2. Re-run the notebook to create the detailed analysis and charts.
3. Use the saved images from `/mnt/data/images/` in your presentation slides.