In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from CIP_crawling import crawl_for_links, crawl_for_product_data, data_to_csv
# from clean_data import ...
from CIP_analysis import load_old, load_new, inspecting_outputs, onehot_encoding, price_efficiency, feature_rating, price_efficiency_features

ModuleNotFoundError: No module named 'distutils'

## Data Crawling

In [3]:
# Search-results page from which the product URLs are collected.
SEARCH_URL = "https://www.galaxus.ch/en/s2/producttype/robot-vacuum-cleaners-174?take=204"

# Testing cap on the number of product pages scraped. Scraping one product takes
# ~4 seconds, so scraping ~500 products takes about 40 minutes.
MAX_PRODUCTS = 10

# Step 1: collect the product URLs from the search-results page.
urls = crawl_for_links(url=SEARCH_URL)

# Step 2: scrape the (capped) list of product pages and keep the result in `data`.
data = crawl_for_product_data(urls[:MAX_PRODUCTS])

# Step 3: hand the scraped data to the CSV helper.
# NOTE(review): called with save=False - presumably nothing is written to disk here;
# confirm against data_to_csv in CIP_crawling.
data_to_csv(data, save=False)

NameError: name 'crawl_for_links' is not defined

## Data Cleaning

In [4]:
"""
This part of the notebook uses the clean_data.py module to process robot vacuum data.
The cell imports functions from clean_data.py and executes them to:
1. Clean the robot vacuum dataset
2. Create a detailed analysis report 
3. Output files ready for further analysis and plotting
"""

# Functions provided by the clean_data module
from clean_data import clean_data, generate_report, process_data, get_cleaned_data

# File names shared by the methods below
RAW_CSV = 'robot_vacuums.csv'
CLEANED_CSV = 'robot_vacuums_cleaned.csv'
REPORT_TXT = 'Vacuum robots info summary.txt'
RULE = "-" * 50

print("## Data Cleaning")
print(RULE)

# Method 1: the complete pipeline in a single call
print("\n1. Using process_data() for complete pipeline:")
original_df, cleaned_df, report = process_data(
    input_file=RAW_CSV,
    output_file=CLEANED_CSV,
    report_file=REPORT_TXT,
    verbose=True,
)

# Method 2: one function at a time; it uses the same input and output file names
# as Method 1, so the cleaned CSV and the report are written again.
print("\n2. Using individual functions step-by-step:")
print("Step 1: Cleaning the data")
df_original, df_cleaned = clean_data(input_file=RAW_CSV, output_file=CLEANED_CSV)

print("\nStep 2: Creating the report")
report_lines = generate_report(df_cleaned=df_cleaned, output_file=REPORT_TXT)

# Method 3: fetch the cleaned data directly for analysis
print("\n3. Quick access using get_cleaned_data():")
quick_df = get_cleaned_data(force_clean=False)
print(f"Shape of quickly accessed data: {quick_df.shape}")
print(f"Columns: {quick_df.columns[:5]}...")

# Closing pointer to the next notebook section, emitted one line per entry
print(
    "\n## Ready for Data Analysis",
    RULE,
    "The cleaned data is now ready for analysis!",
    "You can now proceed with the Data Analysis section using the following file:",
    "- robot_vacuums_cleaned.csv",
    "\nSample code for analysis:",
    "df_cleaned = pd.read_csv('robot_vacuums_cleaned.csv')",
    "# Perform your analysis here...",
    sep="\n",
)

## Data Cleaning
--------------------------------------------------

1. Using process_data() for complete pipeline:

Data completeness filtering:
Rows removed due to having more than 15 empty elements: 3
Rows remaining: 468

Data cleaning completed. Cleaned data saved to 'robot_vacuums_cleaned.csv'
Original shape: (471, 60)
Cleaned shape: (468, 25)

Data Statistics:
Number of unique manufacturers: 91
Price range: CHF100.00 - CHF2990.00
Average rating: 4.17
Average battery life: 169 minutes
Most common colours: {'Black': 170, 'White': 159, '': 21}
Battery capacity range: 1200 - 8000 mAh

Generating summary report...
Summary report has been created and saved as 'Vacuum robots info summary.txt'

2. Using individual functions step-by-step:
Step 1: Cleaning the data

Data completeness filtering:
Rows removed due to having more than 15 empty elements: 3
Rows remaining: 468

Data cleaning completed. Cleaned data saved to 'robot_vacuums_cleaned.csv'
Original shape: (471, 60)
Cleaned shape: (46

## EDA

In [5]:
"""
This part of the notebook uses Vacuum_EDA.py to produce a selection of plots.
It shows three different ways to use the Vacuum_EDA module:
1. Default execution - Running the full EDA pipeline with default parameters
2. Custom execution - Specifying custom input, output files and plots directory
3. Selective execution - Running only specific analysis functions

Usage:
    python execute_vacuum_eda.py
"""

import os
import sys
import pandas as pd
from Vacuum_EDA import (
    run_eda_analysis,
    add_derived_columns,
    plot_price_distribution,
    plot_country_distribution,
    plot_battery_vs_price
)

def print_section(title, width=80):
    """Print a section banner: blank line, rule, centered title, rule, blank line.

    Args:
        title: Text shown in the middle of the banner.
        width: Total banner width in characters (default 80). The two rules and
            the centered title line all use this width.
    """
    rule = "=" * width
    print("\n" + rule)
    # Pad the title to the same width as the rules so the banner edges line up.
    print(f" {title} ".center(width, "="))
    print(rule + "\n")

def main():
    """Run three example invocations of the Vacuum_EDA helpers.

    1. Default: ``run_eda_analysis()`` with no arguments.
    2. Custom: ``run_eda_analysis`` with an explicit input file, plots directory
       and report file.
    3. Selective: load the CSV, add derived columns and draw three individual plots
       into their own directory.

    Raises:
        SystemExit: with code 1 when 'robot_vacuums_cleaned.csv' does not exist.
    """
    # input file to use the cleaned data file
    input_file = 'robot_vacuums_cleaned.csv'
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please run clean_data.py first.")
        # NOTE(review): in a notebook kernel this surfaces as a SystemExit raised
        # from the cell rather than ending a process - confirm this is intended.
        sys.exit(1)

    # EXAMPLE 1: Default execution - simplest way to run the full analysis
    print_section("EXAMPLE 1: Default Execution")
    print("Running the complete EDA analysis with default parameters...\n")
    print("This will:")
    print(" - Use 'robot_vacuums_cleaned.csv' as input")
    print(" - Save plots to the 'plots' directory")
    print(" - Append EDA results to 'Vacuum robots info summary.txt'")
    print("\nExecuting...\n")

    # Default execution
    run_eda_analysis()

    # EXAMPLE 2: Custom execution - specifying parameters
    print_section("EXAMPLE 2: Custom Execution")
    print("Running the EDA analysis with custom parameters...\n")

    # Custom output locations passed to run_eda_analysis below
    custom_plots_dir = 'custom_plots'
    custom_report = 'Custom_EDA_Report.txt'

    print("This will:")
    print(f" - Use '{input_file}' as input")
    print(f" - Save plots to the '{custom_plots_dir}' directory")
    print(f" - Save report to '{custom_report}'")
    print("\nExecuting...\n")

    # Custom execution
    run_eda_analysis(
        input_file=input_file,
        plots_dir=custom_plots_dir,
        report_file=custom_report
    )

    # EXAMPLE 3: Selective execution - running only specific analyses
    print_section("EXAMPLE 3: Selective Execution")
    print("Running only selected analysis functions...\n")

    selective_plots_dir = 'selective_plots'
    # exist_ok=True creates the directory only when needed, without a separate
    # check-then-create step.
    os.makedirs(selective_plots_dir, exist_ok=True)

    # Load and prepare data
    print("Loading and preparing data...")
    df = pd.read_csv(input_file)
    df_enriched = add_derived_columns(df)

    # Run selected analyses
    print("Generating selected plots:")
    print(" - Price distribution")
    plot_price_distribution(df_enriched, save_dir=selective_plots_dir)

    print(" - Country distribution")
    plot_country_distribution(df_enriched, save_dir=selective_plots_dir)

    print(" - Battery capacity vs price")
    plot_battery_vs_price(df_enriched, save_dir=selective_plots_dir)

    print(f"\nSelective plots have been saved to the '{selective_plots_dir}' directory.")

# Entry-point guard: the output captured for this cell shows main() running when
# the cell is executed.
if __name__ == "__main__":
    main() 



Running the complete EDA analysis with default parameters...

This will:
 - Use 'robot_vacuums_cleaned.csv' as input
 - Save plots to the 'plots' directory
 - Append EDA results to 'Vacuum robots info summary.txt'

Executing...


Summary Statistics:

Price Categories Distribution:
price_category
Mid-range (CHF 200-500)    52.8%
Premium (CHF 500-1000)     26.5%
Budget (< CHF 200)         11.8%
Luxury (> CHF 1000)         9.0%
Name: proportion, dtype: object

Country of Origin Distribution:
country
Other          57.7%
China          29.7%
USA             8.8%
Netherlands     2.4%
South Korea     1.1%
Germany         0.4%
Name: proportion, dtype: object

Average Price by Country:
country
South Korea    1037.21
China           667.53
Germany         528.50
Other           496.72
USA             487.98
Netherlands     446.54
Name: price, dtype: float64

Battery Statistics:
Average Battery Capacity: 4038 mAh
Average Battery Life: 169 minutes

Detailed summary has been added to the report 

## Data Analysis

In [None]:
# Cleaned CSV read by the analysis.
CLEANED_CSV = "robot_vacuums_cleaned.csv"
# Number of top products requested from price_efficiency() for each category.
TOP_N = 5

# Read the cleaned data set
df_cleaned = load_new(new_csv=CLEANED_CSV, print_i=False)

# Build one-hot encodings from the features
df_onehot = onehot_encoding(df_cleaned, print_i=False)

# Price-efficiency of the products; reports the top-N products for each category
price_efficiency(df_onehot, top=TOP_N)

# Influence of features on product ratings, showing those with the lowest p-values
feature_rating(df_onehot)

# NOTE(review): the original comment here read only "Price" - presumably this relates
# individual features to price efficiency; confirm against CIP_analysis.
price_efficiency_features(df_onehot)