In [3]:
# Uncomment to install the project dependencies into the kernel's environment:
# %pip install -r requirements.txt

In [4]:
from CIP_crawling import crawl_for_links, crawl_for_product_data, data_to_csv
from clean_data import process_data
from Vacuum_EDA import eda_default_execution, eda_custom_execution, eda_selective_execution
from CIP_analysis import load_new, onehot_encoding, price_efficiency, feature_rating, price_efficiency_features

## Data Crawling

In [None]:
# Crawl configuration, defined once so both crawl calls share identical settings.
SEARCH_URL = "https://www.galaxus.ch/en/s2/producttype/robot-vacuum-cleaners-174?take=100"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Number of product URLs to scrape. Kept at 5 for quick test runs: scraping one
# product takes ~4 seconds, so scraping ~500 products takes about 40 minutes.
# Set to None to scrape every collected URL (urls[:None] is the full sequence).
MAX_PRODUCTS = 5

# Get product URLs from the search results page
urls = crawl_for_links(url=SEARCH_URL, user_agent=USER_AGENT)

# Scrape the product data for the selected URLs and keep the result in `data`
data = crawl_for_product_data(urls[:MAX_PRODUCTS], user_agent=USER_AGENT)

# Pass the scraped data to the CSV export helper.
# NOTE(review): save=False is passed here, which presumably skips writing the
# .csv file — confirm against CIP_crawling.data_to_csv before a full crawl.
data_to_csv(data, save=False)

## Data Cleaning

In [5]:
# Run the cleaning step on the scraped robot vacuum data.
# The file names are collected in one place and forwarded as keyword arguments:
# the input CSV, the cleaned output CSV, and the text summary report.
cleaning_files = {
    'input_file': 'robot_vacuums.csv',
    'output_file': 'robot_vacuums_cleaned.csv',
    'report_file': 'Vacuum robots info summary.txt',
}
df_original, df_cleaned, report = process_data(**cleaning_files)


Data completeness filtering:
Rows removed due to having more than 15 empty elements: 3
Rows remaining: 468

Data cleaning completed. Cleaned data saved to 'robot_vacuums_cleaned.csv'
Original shape: (471, 60)
Cleaned shape: (468, 25)

Data Statistics:
Number of unique manufacturers: 91
Price range: CHF100.00 - CHF2990.00
Average rating: 4.17
Average battery life: 169 minutes
Most common colours: {'Black': 170, 'White': 159, '': 21}
Battery capacity range: 1200 - 8000 mAh

Generating summary report...
Summary report has been created and saved as 'Vacuum robots info summary.txt'


## EDA

In [6]:
# Optional step: these calls run the different EDA variants to get an impression
# of the data. The rest of the workflow does not depend on them.
cleaned_csv = 'robot_vacuums_cleaned.csv'

# Full EDA pipeline with its default parameters
eda_default_execution(input_df=cleaned_csv, save_plot=False)

# Custom variant (called here with the same input file and plot setting as above)
eda_custom_execution(input_df=cleaned_csv, save_plot=False)

# Only specific analysis functions.
# Note: this function takes input_file, not input_df.
eda_selective_execution(input_file=cleaned_csv, save_plot=False)



Running the complete EDA analysis with default parameters...

This will:
 - Use 'robot_vacuums_cleaned.csv' as input
 - Save plots to the 'plots' directory
 - Append EDA results to 'Vacuum robots info summary.txt'

Executing...


Summary Statistics:

Price Categories Distribution:
price_category
Mid-range (CHF 200-500)    52.8%
Premium (CHF 500-1000)     26.5%
Budget (< CHF 200)         11.8%
Luxury (> CHF 1000)         9.0%
Name: proportion, dtype: object

Country of Origin Distribution:
country
Other          57.7%
China          29.7%
USA             8.8%
Netherlands     2.4%
South Korea     1.1%
Germany         0.4%
Name: proportion, dtype: object

Average Price by Country:
country
South Korea    1037.21
China           667.53
Germany         528.50
Other           496.72
USA             487.98
Netherlands     446.54
Name: price, dtype: float64

Battery Statistics:
Average Battery Capacity: 4038 mAh
Average Battery Life: 169 minutes

Detailed summary has been added to the report 

## Data Analysis

In [None]:
# Read the cleaned CSV file back in for the analysis stage
cleaned_csv_path = "robot_vacuums_cleaned.csv"
df_cleaned = load_new(new_csv=cleaned_csv_path, print_i=False)

# Build the one-hot encoded feature frame from the cleaned data
df_onehot = onehot_encoding(df_cleaned, print_i=False)

# Price-efficiency ranking: return the top X products for each category
ranking_options = {"top": 5, "price_rel": 50}
price_efficiency(df_onehot, **ranking_options)

# Influence of features on product ratings; shows those with the lowest p-values
feature_rating(df_onehot)

# Price and rating ranking per feature
price_efficiency_features(df_onehot, print_i=True)