In [None]:
# Optional: uncomment to install the project dependencies into the kernel's environment.
# %pip install -r requirements.txt

In [None]:
from CIP_crawling import crawl_for_links, crawl_for_product_data, data_to_csv
from clean_data import clean_data, generate_report, process_data, get_cleaned_data
from Vacuum_EDA import eda_default_execution, eda_selective_execution, eda_custom_execution
from CIP_analysis import load_old, load_new, inspecting_outputs, onehot_encoding, price_efficiency, feature_rating, price_efficiency_features

## Data Crawling

In [None]:
# Collect the product URLs from the Galaxus robot-vacuum search results page.
urls = crawl_for_links(url="https://www.galaxus.ch/en/s2/producttype/robot-vacuum-cleaners-174?take=204")

# Scrape the product data for the collected URLs.
# The [:5] slice limits the run to the first five products and is for testing
# only; pass the full `urls` list for a complete crawl. Scraping one product
# takes ~4 seconds, so ~500 products take roughly 30-40 minutes.
data = crawl_for_product_data(urls[:5])

# Hand the scraped data to the CSV export helper.
# NOTE(review): save=False is passed here — presumably this skips writing the
# .csv file; confirm against data_to_csv in CIP_crawling.
data_to_csv(data, save=False)

## Data Cleaning

In [3]:
# Process and clean the robot vacuum data.
# process_data is already imported in the notebook's import cell at the top,
# so it is not re-imported here (keeps all imports in one place).
#
# Inputs/outputs of this call:
#   input_file  - raw scraped data to read
#   output_file - destination of the cleaned data
#   report_file - destination of the plain-text summary report
# The call returns the original data, the cleaned data and the report; the
# captured output of this cell shows the row filtering, the resulting shapes
# and some summary statistics.
df_original, df_cleaned, report = process_data(
    input_file='robot_vacuums.csv',
    output_file='robot_vacuums_cleaned.csv',
    report_file='Vacuum robots info summary.txt'
)


Data completeness filtering:
Rows removed due to having more than 15 empty elements: 3
Rows remaining: 468

Data cleaning completed. Cleaned data saved to 'robot_vacuums_cleaned.csv'
Original shape: (471, 60)
Cleaned shape: (468, 25)

Data Statistics:
Number of unique manufacturers: 91
Price range: CHF100.00 - CHF2990.00
Average rating: 4.17
Average battery life: 169 minutes
Most common colours: {'Black': 170, 'White': 159, '': 21}
Battery capacity range: 1200 - 8000 mAh

Generating summary report...
Summary report has been created and saved as 'Vacuum robots info summary.txt'


## EDA

In [None]:
# Optional step: these functions offer different variants of exploratory
# analysis to get an impression of the data.
# They are not mandatory for the rest of the process.

# Run the full EDA pipeline with default parameters
eda_default_execution(input_df='robot_vacuums_cleaned.csv', save=False)

# Custom variant of the EDA.
# NOTE(review): only input_df and save are passed here, so no custom output
# file or plots directory is actually specified in this call — check
# eda_custom_execution in Vacuum_EDA for the available options.
eda_custom_execution(input_df='robot_vacuums_cleaned.csv', save=False)

# Run only specific analysis functions
eda_selective_execution(input_df='robot_vacuums_cleaned.csv', save=False)

## Data Analysis

In [None]:
# Load the cleaned CSV file.
# Note: this rebinds df_cleaned, replacing the value returned by process_data
# in the Data Cleaning section with the data read back from disk.
df_cleaned = load_new(new_csv="robot_vacuums_cleaned.csv", print_i=False)

# Create one-hot encodings from the features
df_onehot = onehot_encoding(df_cleaned, print_i=False)

# Calculate the price-efficiency of the products and return the top 5
# products for each category (controlled by `top`)
price_efficiency(df_onehot, top=5)

# Analyze the influence of features on product ratings and display those with the lowest p-values
feature_rating(df_onehot)

# Price and rating per feature ranking (print_i=True, unlike the calls above
# which pass print_i=False)
price_efficiency_features(df_onehot, print_i=True)