In [None]:
pip install -r requirements.txt

In [None]:
from CIP_crawling import crawl_for_links, crawl_for_product_data, data_to_csv
from clean_data import process_data
from Vacuum_EDA import eda_default_execution, eda_custom_execution, eda_selective_execution
from CIP_analysis import load_new, onehot_encoding, price_efficiency, feature_rating, price_efficiency_features

## Data Crawling

In [None]:
# Crawl configuration, kept in one place so both crawler calls always use the same values.
SEARCH_URL = "https://www.galaxus.ch/en/s2/producttype/robot-vacuum-cleaners-174?take=100"
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
# Number of product URLs to scrape. 5 is for testing only; set to None to scrape every URL.
# Scraping one product takes approximately 4 seconds, so around 500 products take roughly 35-40 minutes.
MAX_PRODUCTS = 5

# Get product URLs from the search results page
urls = crawl_for_links(url=SEARCH_URL, user_agent=USER_AGENT)

# Scrape the product data for the selected URLs (urls[:None] selects the full list)
data = crawl_for_product_data(urls[:MAX_PRODUCTS], user_agent=USER_AGENT)

# Pass the scraped data to the CSV export helper.
# NOTE(review): save=False is passed here, while the cleaning step reads 'robot_vacuums.csv' —
# confirm whether this call is expected to write that file.
data_to_csv(data, save=False)

## Data Cleaning

In [None]:
# File names handed to the cleaning step: the raw input CSV, the cleaned output CSV,
# and the text report.
cleaning_paths = {
    'input_file': 'robot_vacuums.csv',
    'output_file': 'robot_vacuums_cleaned.csv',
    'report_file': 'Vacuum robots info summary.txt',
}

# Process and clean the robot vacuum data; the call yields three results that are
# unpacked into the original data, the cleaned data and the report.
df_original, df_cleaned, report = process_data(**cleaning_paths)

## Exploratory Data Analysis (EDA)

In [None]:
# Optional exploratory step: the three calls below offer different ways of evaluating
# the cleaned data and are not required by the rest of the pipeline.

# All three EDA entry points read the same cleaned CSV.
eda_source = 'robot_vacuums_cleaned.csv'

# Full EDA pipeline with default parameters
eda_default_execution(input_df=eda_source, save_plot=False)

# Custom EDA entry point; here it receives only the input file and save_plot,
# no custom output file or plot directory is passed.
eda_custom_execution(input_df=eda_source, save_plot=False)

# Selective EDA entry point.
# Note: this function takes the path as input_file, not input_df.
eda_selective_execution(input_file=eda_source, save_plot=False)


## Data Analysis

In [None]:
# Options for the price-efficiency ranking: how many top products to return per
# category (top) and the price_rel setting forwarded to price_efficiency.
ranking_options = {"top": 5, "price_rel": 50}

# Load the cleaned CSV file.
# NOTE(review): this rebinds df_cleaned, which the cleaning cell also assigns.
df_cleaned = load_new(new_csv="robot_vacuums_cleaned.csv", print_i=False)

# Derive one-hot encoded feature columns from the loaded data
df_onehot = onehot_encoding(df_cleaned, print_i=False)

# NOTE(review): the return values of the three calls below are not captured; the notebook
# only displays the value of the last expression — confirm the helpers print their results.

# Price-efficiency of the products, top products per category
price_efficiency(df_onehot, **ranking_options)

# Influence of features on product ratings (features with the lowest p-values)
feature_rating(df_onehot)

# Feature ranking based on price and rating
price_efficiency_features(df_onehot, print_i=True)