In [None]:
# Uncomment and run once to install the project dependencies into the kernel's environment:
# %pip install -r requirements.txt

In [None]:
from CIP_crawling import crawl_for_links, crawl_for_product_data, data_to_csv
from clean_data import clean_data, generate_report, process_data, get_cleaned_data
from Vacuum_EDA import eda_default_execution, eda_selective_execution, eda_custom_execution
from CIP_analysis import load_old, load_new, inspecting_outputs, onehot_encoding, price_efficiency, feature_rating, price_efficiency_features

## Data Crawling

In [None]:
# Crawl configuration, hoisted out of the calls so that a full run only
# requires editing these two values.
SEARCH_URL = "https://www.galaxus.ch/en/s2/producttype/robot-vacuum-cleaners-174?take=204"
# Number of product URLs to scrape. 10 is for testing only; set to None to
# scrape every collected URL (urls[:None] is the whole list). Scraping one
# product takes ~4 seconds, so scraping ~500 products takes about 40 minutes
# (author's estimate).
MAX_PRODUCTS = 10

# Get product URLs from the search results page
urls = crawl_for_links(url=SEARCH_URL)

# Scrape the product data for the selected URLs
data = crawl_for_product_data(urls[:MAX_PRODUCTS])

# Hand the scraped data to the CSV export.
# NOTE(review): save=False is passed, so presumably no file is written during
# a truncated test crawl - confirm against data_to_csv and use save=True for a
# full crawl whose result should feed the cleaning stage.
data_to_csv(data, save=False)

## Data Cleaning

In [None]:
# Data cleaning stage, driven by the functions imported from clean_data.py.
# According to the original notes, this cell is meant to:
#   1. clean the robot vacuum dataset,
#   2. create a detailed analysis report,
#   3. output files ready for further analysis and plotting.
# Three usage variants are shown; variants 1 and 2 receive the same file paths.

# File paths shared by the calls below.
RAW_CSV = 'robot_vacuums.csv'
CLEANED_CSV = 'robot_vacuums_cleaned.csv'
REPORT_TXT = 'Vacuum robots info summary.txt'
SEPARATOR = "-" * 50

print("## Data Cleaning")
print(SEPARATOR)

# Variant 1: the complete pipeline in a single call
print("\n1. Using process_data() for complete pipeline:")
original_df, cleaned_df, report = process_data(
    input_file=RAW_CSV,
    output_file=CLEANED_CSV,
    report_file=REPORT_TXT,
    verbose=True,
)

# Variant 2: the individual functions, one step at a time
print("\n2. Using individual functions step-by-step:")
print("Step 1: Cleaning the data")
df_original, df_cleaned = clean_data(input_file=RAW_CSV, output_file=CLEANED_CSV)

print("\nStep 2: Creating the report")
report_lines = generate_report(df_cleaned=df_cleaned, output_file=REPORT_TXT)

# Variant 3: quick access to the cleaned data for analysis
print("\n3. Quick access using get_cleaned_data():")
quick_df = get_cleaned_data(force_clean=False)
print(f"Shape of quickly accessed data: {quick_df.shape}")
print(f"Columns: {quick_df.columns[:5]}...")

# Closing hint for the reader, emitted line by line exactly as before.
print(
    "\n## Ready for Data Analysis",
    SEPARATOR,
    "The cleaned data is now ready for analysis!",
    "You can now proceed with the Data Analysis section using the following file:",
    "- robot_vacuums_cleaned.csv",
    "\nSample code for analysis:",
    "df_cleaned = pd.read_csv('robot_vacuums_cleaned.csv')",
    "# Perform your analysis here...",
    sep="\n",
)

## EDA

In [None]:
# The three entry points below (imported from Vacuum_EDA) run different
# variants of the exploratory analysis to get an impression of the data.

# Running the full EDA pipeline with default parameters
eda_default_execution()

# Variant with custom input/output files and plots directory.
# NOTE(review): no arguments are passed here, so the custom paths are
# presumably defined inside Vacuum_EDA - confirm.
eda_custom_execution()

# Running only specific analysis functions.
# NOTE(review): which functions are selected is not visible from this cell.
eda_selective_execution()

## Data Analysis

In [None]:
# Load the cleaned CSV file
# NOTE(review): print_i=False presumably suppresses informational output - confirm in CIP_analysis.
df_cleaned = load_new(new_csv="robot_vacuums_cleaned.csv", print_i=False)

# Create one-hot encodings from the features
df_onehot = onehot_encoding(df_cleaned, print_i=False)

# Calculate the price-efficiency of the products and return the top X products for each category
# (X is set by `top`, here 5)
price_efficiency(df_onehot, top=5)

# Analyze the influence of features on product ratings and display those with the lowest p-values
feature_rating(df_onehot)

# Price efficiency in relation to the product features.
# NOTE(review): the original comment only said "Price"; the description is
# inferred from the function name - confirm in CIP_analysis.
price_efficiency_features(df_onehot)