In [None]:
import pandas as pd
import statsmodels.formula.api as smf

## Inspecting the Data

In [None]:
# Load the raw export and report, for every column, the percentage of rows
# whose value is missing.
df = pd.read_csv("robot_vacuums.csv")
shape = df.shape
print(shape)
n_rows = shape[0]
for column in df.columns:
    missing = df[column].isna().sum()
    share = round(missing * 100 / n_rows, 1)
    print(f"{share}% of {column} are NA-values")

In [None]:
# Load the cleaned data set and print the same per-column missing-value
# percentages, so the effect of the cleaning step can be compared.
df = pd.read_csv("robot_vacuums_cleaned.csv")
shape = df.shape
print(shape)
n_rows = shape[0]
for column in df.columns:
    missing = df[column].isna().sum()
    share = round(missing * 100 / n_rows, 1)
    print(f"{share}% of {column} are NA-values")

In [None]:
# Inspecting: print the first five values of each column of interest to see
# how the raw entries are formatted.
# Plain loops are used instead of list comprehensions so that no throw-away
# lists are built and the cell does not echo `[None, None, ...]` as its output.
preview_columns = [
    "features",
    "robot_type",
    "battery_life",
    "noise_level",
    "suction_power",
    "smart_home_ecosystem",
]
for position, column in enumerate(preview_columns):
    if position:
        # blank separator between two consecutive columns
        print("\n")
    for value in df[column].head(5):
        print(value)

## One-hot encoding of the features

In [None]:
df = pd.read_csv("robot_vacuums_cleaned.csv")

# Build one indicator column per entry of the "|"-separated `features` string.
# Rows without any feature information get the placeholder "missing", so they
# end up with their own indicator column and stay detectable.
df["features_clean"] = df["features"].fillna("missing")
onehot = df["features_clean"].str.get_dummies(sep="|")
df_onehot = pd.concat([df, onehot], axis=1)

# Spaces and hyphens in column names are replaced by underscores, because some
# libraries cannot work with such names.
df_onehot.columns = df_onehot.columns.str.replace(" ", "_").str.replace("-", "_")

# Print all column names separated by " + " on a single line.
print("".join(f"{titel} + " for titel in df_onehot.columns), end="")

## Performance and Price

In [None]:
print(df["robot_type"].value_counts(), "\n")

# Robot types in order of first appearance. Missing values are dropped because
# comparing against NaN never matches a row and would only print an empty ranking.
robot_type = df["robot_type"].dropna().unique()

# A spec that reaches the 80 % quantile of its robot type already earns the
# full sub-score of 1 (the ratios below are capped at 1).
quantile = 0.8

# Rank the products of the first three robot types by price efficiency.
for ro_type in robot_type[:3]:
    df_filtered = df[df["robot_type"] == ro_type].copy()

    # --- quality score ---
    # Higher-is-better specs: value relative to the quantile value, capped at 1.
    battery_qual = (df_filtered["battery_life"] / df_filtered["battery_life"].quantile(quantile)).clip(upper=1)
    suction_qual = (df_filtered["suction_power"] / df_filtered["suction_power"].quantile(quantile)).clip(upper=1)
    room_qual = (df_filtered["room_area"] / df_filtered["room_area"].quantile(quantile)).clip(upper=1)

    # Noise is scored in the opposite direction: the ratio is inverted and the
    # mirrored quantile is used, so the quietest products reach 1 and louder
    # ones score less.
    # NOTE(review): assumes noise_level is a loudness measure (e.g. dB) where
    # lower is better - confirm against the data source.
    noise_qual = (df_filtered["noise_level"].quantile(1 - quantile) / df_filtered["noise_level"]).clip(upper=1)

    # The quality score is the mean of the available sub-scores; missing
    # sub-scores are ignored (skipna) instead of invalidating the product.
    df_filtered["quality_score"] = pd.concat([battery_qual, noise_qual, suction_qual, room_qual], axis=1).mean(axis=1, skipna=True)

    # The factor 300 only rescales the result: many prices are around 1000, and
    # a score of roughly 0 - 100 is easier to read than a very small number.
    # Taking the fifth root of the price dampens its influence on the result;
    # most good products are still very cheap.
    df_filtered["price_efficiency"] = df_filtered["quality_score"] / df_filtered["price"]**(1/5) * 300

    print(f"for the {ro_type} the ranking is:")

    top10 = df_filtered.sort_values("price_efficiency", ascending=False).head(10)
    print(top10[["product_name", "price_efficiency", "price"]].round(2))
    print("\n")

## Feature rating comparison

In [None]:
# Keep only products with at least 10 ratings (about 200 remain, per the
# original note).
df_rating = df_onehot[df_onehot["rating_count"] >= 10]

# Explanatory variables of the rating regression: the numeric specs followed
# by the one-hot encoded feature flags.
rating_terms = [
    "battery_life",
    "noise_level",
    "suction_power",
    "room_area",
    "Area_cleaning",
    "Automatic_detergent_addition",
    "Automatic_dust_emptying",
    "Automatic_mop_pad_separation",
    "Automatic_power_adjustment",
    "Automatic_water_refill",
    "Automatic_water_regulation",
    "Base_station",
    "Camera_function",
    "Camera_based_navigation",
    "Carpet_detection",
    "Configurable_cleaning_programmes",
    "Extendable_side_brushes",
    "Extendable_wiping_pads",
    "Fixed_water_connection",
    "Independent_emptying",
    "Infrared_sensor",
    "Laser_navigation",
    "Obstacle_detector",
    "Programmable_cleaning_schedules",
    "Programmable_room_boundary",
    "Removable_water_tank",
    "Self_cleaning_mop_pads",
    "Staircase_safe",
]

# Ordinary least squares fit of the rating on all terms above.
model = smf.ols(formula="rating ~ " + " + ".join(rating_terms), data=df_rating).fit()

# Extract the results: one row per model term.
results_df = pd.DataFrame({
    "coef": model.params,
    "std_err": model.bse,
    "t": model.tvalues,
    "p_value": model.pvalues
})

# Sort by p-value, smallest first.
sorted_results = results_df.sort_values("p_value")

# Display only the terms whose p-value is at most 0.1.
print("top features with influence on the ratings with p < 0,1")
significant = sorted_results.loc[sorted_results["p_value"].le(0.1)]
print(significant.round(3))

In [None]:
# Print the complete summary of the fitted rating model `model`.
print(model.summary())

## Price per Feature

In [None]:
import numpy as np

In [None]:
# Restrict to products with at least 10 ratings (about 200 remain, per the
# original note) and show the price distribution before any price filtering.
df_price = df_onehot.loc[df_onehot["rating_count"].ge(10)]
price_before = df_price["price"]
print("sample size before cleaning ", len(df_price))
print("price max ", price_before.max())
print("price mean ", price_before.mean())
print("price median ", price_before.median())

# A few products are really expensive; everything priced above 2000 is removed.
df_price = df_price.loc[df_price["price"].le(2000)]
price_after = df_price["price"]
print("sample size after cleaning ", len(df_price))
print("price mean ", price_after.mean())
print("price median ", price_after.median())

In [None]:
# Right-hand side shared by both regressions: the numeric specs followed by
# the one-hot encoded feature flags.
formula = (
    "battery_life + noise_level + suction_power + room_area + "
    "Area_cleaning + Automatic_detergent_addition + Automatic_dust_emptying + "
    "Automatic_mop_pad_separation + Automatic_power_adjustment + Automatic_water_refill + "
    "Automatic_water_regulation + Base_station + Camera_function + Camera_based_navigation + "
    "Carpet_detection + Configurable_cleaning_programmes + Extendable_side_brushes + "
    "Extendable_wiping_pads + Fixed_water_connection + Independent_emptying + "
    "Infrared_sensor + Laser_navigation + Obstacle_detector + Programmable_cleaning_schedules + "
    "Programmable_room_boundary + Removable_water_tank + Self_cleaning_mop_pads + Staircase_safe")


def _min_max_normalize(scores):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = scores.min()
    return (scores - lowest) / (scores.max() - lowest)


# PRICE MODEL
model_price = smf.ols(formula=f"price ~ {formula}", data=df_price).fit()
results_price = pd.DataFrame({
    "coef_price": model_price.params,
    "p_price": model_price.pvalues})

# RATING MODEL
model_rating = smf.ols(formula=f"rating ~ {formula}", data=df_price).fit()
results_rating = pd.DataFrame({
    "coef_rating": model_rating.params,
    "p_rating": model_rating.pvalues})

# Combine and drop intercept
results = results_price.join(results_rating, how="inner")
results = results.drop(index="Intercept")

# Significance weight -log10(p): the smaller the p-value, the larger the weight.
# P-values are floored at the smallest positive float so that a p-value that
# underflows to 0 cannot yield an infinite score, which would turn the whole
# min-max normalisation below into NaN.
smallest_p = np.finfo(float).tiny
weight_price = -np.log10(results["p_price"].clip(lower=smallest_p))
weight_rating = -np.log10(results["p_rating"].clip(lower=smallest_p))

# Compute scores
# The price coefficient is negated on purpose: a feature associated with a
# LOWER price gets a HIGHER price score, while a feature associated with a
# higher rating gets a higher rating score.
results["score_price"] = -results["coef_price"] * weight_price
results["score_rating"] = results["coef_rating"] * weight_rating

# Normalize scores between 0 and 1
results["score_price_norm"] = _min_max_normalize(results["score_price"])
results["score_rating_norm"] = _min_max_normalize(results["score_rating"])

# Combined score (average)
results["combined_score"] = (results["score_price_norm"] + results["score_rating_norm"]) / 2

# Sort by combined_score
results_sorted = results.sort_values("combined_score", ascending=False)

print("\n📊 Top features by combined influence on price and rating:\n")
print(results_sorted[["combined_score", "coef_price", "coef_rating"]].head(10).round(3))

In [None]:
# Print the full regression tables of the two models fitted in this section
# (`model_price` and `model_rating`). Printing `model` here would only repeat
# the summary of the earlier rating regression.
print(model_price.summary())
print(model_rating.summary())