In [1]:
import pandas as pd

# Path to the raw LEGO sets CSV, relative to the notebook's working directory
file_path = 'dataset/sets.csv'

# Keep the raw frame untouched so the before/after shapes can be compared below
df = pd.read_csv(file_path, sep=',')

# Raise pandas' display limits; frames longer than 3000 rows are still rendered truncated
pd.set_option('display.max_rows', 3000)
pd.set_option('display.max_columns', 20)

# Report how many exact duplicate rows exist (the count used to be computed and
# thrown away), then drop them by reassignment instead of in-place mutation so
# that re-running the cell always starts from the raw frame.
duplicate_count = df.duplicated().sum()
print("Duplicate rows removed:", duplicate_count)
df_cleaned = df.drop_duplicates().copy()

# Fill missing USD_MSRP values with the median of the de-duplicated data, so that
# repeated rows cannot skew the fill value. The print shows how many NaNs remain.
msrp_median = df_cleaned['USD_MSRP'].median()
df_cleaned['USD_MSRP'] = df_cleaned['USD_MSRP'].fillna(msrp_median)
print("Missing values in USD_MSRP:", df_cleaned['USD_MSRP'].isna().sum())

# Rows whose name contains "Key Chain" (case-insensitive) and that have no
# minifigure count get Minifigures=1 and Pieces=3. Note that Pieces is
# overwritten on these rows even if it already held a value.
is_unlabelled_key_chain = (
    df_cleaned['Name'].str.contains('Key Chain', case=False, na=False)
    & df_cleaned['Minifigures'].isna()
)
df_cleaned.loc[is_unlabelled_key_chain, ['Minifigures', 'Pieces']] = [1, 3]

# Compare shapes of the raw and cleaned frames
print("Before cleaning:", df.shape)
print("After cleaning:", df_cleaned.shape)

# Rich display of the cleaned frame (truncated by pandas when it exceeds max_rows)
display(df_cleaned)

Missing values in USD_MSRP: 0
Before cleaning: (14936, 17)
After cleaning: (14936, 17)


Unnamed: 0,Set_ID,Name,Year,Theme,Theme_Group,Subtheme,Category,Packaging,Num_Instructions,Availability,Pieces,Minifigures,Owned,Rating,USD_MSRP,Total_Quantity,Current_Price
0,75-1,PreSchool Set,1975,PreSchool,Pre-school,,Normal,{Not specified},0,{Not specified},16.0,,10.0,0.0,19.99,,
1,77-1,PreSchool Set,1975,PreSchool,Pre-school,,Normal,{Not specified},0,{Not specified},20.0,,11.0,0.0,19.99,,
2,077-1,Pre-School Set,1975,Duplo,Pre-school,,Normal,{Not specified},0,{Not specified},21.0,,10.0,0.0,19.99,0.0,
3,78-1,PreSchool Set,1975,PreSchool,Pre-school,,Normal,{Not specified},0,{Not specified},32.0,,8.0,0.0,19.99,,
4,78-3,Basic Set,1975,Samsonite,Vintage,Basic set,Normal,Box,0,{Not specified},330.0,,10.0,0.0,19.99,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14931,854242-1,Creeper Key Chain,2023,Gear,Miscellaneous,Key Chains/Minecraft,Gear,Tag,0,LEGO exclusive,3.0,1.0,2.0,0.0,5.99,,
14932,854243-1,Steve Key Chain,2023,Gear,Miscellaneous,Key Chains/Minecraft,Gear,Tag,0,LEGO exclusive,3.0,1.0,2.0,0.0,5.99,,
14933,854244-1,Piglin Key Chain,2023,Gear,Miscellaneous,Key Chains/Minecraft,Gear,Tag,0,LEGO exclusive,3.0,1.0,2.0,0.0,5.99,,
14934,854245-1,Fennec Shand Key Chain,2023,Gear,Miscellaneous,Key Chains/Star Wars,Gear,Tag,0,LEGO exclusive,3.0,1.0,88.0,0.0,5.99,,


In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Model-based imputation of USD_MSRP from Year / Pieces / Minifigures.
# The forest is only trained when there is actually something to predict;
# previously it was fitted unconditionally and its hold-out split was never used.
msrp_features = ['Year', 'Pieces', 'Minifigures']
missing_msrp = df_cleaned['USD_MSRP'].isna()

df_known = df_cleaned[~missing_msrp]   # Rows with a price
df_unknown = df_cleaned[missing_msrp]  # Rows with missing price
X_unknown = df_unknown[msrp_features]

if not X_unknown.empty:
    X_known = df_known[msrp_features]
    y_known = df_known['USD_MSRP']

    # Same split and seeds as before, so the fitted model (and its predictions) are unchanged
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_known, y_known, test_size=0.2, random_state=42
    )

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Give the hold-out set a purpose: report how trustworthy the imputed prices are
    print("Hold-out R2 of the USD_MSRP imputation model:", model.score(X_valid, y_valid))

    # The boolean mask selects rows in the same order as X_unknown, so the
    # predictions line up positionally with the rows being filled
    df_cleaned.loc[missing_msrp, 'USD_MSRP'] = model.predict(X_unknown)
else:
    print("No missing USD_MSRP values to predict.")

# Persist the cleaned frame (written regardless of whether imputation ran)
df_cleaned.to_csv(r'dataset/cleaned_lego_dataset.csv', index=False)

No missing USD_MSRP values to predict.


In [3]:
# Imported here so the cell does not rely on another cell having imported it
from sklearn.model_selection import train_test_split

# Fill missing Current_Price values with the same row's USD_MSRP
df_cleaned['Current_Price'] = df_cleaned['Current_Price'].fillna(df_cleaned['USD_MSRP'])

# The prediction target is Current_Price (not USD_MSRP).
# NOTE(review): USD_MSRP is carried in X, but the preprocessing defined later in this
# notebook only selects Year, Pieces, Minifigures, Theme and Subtheme, so it will not
# reach the models unless that column list is extended.
X = df_cleaned[['Year', 'Pieces', 'Minifigures', 'Theme', 'Subtheme', 'USD_MSRP']]
y = df_cleaned['Current_Price']

# Hold out 15% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [4]:
# Inspect the training target (the Current_Price series returned by the train/test split)
print(y_train)

1157      19.9900
1383      19.9900
11287     19.9900
5151      50.0000
13266     22.4108
           ...   
5191      15.0000
13418    137.7900
5390      19.9900
860       19.9900
7270      19.9900
Name: Current_Price, Length: 12695, dtype: float64


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Column groups routed to the two preprocessing branches
numeric_features = ['Year', 'Pieces', 'Minifigures']
categorical_features = ['Theme', 'Subtheme']

# Numeric branch: fill gaps with the column mean, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
# (categories unseen during fitting are ignored at prediction time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its branch; columns not listed are not passed on
branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=branches)

# Candidate regressors, keyed by the label used in the printed report
models = {}
models['Linear Regression'] = LinearRegression()
models['Decision Tree'] = DecisionTreeRegressor(random_state=42)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit every candidate on the training split and report RMSE / R2 on the test split
for name in models:
    regressor = models[name]
    fitted = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])
    fitted.fit(X_train, y_train)

    y_pred = fitted.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} RMSE: {rmse}")
    print(f"{name} R2: {r2}\n")

Linear Regression RMSE: 186.48591717030783
Linear Regression R2: 0.11535731405756566

Decision Tree RMSE: 194.9461704087183
Decision Tree R2: 0.033269933530921225

Random Forest RMSE: 181.57096438121306
Random Forest R2: 0.16137344521523322



In [6]:
# Single-row feature frame describing set 60023 (LEGO City Starter Set)
city_starter_record = {
    'Year': 2013,
    'Pieces': 272,
    'Minifigures': 5,
    'Theme': 'City',
    'Subtheme': 'Traffic',
}
city_starter_features = pd.DataFrame([city_starter_record])

# Reference figure to compare the model outputs against
print("Current average 60023 LEGO City Starter Set price on ebay is ~$60")

# Refit each candidate on the training split and print its estimate for this set
for name in models:
    steps = [('preprocessor', preprocessor), ('regressor', models[name])]
    pipeline = Pipeline(steps=steps).fit(X_train, y_train)
    city_starter_price = pipeline.predict(city_starter_features)[0]
    print(f"Predicted price for 60023 LEGO City Starter Set using {name}: {city_starter_price}")

Current average 60023 LEGO City Starter Set price on ebay is ~$60
Predicted price for 60023 LEGO City Starter Set using Linear Regression: 64.12248913338794
Predicted price for 60023 LEGO City Starter Set using Decision Tree: 49.99
Predicted price for 60023 LEGO City Starter Set using Random Forest: 52.3594719999999


In [None]:
# Sample LEGO sets
lego_sets = [
    {'Set_ID': '2865-1', 'Name': "Children's Zoo", 'Year': 1998, 'Pieces': 25, 'Minifigures': 2, 'Theme': 'Duplo', 'Subtheme': 'Pre-school'},
    {'Set_ID': '2872-1', 'Name': 'Witch and Fireplace', 'Year': 1997, 'Pieces': 19, 'Minifigures': 1, 'Theme': 'Castle', 'Subtheme': 'Fright Knights'},
    {'Set_ID': '7094-1', 'Name': "King's Castle Siege", 'Year': 2007, 'Pieces': 973, 'Minifigures': 10, 'Theme': 'Castle', 'Subtheme': 'Fantasy Era'},
    {'Set_ID': '40334-1', 'Name': 'Avengers Tower', 'Year': 2019, 'Pieces': 211, 'Minifigures': 1, 'Theme': 'Marvel Super Heroes', 'Subtheme': 'The Avengers'}
]

# Convert to DataFrame
lego_df = pd.DataFrame(lego_sets)

# Columns handed to the models (Set_ID and Name are for display only)
feature_columns = ['Year', 'Pieces', 'Minifigures', 'Theme', 'Subtheme']

# Fit each model ONCE and predict all sample sets in a single call.
# Fitting does not depend on the set being priced, so refitting inside the
# per-set loop (as before) repeated identical work for every row.
predicted_prices = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
    pipeline.fit(X_train, y_train)
    predicted_prices[name] = pipeline.predict(lego_df[feature_columns])

# Print one report block per set, listing every model's estimate
for position, (_, row) in enumerate(lego_df.iterrows()):
    print(f"\n{'='*50}")
    print(f"Set: {row['Set_ID']} - {row['Name']}")
    print(f"Year: {row['Year']}, Theme: {row['Theme']}, Subtheme: {row['Subtheme']}")
    print(f"Pieces: {row['Pieces']}, Minifigures: {row['Minifigures']}")

    for name, prices in predicted_prices.items():
        predicted_price = prices[position]
        print(f"Predicted price using {name}: ${predicted_price:.2f}")
    print(f"{'='*50}\n")