In [2]:
# STEP 1: Upload the archive through the Colab file picker, then unpack it
from google.colab import files
uploaded = files.upload()  # pick 'datasheet.zip' in the dialog

import zipfile, os
# ZipFile opens in read mode by default; everything is extracted under ./bluebook_data
with zipfile.ZipFile("datasheet.zip") as archive:
    archive.extractall(path="bluebook_data")

# STEP 2: Check files (optional)
# Print the path of every file found under the extraction directory.
# The loop variables are deliberately NOT named `files` / `file`: `files` is the
# google.colab module imported in STEP 1, and rebinding it to a list of names
# here would make any later files.upload() / files.download() call fail.
for dirpath, _dirnames, filenames in os.walk("bluebook_data"):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

# STEP 3: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# STEP 4: Read the training and test tables from the extracted archive
train = pd.read_csv('/content/bluebook_data/bluebook-for-bulldozers/Train.csv', low_memory=False)
test = pd.read_csv('/content/bluebook_data/bluebook-for-bulldozers/Test.csv', low_memory=False)

# STEP 5: Discard every column that is more than half empty in the training set
null_share = train.isna().mean()  # fraction of missing values per column
drop_cols = train.columns[null_share > 0.5]
train = train.drop(columns=drop_cols)
# errors='ignore' tolerates columns that exist in train but not in test
test = test.drop(columns=drop_cols, errors='ignore')

# STEP 6: Fill remaining nulls
def _fill_value(series):
    """Return the value used to impute `series`.

    Object (text) columns use their most frequent value, all other columns
    use their median. Returns None for an object column with no non-null
    values, so the caller can skip it instead of indexing into the empty
    result of mode().
    """
    if series.dtype == 'object':
        modes = series.mode()
        return None if modes.empty else modes.iloc[0]
    return series.median()


# Imputation values are learned from the training data only and then reused
# for the test data, so the same raw missing value is encoded identically in
# both frames (the model is fitted on the training encoding).
fill_values = {col: _fill_value(train[col]) for col in train.columns}

for col, value in fill_values.items():
    if value is not None:
        train[col] = train[col].fillna(value)

for col in test.columns:
    value = fill_values.get(col)
    if value is None:
        # Column absent from train (or no usable training value):
        # fall back to the test column's own statistic.
        value = _fill_value(test[col])
    if value is not None:
        test[col] = test[col].fillna(value)

# STEP 7: Replace the raw 'saledate' column with numeric year / month features
# The same three-step transformation is applied to both frames, in place.
for frame in (train, test):
    sale_dates = pd.to_datetime(frame['saledate'])
    frame['saledate'] = sale_dates
    frame['saleYear'] = sale_dates.dt.year
    frame['saleMonth'] = sale_dates.dt.month
    # the parsed date itself is not kept as a feature
    frame.drop(columns='saledate', inplace=True)

# STEP 8: Label encoding
# Every object column of `train` is replaced by integer codes. The encoder is
# fitted on the union of train and test values so that categories appearing
# only in the test set still receive a code instead of raising in transform().
cat_cols = train.select_dtypes(include='object').columns

for col in cat_cols:
    # Decide up front whether the test frame has this column; the check must
    # happen before test[col] is touched, otherwise a missing column raises
    # KeyError and the guard never gets a chance to run.
    in_test = col in test.columns

    train_vals = train[col].astype(str)
    test_vals = test[col].astype(str) if in_test else None

    le = LabelEncoder()
    if in_test:
        le.fit(pd.concat([train_vals, test_vals], axis=0))
    else:
        le.fit(train_vals)

    train[col] = le.transform(train_vals)
    if in_test:
        test[col] = le.transform(test_vals)

# STEP 9: Separate the target from the features and carve out a validation split
y = train['SalePrice']
X = train.drop(columns='SalePrice')
X_test = test[X.columns]  # same feature columns, in the same order as X

# 80 % for fitting, 20 % held out for validation; fixed seed for repeatability
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# STEP 10: Fit a 100-tree random forest and score it on the validation split
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)
rf_msle = mean_squared_log_error(y_val, rf_preds)
rf_rmsle = np.sqrt(rf_msle)  # RMSLE is the square root of the mean squared log error
print(f"🌲 Random Forest RMSLE: {rf_rmsle:.4f}")

# STEP 11: Fit an ordinary least-squares baseline and score it the same way
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# A linear model can predict negative prices, which the log-based metric
# rejects, so negative predictions are floored at zero before scoring.
lr_preds = lr_model.predict(X_val).clip(min=0)
lr_msle = mean_squared_log_error(y_val, lr_preds)
lr_rmsle = np.sqrt(lr_msle)
print(f"📈 Linear Regression RMSLE: {lr_rmsle:.4f}")

# STEP 12: Keep whichever model has the strictly lower validation RMSLE
# (on a tie, or if the comparison is False for any reason, linear regression wins)
if rf_rmsle < lr_rmsle:
    best_model, best_name = rf_model, "Random Forest"
else:
    best_model, best_name = lr_model, "Linear Regression"
print("✅ Best model selected:", best_name)

# STEP 13: Score the test set with the selected model
# Predictions are floored at zero so no negative price is ever written out.
test_preds = best_model.predict(X_test).clip(min=0)

# STEP 14: Write the two-column submission file (SalesID, SalePrice)
submission = test[['SalesID']].assign(SalePrice=test_preds)
submission.to_csv('/content/test_predictions.csv', index=False)
print("📁 Final submission saved as test_predictions.csv")


Saving datasheet.zip to datasheet.zip
bluebook_data/bluebook-for-bulldozers/ValidSolution.csv
bluebook_data/bluebook-for-bulldozers/Valid.zip
bluebook_data/bluebook-for-bulldozers/test_predictions.csv
bluebook_data/bluebook-for-bulldozers/Valid.7z
bluebook_data/bluebook-for-bulldozers/train_tmp.csv
bluebook_data/bluebook-for-bulldozers/Data Dictionary.xlsx
bluebook_data/bluebook-for-bulldozers/Machine_Appendix.csv
bluebook_data/bluebook-for-bulldozers/random_forest_benchmark_test.csv
bluebook_data/bluebook-for-bulldozers/Train.7z
bluebook_data/bluebook-for-bulldozers/Valid.csv
bluebook_data/bluebook-for-bulldozers/median_benchmark.csv
bluebook_data/bluebook-for-bulldozers/TrainAndValid.csv
bluebook_data/bluebook-for-bulldozers/TrainAndValid.7z
bluebook_data/bluebook-for-bulldozers/Test.csv
bluebook_data/bluebook-for-bulldozers/TrainAndValid.zip
bluebook_data/bluebook-for-bulldozers/Train.zip
bluebook_data/bluebook-for-bulldozers/Train.csv
🌲 Random Forest RMSLE: 0.2285
📈 Linear Regressi