In [37]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LassoCV

# Load the competition's train and test splits from the Kaggle input mount.
# These two frames are the raw inputs that the later cells clean and model.
train = pd.read_csv('/kaggle/input/holberton-baku-ml-2-housing-prices/train.csv')
test = pd.read_csv('/kaggle/input/holberton-baku-ml-2-housing-prices/test.csv')

# Print (rows, columns) for each split as a quick sanity check.
# NOTE(review): the recorded output shows test with one column fewer than
# train - presumably the SalePrice target; confirm against the data files.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (1241, 81)
Test shape: (219, 80)


In [38]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import RobustScaler

# ==========================================
# PART 1: TRAIN XGBOOST (Tree Model)
# ==========================================
# Progress marker printed before the tree-model section of this cell runs.
print("ðŸŒ² Training XGBoost...")

# 1. Clean Data for Trees (Simple, No Polynomials)
def clean_data_tree(df):
    """Return a cleaned copy of ``df`` with simple features for a tree model.

    The input frame is not modified. Steps, in order:

    1. The ten quality/condition columns (when present) are converted from
       grade labels to integers: Ex=5, Gd=4, TA=3, Fa=2, Po=1. Missing or
       unrecognised labels become 0.
    2. Every remaining missing value, in any column, is replaced by 0.
    3. Three derived columns are appended: ``HouseAge`` (only when
       ``YearBuilt`` exists), ``TotalSF`` and ``TotalBath``.

    Args:
        df: DataFrame that must contain ``TotalBsmtSF``, ``1stFlrSF``,
            ``2ndFlrSF``, ``FullBath``, ``HalfBath``, ``BsmtFullBath`` and
            ``BsmtHalfBath``; a missing one raises ``KeyError``.

    Returns:
        A new DataFrame with the same rows and the extra feature columns.
    """
    grade_to_score = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
    graded_columns = (
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
    )

    cleaned = df.copy()

    for name in graded_columns:
        if name not in cleaned.columns:
            continue
        # A missing grade is treated as the explicit 'None' label (score 0);
        # any label outside the mapping also ends up as 0.
        labels = cleaned[name].fillna('None')
        cleaned[name] = labels.map(grade_to_score).fillna(0)

    # Blanket fill: whatever is still missing, numeric or not, becomes 0.
    cleaned = cleaned.fillna(0)

    # Age is measured against the fixed reference year 2010.
    if 'YearBuilt' in cleaned.columns:
        cleaned['HouseAge'] = 2010 - cleaned['YearBuilt']

    # Combined basement + first floor + second floor area.
    basement_sf = cleaned['TotalBsmtSF']
    first_sf = cleaned['1stFlrSF']
    second_sf = cleaned['2ndFlrSF']
    cleaned['TotalSF'] = basement_sf + first_sf + second_sf

    # Bathroom count where each half bath contributes 0.5.
    above_half = 0.5 * cleaned['HalfBath']
    below_half = 0.5 * cleaned['BsmtHalfBath']
    cleaned['TotalBath'] = cleaned['FullBath'] + above_half + cleaned['BsmtFullBath'] + below_half

    return cleaned

# 2. Prepare & Train
# Clean both splits with the tree-oriented function (no rows are removed).
train_tree = clean_data_tree(train)
test_tree = clean_data_tree(test)

# One-hot encode each split independently, then force the test frame onto the
# training frame's column set: join='left' keeps exactly train's columns, so
# dummy columns that appear only in test are discarded, and columns that
# appear only in train are created in test as NaN and zero-filled below.
train_tree = pd.get_dummies(train_tree)
test_tree = pd.get_dummies(test_tree)
train_tree, test_tree = train_tree.align(test_tree, join='left', axis=1)
test_tree = test_tree.fillna(0)

# Features exclude the row identifier and the target; the target is modelled
# as log1p(SalePrice) and converted back with expm1 after prediction.
# NOTE(review): dropping 'SalePrice' from test_tree presumably relies on the
# align step having added it as a placeholder column - confirm.
X_tree = train_tree.drop(['Id', 'SalePrice'], axis=1)
y_tree = np.log1p(train_tree['SalePrice'])
X_test_tree = test_tree.drop(['Id', 'SalePrice'], axis=1)

# Gradient-boosted trees: 3000 depth-4 trees with a 0.01 learning rate, each
# fit on a 70% row / 70% column subsample; random_state pins the subsampling.
xgb = XGBRegressor(n_estimators=3000, learning_rate=0.01, max_depth=4, 
                   min_child_weight=1, gamma=0, subsample=0.7, 
                   colsample_bytree=0.7, n_jobs=-1, random_state=42)
xgb.fit(X_tree, y_tree)

# 3. Get Predictions (XGBoost)
# Predictions come out on the log1p scale; expm1 returns them to price units.
log_xgb_preds = xgb.predict(X_test_tree)
xgb_preds = np.expm1(log_xgb_preds)
print("âœ… XGBoost Trained!")


# ==========================================
# PART 2: TRAIN LINEAR (ElasticNet)
# ==========================================
# Progress marker printed before the linear-model section of this cell runs.
print("ðŸ“ˆ Training Linear Model...")

# 1. Clean Data for Linear (Needs Polynomials!)
def clean_data_linear(df, is_train=True):
    """Return a cleaned copy of ``df`` with polynomial features for a linear model.

    The input frame is not modified. Steps, in order:

    1. The ten quality/condition columns (when present) are converted from
       grade labels to integers: Ex=5, Gd=4, TA=3, Fa=2, Po=1. Missing or
       unrecognised labels become 0.
    2. Missing values in every numeric column except ``Id`` and
       ``SalePrice`` are replaced by 0. Non-numeric columns keep their NaNs.
    3. Derived columns are appended: ``HouseAge`` (if ``YearBuilt`` exists),
       ``TotalSF``, ``TotalSF_Sq`` and, if ``OverallQual`` exists,
       ``OverallQual_Sq``, ``OverallQual_Cu`` and ``Qual_Age``.
    4. When ``is_train`` is true, rows with ``GrLivArea`` >= 4000 are dropped.

    Args:
        df: DataFrame that must contain ``TotalBsmtSF``, ``1stFlrSF`` and
            ``2ndFlrSF``; a missing one raises ``KeyError``.
        is_train: Apply the ``GrLivArea`` row filter. Pass ``False`` for data
            whose rows must all be kept (e.g. the frame being predicted).

    Returns:
        A new DataFrame with the engineered columns (and, for training data,
        possibly fewer rows; the original index labels are kept).
    """
    df_clean = df.copy()

    # Ordinal encoding of the quality/condition grades.
    quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
    ord_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
    for col in ord_cols:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna('None').map(quality_map).fillna(0)

    # Zero-fill numeric features. Selecting by the generic 'number' kind
    # (rather than only int64/float64) also covers narrower numeric dtypes
    # such as float32, which would otherwise keep their NaNs and poison the
    # arithmetic below. Id and SalePrice are deliberately left untouched.
    numeric_cols = df_clean.select_dtypes(include='number').columns
    cols_to_zero = [c for c in numeric_cols if c not in ('Id', 'SalePrice')]
    df_clean[cols_to_zero] = df_clean[cols_to_zero].fillna(0)

    # Age is measured against the fixed reference year 2010.
    if 'YearBuilt' in df_clean.columns:
        df_clean['HouseAge'] = 2010 - df_clean['YearBuilt']

    df_clean['TotalSF'] = (df_clean['TotalBsmtSF'] + df_clean['1stFlrSF'] + df_clean['2ndFlrSF'])

    # Polynomial terms. TotalSF_Sq depends only on TotalSF, so it is created
    # unconditionally; it is emitted between the two OverallQual terms so the
    # column order is unchanged whenever OverallQual is present.
    has_quality = 'OverallQual' in df_clean.columns
    if has_quality:
        df_clean['OverallQual_Sq'] = df_clean['OverallQual'] ** 2
    df_clean['TotalSF_Sq'] = df_clean['TotalSF'] ** 2
    if has_quality:
        df_clean['OverallQual_Cu'] = df_clean['OverallQual'] ** 3
        if 'HouseAge' in df_clean.columns:
            # Quality discounted by age; the +1 keeps the divisor non-zero
            # for a house whose age is 0.
            df_clean['Qual_Age'] = df_clean['OverallQual'] * (1 / (df_clean['HouseAge'] + 1))

    # Very large houses are excluded from training data only.
    if is_train and 'GrLivArea' in df_clean.columns:
        df_clean = df_clean[df_clean['GrLivArea'] < 4000]

    return df_clean

# 2. Prepare & Train
# is_train=True removes rows with GrLivArea >= 4000 from the training frame;
# the test frame keeps every row so each Id still receives a prediction.
train_lin = clean_data_linear(train, is_train=True)
test_lin = clean_data_linear(test, is_train=False)

# One-hot encode each split independently, then force the test frame onto the
# training frame's column set (join='left'): test-only dummy columns are
# discarded and train-only columns appear in test as NaN, zero-filled below.
train_lin = pd.get_dummies(train_lin)
test_lin = pd.get_dummies(test_lin)
train_lin, test_lin = train_lin.align(test_lin, join='left', axis=1)
test_lin = test_lin.fillna(0)

# Features exclude the row identifier and the target; the target is modelled
# as log1p(SalePrice).
X_lin = train_lin.drop(['Id', 'SalePrice'], axis=1)
y_lin = np.log1p(train_lin['SalePrice'])
X_test_lin = test_lin.drop(['Id', 'SalePrice'], axis=1)

# The scaler is fitted on training features only and then re-used, unchanged,
# to transform the test features.
scaler = RobustScaler()
X_scaled_lin = scaler.fit_transform(X_lin)
X_test_scaled_lin = scaler.transform(X_test_lin)

# ElasticNetCV picks the regularisation strength and the L1/L2 mix (from the
# four listed ratios) using 5-fold cross-validation on the training data.
elastic = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9, 1], cv=5, max_iter=50000, n_jobs=-1)
elastic.fit(X_scaled_lin, y_lin)

# 3. Get Predictions (Linear)
# Predictions come out on the log1p scale; expm1 returns them to price units.
linear_preds = np.expm1(elastic.predict(X_test_scaled_lin))
print("âœ… Linear Model Trained!")


# ==========================================
# PART 3: BLEND & SUBMIT
# ==========================================
print("ðŸ¥£ Blending Models...")

# Weighted Average: 60% Linear + 40% XGBoost
# The blend is taken on the price scale (both inputs are already expm1'd),
# not on the log scale. The weights are fixed constants, not tuned here.
final_blend = (0.60 * linear_preds) + (0.40 * xgb_preds)

# Ids come from the raw test frame; both prediction arrays were produced from
# frames derived from `test` without removing rows, so they pair up by position.
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': final_blend})
submission.to_csv('submission_super_blend.csv', index=False)

print("ðŸš€ Success! submission_super_blend.csv created (No missing files!)")

ðŸŒ² Training XGBoost...
âœ… XGBoost Trained!
ðŸ“ˆ Training Linear Model...
âœ… Linear Model Trained!
ðŸ¥£ Blending Models...
ðŸš€ Success! submission_super_blend.csv created (No missing files!)


In [39]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# --- 1. REBUILD THE TRAINING MATRIX ---
# Re-run the linear cleaning function on the raw training frame so X and y in
# this cell are built from scratch instead of re-using the X_lin / y_lin
# produced in the previous cell.
# NOTE(review): this cell depends on `train` and `clean_data_linear` having
# been defined by earlier cells.
train_fresh = clean_data_linear(train, is_train=True)  # outlier rows removed
train_fresh = pd.get_dummies(train_fresh)
train_fresh = train_fresh.fillna(0)

# Same feature/target split as the linear model: log1p(SalePrice) target.
X_fresh = train_fresh.drop(['Id', 'SalePrice'], axis=1)
y_fresh = np.log1p(train_fresh['SalePrice'])

# --- 2. SET UP 5-FOLD CROSS-VALIDATION ---
# Shuffled folds with a fixed seed so the split is the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores = []

# Work on plain arrays so the positional fold indices can be used directly.
X_vals = X_fresh.values 
y_vals_log = y_fresh.values

print("Calculating MAE on FRESH Data...")

# NOTE(review): the model scored here is Ridge(alpha=20) on StandardScaler
# features, not the ElasticNetCV + XGBoost blend written to the submission
# file, so this MAE does not measure the submitted predictions.
for train_idx, val_idx in kf.split(X_vals):
    # A. Split - the model trains on log targets, while the held-out targets
    # are converted back to price units for scoring.
    X_t, X_v = X_vals[train_idx], X_vals[val_idx]
    y_t_log = y_vals_log[train_idx]
    y_v_real = np.expm1(y_vals_log[val_idx])
    
    # B. Scale - fitted on the training fold only, then applied to the
    # validation fold.
    # NOTE(review): this rebinds the notebook-level name `scaler`, which the
    # previous cell bound to the fitted RobustScaler used for the test
    # features; after this cell `scaler` is the last fold's StandardScaler.
    scaler = StandardScaler()
    X_t_scaled = scaler.fit_transform(X_t)
    X_v_scaled = scaler.transform(X_v)
    
    # C. Train (Ridge) with a fixed regularisation strength.
    model = Ridge(alpha=20)
    model.fit(X_t_scaled, y_t_log)
    
    # D. Predict on the log scale, then convert back to price units.
    pred_log = model.predict(X_v_scaled)
    pred_dollar = np.expm1(pred_log)
    
    # E. Score - mean absolute error in price units for this fold.
    score = mean_absolute_error(y_v_real, pred_dollar)
    mae_scores.append(score)

# Unweighted mean of the five per-fold MAE values.
print(f"Average MAE: ${np.mean(mae_scores):,.2f}")

Calculating MAE on FRESH Data...
Average MAE: $14,807.59
