In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Preview the first rows of the training data.
# `head` has to be *called*: the bare attribute `.head` only displays the
# bound-method repr instead of the first rows of the frame.
pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv").head()

In [None]:
import pandas as pd
# Load the training set and print its column index to see which fields exist.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/home-data-for-ml-course/train.csv"
)
column_names = train.columns
print(column_names)

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# --- Data loading ------------------------------------------------------------
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# Keep the row identifiers (used for the submission file) and the
# log1p-transformed sale price (the regression target) before those columns
# are removed from the feature frames.
train_ids, test_ids = train['Id'], test['Id']
target = np.log1p(train['SalePrice'])

# From here on `train` / `test` hold feature columns only.
train = train.drop(columns=['Id', 'SalePrice'])
test = test.drop(columns='Id')

# Stack train on top of test so both go through the same preprocessing;
# the first len(train) rows of `all_data` are the training rows.
all_data = pd.concat([train, test])

# --- Missing-value imputation (train and test together) ----------------------

# Categorical columns whose gaps are filled with the literal category 'None'.
missing_none = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]
all_data = all_data.fillna({name: 'None' for name in missing_none})

# Every remaining gap in an object-dtype column gets that column's mode
# (the first one when several values tie).
cat_cols = all_data.select_dtypes(include='object').columns
all_data = all_data.fillna(
    {name: all_data[name].mode().iloc[0] for name in cat_cols}
)

# Gaps in non-object columns get the column median.
# `num_cols` is reused further down when measuring skewness.
num_cols = all_data.select_dtypes(exclude='object').columns
all_data = all_data.fillna(
    {name: all_data[name].median() for name in num_cols}
)

# --- Feature engineering -----------------------------------------------------
# Derived columns are appended in this order: TotalSF, TotalBath, Age, RemodAge.
all_data = all_data.assign(
    # Basement + first-floor + second-floor area.
    TotalSF=lambda d: d['TotalBsmtSF'] + d['1stFlrSF'] + d['2ndFlrSF'],
    # Full baths count 1, half baths count 0.5 (above ground and basement).
    TotalBath=lambda d: (d['FullBath'] + 0.5 * d['HalfBath']
                         + d['BsmtFullBath'] + 0.5 * d['BsmtHalfBath']),
    # Years between construction / remodel and the sale.
    Age=lambda d: d['YrSold'] - d['YearBuilt'],
    RemodAge=lambda d: d['YrSold'] - d['YearRemodAdd'],
)

# --- Skew correction ---------------------------------------------------------
# Skewness is measured on the original numeric columns only (`num_cols` was
# captured before the derived features above were added), most skewed first.
skewed_feats = (
    all_data[num_cols]
    .apply(lambda column: column.skew())
    .sort_values(ascending=False)
)

# Columns whose absolute skewness exceeds 0.75 are replaced by log1p(column).
high_skew = skewed_feats[skewed_feats.abs() > 0.75]
all_data = all_data.assign(
    **{feat: np.log1p(all_data[feat]) for feat in high_skew.index}
)

# --- Categorical encoding ----------------------------------------------------
# Shared quality scale: Ex(5) > Gd(4) > TA(3) > Fa(2) > Po(1).
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}

ordinal_mapping = {
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    # BsmtQual additionally carries the 'None' category introduced by the
    # missing-value step; it is mapped to 0.
    'BsmtQual': {**quality_scale, 'None': 0},
    'KitchenQual': quality_scale,
    # Add other ordinal mappings
}

# Replace each ordinal column by its integer codes.
all_data = all_data.assign(
    **{name: all_data[name].map(codes) for name, codes in ordinal_mapping.items()}
)

# Every column that is still categorical is expanded into indicator columns.
all_data = pd.get_dummies(all_data)

# Split the preprocessed frame back into train and test rows.
# `all_data` was built as [train rows, test rows], so a positional split works.
n_train = len(train)
X_train = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]

# Outlier removal: very large houses (GrLivArea > 4000) sold below 300000.
#
# The area threshold must be tested on the RAW living area. The skew step
# above replaces every numeric column with |skew| > 0.75 by log1p(column);
# once GrLivArea has been transformed, `X_train['GrLivArea'] > 4000` can never
# be true and the filter silently removes nothing. `train` still holds the
# untransformed features (only 'Id' and 'SalePrice' were dropped from it) and
# shares its row order with `X_train` and `target`, so the mask is built there.
# `target` is log1p(SalePrice), hence the log1p on the price threshold.
outlier_mask = (train['GrLivArea'] > 4000) & (target < np.log1p(300000))
outliers = X_train.index[outlier_mask.to_numpy()]
X_train = X_train.drop(outliers)
target = target.drop(outliers)
print(f"Removed {len(outliers)} GrLivArea outlier row(s) from the training set")

# Cross-validation setup (shared by the XGBoost and LightGBM loops).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- XGBoost -----------------------------------------------------------------
# Hyper-parameters and the test DMatrix do not depend on the fold, so they are
# built once instead of on every iteration.
xgb_params = {
    'eta': 0.01,
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': 42
}
dtest = xgb.DMatrix(X_test)

xgb_preds = np.zeros(X_test.shape[0])  # fold-averaged test predictions (price scale)
xgb_rmse = []                          # best validation RMSE per fold (log scale)

for train_idx, val_idx in kf.split(X_train):
    X_train_kf, X_val_kf = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_kf, y_val_kf = target.iloc[train_idx], target.iloc[val_idx]

    dtrain = xgb.DMatrix(X_train_kf, y_train_kf)
    dval = xgb.DMatrix(X_val_kf, y_val_kf)

    # Early stopping monitors the LAST entry of `evals`, i.e. the validation set.
    model = xgb.train(xgb_params, dtrain, num_boost_round=10000,
                      evals=[(dtrain, 'train'), (dval, 'val')],
                      early_stopping_rounds=100, verbose_eval=False)

    # The native Booster.predict uses ALL trained rounds by default, including
    # the rounds trained after the best iteration while waiting for early
    # stopping to trigger. Restrict prediction to the best iteration so the
    # test predictions come from the same model that `best_score` describes.
    best_rounds = (0, model.best_iteration + 1)
    fold_pred = model.predict(dtest, iteration_range=best_rounds)

    # The target is log1p(SalePrice); expm1 maps predictions back to prices.
    xgb_preds += np.expm1(fold_pred) / kf.n_splits
    xgb_rmse.append(model.best_score)

print(f"XGBoost CV RMSE (log scale): {np.mean(xgb_rmse):.5f} +/- {np.std(xgb_rmse):.5f}")

# --- LightGBM ----------------------------------------------------------------
# The parameters are identical for every fold, so they are defined once.
lgb_params = {
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': -1,
    'min_child_samples': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'metric': 'rmse',
    'random_state': 42,
    'early_stopping_round': 100,  # early stopping is configured via params
    'verbosity': -1  # Suppresses LightGBM output
}

lgb_preds = np.zeros(X_test.shape[0])  # fold-averaged test predictions (price scale)
lgb_rmse = []                          # best validation RMSE per fold (log scale)

for train_idx, val_idx in kf.split(X_train):
    X_train_kf, X_val_kf = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_kf, y_val_kf = target.iloc[train_idx], target.iloc[val_idx]

    train_data = lgb.Dataset(X_train_kf, label=y_train_kf)
    val_data = lgb.Dataset(X_val_kf, label=y_val_kf, reference=train_data)

    # A fresh shallow copy per fold keeps the shared dict untouched.
    model = lgb.train(
        dict(lgb_params),
        train_data,
        valid_sets=[val_data],  # only the validation set is monitored
        num_boost_round=10000,
        callbacks=[lgb.log_evaluation(period=0)]  # Suppresses output
    )

    # Predict with the best iteration found by early stopping, then map the
    # log1p-scale predictions back to prices with expm1.
    fold_pred = model.predict(X_test, num_iteration=model.best_iteration)
    lgb_preds += np.expm1(fold_pred) / kf.n_splits
    # With a single unnamed validation set, LightGBM names it 'valid_0'.
    lgb_rmse.append(model.best_score['valid_0']['rmse'])

print(f"LightGBM CV RMSE (log scale): {np.mean(lgb_rmse):.5f} +/- {np.std(lgb_rmse):.5f}")

# --- Ensemble and submission -------------------------------------------------
# Equal-weight blend of the two models' fold-averaged price predictions.
xgb_weight, lgb_weight = 0.5, 0.5
final_preds = xgb_weight * xgb_preds + lgb_weight * lgb_preds

# Two-column submission file: the test Ids next to the blended prices.
submission = test_ids.to_frame(name='Id').assign(SalePrice=final_preds)
submission.to_csv('submission.csv', index=False)