In [1]:
import os
# List every file available under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)


/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')  # labelled rows: includes the 'SalePrice' target
test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')    # rows to predict: no 'SalePrice' column
sample_submission = pd.read_csv('/kaggle/input/home-data-for-ml-course/sample_submission.csv')

# Explore data
print(train.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train.info()

# Handle missing values
# Only features that have gaps in the training data are treated here, and the
# fill value always comes from the training data so train and test are imputed
# consistently.
#
# Results are written back with `frame[col] = frame[col].fillna(...)`. Calling
# `fillna(..., inplace=True)` on a column selection is chained assignment:
# pandas deprecates it, and with Copy-on-Write enabled it modifies a temporary
# object and leaves the DataFrame unchanged.

# For numerical features: fill with the training mean
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns
for feature in numerical_features:
    if train[feature].isnull().sum() > 0:
        train_mean = train[feature].mean()  # computed once, reused for both frames
        train[feature] = train[feature].fillna(train_mean)
        if feature in test.columns:  # e.g. the target column is absent from test
            test[feature] = test[feature].fillna(train_mean)

# For categorical features: mark gaps with an explicit 'Missing' level
categorical_features = train.select_dtypes(include=['object']).columns
for feature in categorical_features:
    if train[feature].isnull().sum() > 0:
        train[feature] = train[feature].fillna('Missing')
        if feature in test.columns:
            test[feature] = test[feature].fillna('Missing')

# Store the 'Id' column before it is removed from the test features
test_id = test['Id']  # used later to build the submission file

# Impute values that are still missing in the test set (features that had no
# gaps in train may still have gaps in test). This is done BEFORE encoding so
# that a missing category becomes an explicit 'Missing' level instead of an
# all-zero dummy row, and numerical gaps are filled with the TRAINING mean so
# that no test-set statistic is used.
for feature in test.columns:
    if feature not in train.columns or not test[feature].isnull().any():
        continue
    if feature in categorical_features:
        test[feature] = test[feature].fillna('Missing')
    else:
        test[feature] = test[feature].fillna(train[feature].mean())

# Encode categorical variables on the stacked train+test frame.
# Encoding each frame separately with drop_first=True lets each frame drop a
# different baseline level (whichever sorts first among the levels present in
# that frame); re-adding the "missing" column as zeros afterwards then wipes
# out a level that is really present. Encoding once guarantees identical dummy
# columns and an identical dropped baseline for both sets.
n_train = len(train)
combined = pd.concat([train, test], axis=0, ignore_index=True, sort=False)
combined = pd.get_dummies(combined, columns=categorical_features, drop_first=True)

# Split back into the original train / test rows
train = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].reset_index(drop=True)

# Ensure test set has the same feature columns as the training set, in the same order.
# 'SalePrice' (target) and 'Id' (identifier) are not model features.
train_columns = train.columns.drop(['SalePrice', 'Id'])
test = test[train_columns].copy()  # copy so later column additions do not touch a view

# Sanity check: total number of NaN values left in the test features (expected 0)
print(test.isnull().sum().sum())

# --- Feature engineering ---
# 'TotalSF' = basement + first-floor + second-floor square footage, added to both frames
for frame in (train, test):
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']

# The target is modelled on the log1p scale
train['SalePrice'] = np.log1p(train['SalePrice'])

# --- Hold-out split ---
y = train['SalePrice']
X = train.drop(columns=['SalePrice', 'Id'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Gradient Boosting model ---
gbr_params = dict(n_estimators=1000, learning_rate=0.05, max_depth=4, random_state=42)
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train, y_train)

# Validation error (RMSE on the log1p-transformed target)
val_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
print(f'Validation RMSE: {rmse}')

# Refit on all labelled rows before predicting the test set
model.fit(X, y)

# Predict on the test features ('Id' was already removed) and undo the log1p transform
test_predictions = np.expm1(model.predict(test))

# --- Submission file, keyed by the stored test 'Id' column ---
submission = pd.DataFrame({'Id': test_id}).assign(SalePrice=test_predictions)
submission.to_csv('submission.csv', index=False)



   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   