In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Read the competition's training and test tables from the Kaggle input mount.
train_csv = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
test_csv = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)


In [4]:
# Overview data
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train.info()
# Summary statistics for the columns describe() covers by default.
print(train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Report missing values: count nulls per column, order the counts from
# largest to smallest, then keep only the columns that have at least one null.
null_counts = train.isna().sum()
sorted_counts = null_counts.sort_values(ascending=False)
missing_values = sorted_counts[sorted_counts > 0]
print("Missing values:", missing_values)

Missing values: PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64


In [6]:
# Separate the predictors from the target, then hold out 20% of the rows
# for validation (fixed seed so the split is repeatable).
target_col = 'SalePrice'
non_feature_cols = ['Id', target_col] if 'Id' in train.columns else [target_col]
X = train.drop(columns=non_feature_cols)
y = train[target_col]
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the column lists the preprocessing pipelines operate on:
# int64/float64 columns are treated as numerical, object columns as
# categorical. Only columns that are actually present in X_train are kept.
feature_names = set(X_train.columns)
numeric_cols = train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train.select_dtypes(include=['object']).columns

num_features = [name for name in numeric_cols if name in feature_names]
cat_features = [name for name in categorical_cols if name in feature_names]

In [8]:

# Numerical branch: fill gaps with the column median, then standardize.
median_imputer = ('imputer', SimpleImputer(strategy='median'))
standardizer = ('scaler', StandardScaler())
num_pipeline = Pipeline(steps=[median_imputer, standardizer])

In [9]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' lets categories unseen during fit pass through.
mode_imputer = ('imputer', SimpleImputer(strategy='most_frequent'))
one_hot = ('encoder', OneHotEncoder(handle_unknown='ignore'))
cat_pipeline = Pipeline(steps=[mode_imputer, one_hot])

In [10]:
# Route each column list to its matching preprocessing pipeline.
column_branches = [
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)


In [11]:
# Candidate regressors to compare, keyed by the label used when reporting.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)


In [12]:
# Check if model can fit data without errors
# A plain linear regression behind the shared preprocessor serves as a quick
# end-to-end smoke test of the preprocessing + model pipeline.
test_model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

test_model.fit(X_train, y_train)
test_preds = test_model.predict(X_valid)
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both older and current releases.
print("Test RMSE:", np.sqrt(mean_squared_error(y_valid, test_preds)))


Test RMSE: 29473.802258967033


In [13]:
# 5-fold cross-validation of every candidate on the training split.
# The scorer returns negated RMSE values, so they are sign-flipped before averaging.
for name, estimator in models.items():
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator),
    ])
    neg_rmse = cross_val_score(
        candidate,
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    mean_rmse = np.mean(-neg_rmse)
    print(f"{name} RMSE: {mean_rmse:.4f}")


Linear Regression RMSE: 37570.3498
Random Forest RMSE: 30685.2091


In [14]:
# Final model: the shared preprocessor followed by a seeded random forest,
# fitted on the training split.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest),
])
final_model.fit(X_train, y_train)

In [18]:
# Predictions
# Score the fitted final model on the held-out validation split.
preds = final_model.predict(X_valid)
# sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print(f"Final Model RMSE: {rmse:.4f}")

Final Model RMSE: 28432.1119


In [19]:
# Keep the test Ids aside for the submission file and remove the column from
# the feature frame; test_ids is None when the frame has no 'Id' column.
id_present = 'Id' in test.columns
test_ids = test['Id'] if id_present else None
if id_present:
    test.drop(columns=['Id'], inplace=True)

In [20]:
# Any training feature that the test frame lacks is added as a constant-0 column.
train_cols, test_cols = set(X_train.columns), set(test.columns)
missing_cols = train_cols.difference(test_cols)
for absent in missing_cols:
    test[absent] = 0

In [21]:
# Remove every test column that is not one of the training features.
train_cols = set(X_train.columns)
extra_cols = {name for name in test.columns if name not in train_cols}
test.drop(extra_cols, axis=1, inplace=True)

In [22]:
# Ensure test has the same features as train
missing_cols = set(X_train.columns) - set(test.columns)
for col in missing_cols:
    test[col] = 0

# Capture the Ids only if the column is still present. The previous version
# reset test_ids to None both in a for-else branch (which always runs, since
# the loop has no break) and in the else of this check, which wiped the Ids
# saved by the earlier cell once 'Id' had been dropped and left the submission
# with an empty Id column.
if 'Id' in test.columns:
    test_ids = test['Id']
    test.drop(['Id'], axis=1, inplace=True)
elif 'test_ids' not in globals():
    # No Ids were ever captured and none are available in this frame.
    test_ids = None

In [23]:
# Align the test frame with the training features: add absent features as
# constant-0 columns, then drop anything the model was not trained on.
train_cols = set(X_train.columns)
missing_cols = train_cols - set(test.columns)
for absent in missing_cols:
    test[absent] = 0

extra_cols = set(test.columns) - train_cols
test.drop(extra_cols, axis=1, inplace=True)

In [24]:
# Ensure test has the same features as train
missing_cols = set(X_train.columns) - set(test.columns)
for col in missing_cols:
    test[col] = 0

# Guard against a lost Id column: if test_ids is None at this point, the
# DataFrame below would broadcast None into every row and the submission
# would have no usable Ids. Only columns of `test` have been added or removed
# in this notebook (its rows are untouched), so the Ids can be re-read from
# the same CSV the test frame was loaded from and still line up row-for-row.
# NOTE(review): this assumes the CSV has an 'Id' column; read_csv raises
# ValueError otherwise.
if test_ids is None:
    test_ids = pd.read_csv(
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
        usecols=['Id'],
    )['Id']

# Predict on the aligned test features and write the two-column submission.
test_preds = final_model.predict(test)
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_preds})
submission.to_csv('submission.csv', index=False)
print("Submission file created!")


Submission file created!
