![Rising House Prices](https://www.mexperience.com/wp-content/uploads/House-Value-Graph-NBS-750x375.jpg)

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read both competition splits (train and test) from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv(f'/kaggle/input/house-prices-advanced-regression-techniques/{split}.csv')
    for split in ('train', 'test')
)

## Explore Data 


In [None]:
# First ten rows of the training split.
df_train.head(10)

In [None]:
# First ten rows of the test split.
df_test.head(10)

In [None]:
# Column dtypes and non-null counts for the training split.
df_train.info()

In [None]:
# Summary statistics for the training split.
df_train.describe()

In [None]:
# Column dtypes and non-null counts for the test split.
df_test.info()

## Clean Train & Test Data

In [None]:
cat_over_columns = ['Street', 'Alley', 'LandContour','Utilities','Condition2',
                    'RoofMatl','BsmtCond','Heating','Electrical','Functional','GarageQual','GarageCond','PavedDrive']
df_train.drop(columns=cat_over_columns, inplace=True)

In [None]:
too_much_na_columns = ['PoolQC', 'Fence', 'MiscFeature']
df_train.drop(columns=too_much_na_columns, inplace=True)

In [None]:
# Preview the training frame after the column drops.
df_train.head()

In [None]:
# Non-null counts show which remaining columns still contain missing values.
df_train.info()

In [None]:
# Per-column median of every numeric column. This is computed on the whole
# training frame, i.e. before the later train/validation split.
medians = df_train.select_dtypes(include='number').median()

In [None]:
# Fill missing values column by column using the matching median; columns
# without an entry in `medians` (the object columns) are left untouched.
df_train.fillna(value=medians,inplace=True)

In [None]:
# First mode (row 0 of the mode table) of every object column, then use it to
# fill the remaining gaps in those columns.
# NOTE(review): for some columns a missing value may mean "feature absent"
# rather than "unknown" — confirm against the data description.
modes = df_train.select_dtypes(include='object').mode().iloc[0]
df_train.fillna(value=modes,inplace=True)

In [None]:
# Re-check non-null counts after imputation.
df_train.info()

In [None]:
# Drop from the test split exactly the columns that were dropped from the
# training split. The list is derived from `cat_over_columns` rather than
# retyped, so the two splits cannot drift apart, and the variable that is
# defined here is the one actually passed to drop().
cat_over_columns1 = list(cat_over_columns)
df_test.drop(columns=cat_over_columns1, inplace=True)

In [None]:
too_much_na_columns1 = ['PoolQC', 'Fence', 'MiscFeature']
df_test.drop(columns=too_much_na_columns, inplace=True)

In [None]:
df_test.head()

In [None]:
# Impute numeric gaps in the test split with the medians learned from the
# training split (`medians`), so both splits are imputed with the same
# statistics and nothing is estimated from the data being predicted.
# A test-split median is used only as a fallback for a numeric test column
# that has no training median.
numeric_test_columns = df_test.select_dtypes(include='number').columns
medians1 = (
    medians
    .reindex(numeric_test_columns)
    .fillna(df_test[numeric_test_columns].median())
)
df_test.fillna(value=medians1, inplace=True)

In [None]:
modes1 = df_test.select_dtypes(include='object').mode().iloc[0]
df_test.fillna(value=modes1,inplace=True)

In [None]:
df_test.head()

In [None]:
relevant_columns = ['Id',
    'OverallQual', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GarageCars', 'GarageArea', 'YearBuilt', 'YearRemodAdd', 'Neighborhood',
    'Condition1', 'LotArea', 'ExterQual', 'BsmtQual', 'KitchenQual',
    'HeatingQC', 'Fireplaces', 'TotRmsAbvGrd', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'SaleType', 'SaleCondition', 'LotFrontage',
    'MasVnrArea', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MoSold', 'YrSold','SalePrice'
]
df_train = df_train[relevant_columns]

In [None]:
df_train.info()

## Data Analysis and Visualization

In [None]:
# Number of missing values per column in the reduced training frame.
df_train.isnull().sum()

In [None]:
# Pairwise correlations between the numeric columns only
# (numeric_only=True leaves the object columns out).
correlation_matrix = df_train.corr(numeric_only=True)
correlation_matrix

In [None]:
# Single-column heatmap: correlation of every numeric column with SalePrice.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix[['SalePrice']],
    annot=True,
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation with SalePrice')
plt.show()

In [None]:
# The five numeric features most strongly correlated (by absolute value) with
# the target. 'SalePrice' is removed first: its self-correlation of 1.0 would
# otherwise take one of the five slots and yield a SalePrice-vs-SalePrice plot.
top_features = (
    correlation_matrix['SalePrice']
    .drop('SalePrice')
    .abs()
    .nlargest(5)
    .index
)
for feature in top_features:
    # One scatter plot per feature against the target.
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=df_train, x=feature, y='SalePrice')
    plt.title(f'SalePrice vs {feature}')
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()

In [None]:
# Histogram (30 bins) with a KDE overlay of the raw target.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_train['SalePrice'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of SalePrice')
ax.set_xlabel('SalePrice')
ax.set_ylabel('Frequency')
plt.show()

## Feature Engineering

In [None]:
# Put the target on a log1p scale; predictions are mapped back with np.expm1
# before the submission is written.
# NOTE: the column is overwritten, so running this cell twice applies the
# transform twice.
log_sale_price = np.log1p(df_train['SalePrice'])
df_train['SalePrice'] = log_sale_price

### Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [None]:
X = df_train.drop('SalePrice', axis=1)
y = df_train['SalePrice']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Categorical predictors that are one-hot encoded by the preprocessor.
categorical_features = [
    'Neighborhood', 'Condition1', 'ExterQual', 'BsmtQual', 'KitchenQual',
    'HeatingQC', 'SaleType', 'SaleCondition', 'RoofStyle', 'Exterior1st',
    'Exterior2nd',
]
# 'Id' is a row identifier, not a property of the house, so it must not be fed
# to the models as a numeric feature. It stays in X / df_test (it is needed
# for the submission file); columns not listed in a ColumnTransformer are
# dropped by its default remainder setting.
identifier_columns = ['Id']
excluded_from_numeric = set(categorical_features) | set(identifier_columns)
# Numeric predictors: every int64/float64 column that is neither categorical
# nor an identifier.
numerical_features = [
    feat
    for feat in X.select_dtypes(include=['int64', 'float64']).columns
    if feat not in excluded_from_numeric
]

In [None]:
# Shared preprocessing: standardise the numeric predictors and one-hot encode
# the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising. Rare categories can be absent from the
# training rows (or from a CV fold) yet present in validation/test data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

## Linear Regression Model

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])


In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_val)

In [None]:
mse = mean_squared_error(y_val, y_pred)
print(f'Linear Regression Validation MSE: {mse}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


## Random Forest Regression Model

In [None]:
# Base estimator for the randomized search. The search's own random_state only
# fixes which parameter combinations are sampled; seeding the forest as well
# (with the same seed used elsewhere in this notebook) makes the fitted trees,
# and therefore the reported scores, repeatable.
rf_model = RandomForestRegressor(random_state=42)

In [None]:
# Search space for the random-forest pipeline; the 'model__' prefix routes each
# key to the pipeline step named 'model'.
# max_features: 1.0 (use all features) replaces the former 'auto' option, which
# meant the same thing for regressors and is no longer accepted by current
# scikit-learn releases.
rf_param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_features': [1.0, 'sqrt', 'log2'],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

In [None]:
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

In [None]:
rf_search = RandomizedSearchCV(rf_pipeline, rf_param_grid, n_iter=10, cv=3, verbose=1, random_state=42)
rf_search.fit(X_train, y_train)

In [None]:
print(f'Best parameters for Random Forest: {rf_search.best_params_}')

In [None]:
y_pred_rf = rf_search.predict(X_val)

In [None]:
print(f'Random Forest Validation MSE: {mean_squared_error(y_val, y_pred_rf)}')

## Fine Tuning 


In [None]:
final_rf_model = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    random_state=42
)

In [None]:
final_rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', final_rf_model)
])

In [None]:
final_rf_pipeline.fit(X_train, y_train)

In [None]:
y_pred_rf = final_rf_pipeline.predict(X_val)
mse_rf = mean_squared_error(y_val, y_pred_rf)
print(f'Final Random Forest Validation MSE: {mse_rf}')

## XGBoost Model

In [None]:
from xgboost import XGBRegressor

In [None]:
# Base XGBoost regressor with the library's default hyperparameters; the
# values actually tried come from the randomized search over xgb_param_grid.
xgb_model = XGBRegressor()

In [None]:
# Search space for the XGBoost pipeline; the 'model__' prefix routes each key
# to the pipeline step named 'model'.
xgb_param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[3, 6, 9],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__subsample=[0.8, 0.9, 1.0],
    model__colsample_bytree=[0.8, 0.9, 1.0],
)

In [None]:
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model)
])

In [None]:
xgb_search = RandomizedSearchCV(xgb_pipeline, xgb_param_grid, n_iter=10, cv=3, verbose=1, random_state=42)

In [None]:
xgb_search.fit(X_train, y_train)

In [None]:
print(f'Best parameters for XGBoost: {xgb_search.best_params_}')

In [None]:
y_pred_xgb = xgb_search.predict(X_val)

In [None]:
print(f'XGBoost Validation MSE: {mean_squared_error(y_val, y_pred_xgb)}')

## Lasso Model

In [None]:
from sklearn.linear_model import Lasso


In [None]:
lasso_model = Lasso(alpha=0.1)

In [None]:
lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lasso_model)
])

In [None]:
lasso_pipeline.fit(X_train, y_train)

In [None]:
# Validation RMSE of the Lasso pipeline, on the log1p scale of the target.
# The root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and is rejected by recent scikit-learn
# releases.
y_pred_lasso = lasso_pipeline.predict(X_val)
rmse_lasso = np.sqrt(mean_squared_error(y_val, y_pred_lasso))
# Legacy name kept for any later cell that still reads it. Despite the name it
# has always held the RMSE, not the MSE.
mse_lasso = rmse_lasso
print(f'Lasso Regression Validation RMSE: {rmse_lasso}')

In [None]:
df_train.columns

In [None]:
df_test = df_test[relevant_columns[:-1]]

In [None]:
df_test = df_test[relevant_columns[:-1]]  # Exclude 'SalePrice'

# Check columns again
print("Columns in df_test after alignment:")
print(df_test.columns)

In [None]:
# Inspect the preprocessed test matrix. The preprocessor is reached through
# the fitted linear-regression pipeline; it is the same object as the shared
# `preprocessor`, which was fitted when the pipelines were fitted.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_test_transformed = fitted_preprocessor.transform(df_test)
X_test_transformed

In [None]:
y_test_pred = pipeline.predict(df_test)

In [None]:
y_test_pred_original_scale = np.expm1(y_test_pred)

In [None]:
submission_df = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': y_test_pred_original_scale
})


In [None]:
submission_df.to_csv('submission.csv', index=False)

1-**Linear Regression Validation MSE: 0.0231**

2-**Random Forest Validation MSE: 0.0305**

3-**Final Random Forest Validation MSE: 0.0309**

4-**XGBoost Validation MSE: 0.0264**

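5-**Lasso Regression Validation RMSE: 0.2407** (this is an RMSE, not an MSE; squaring it gives an MSE of about 0.0579, which is the figure to compare with the MSE values above)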
