<a href="https://colab.research.google.com/github/Oelebrashy/Machine-Learning/blob/main/Housing_Prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime; the CSV paths used below live under
# this mount point.
from google.colab import drive

drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the competition files on the mounted Drive
train_path = '/content/drive/My Drive/House Prices-Advanced Regression Techniques/train.csv'
test_path = '/content/drive/My Drive/House Prices-Advanced Regression Techniques/test.csv'
data_description_path = '/content/drive/My Drive/House Prices-Advanced Regression Techniques/data_description.txt'

# Read both CSV files into DataFrames
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Set the identifiers and the target aside so that imputation and one-hot
# encoding further down only ever see feature columns
train_id, train_target = train['Id'], train['SalePrice']
test_id = test['Id']

# From here on `train` and `test` hold features only
train = train.drop(columns=['Id', 'SalePrice'])
test = test.drop(columns=['Id'])

In [None]:
# Preview the first rows of each feature frame under its own heading
for heading, frame in (("Training Data:", train), ("\nTest Data:", test)):
    print(heading)
    print(frame.head())

In [None]:
# Per-column count of missing entries, listing only columns that have at least one
for heading, frame in (
    ("\nMissing values in training set:", train),
    ("\nMissing values in test set:", test),
):
    null_counts = frame.isnull().sum()
    print(heading)
    print(null_counts[null_counts > 0])

In [None]:
# Read the data description text file in full and print it below a heading
with open(data_description_path, 'r') as description_file:
    data_description = description_file.read()

print("\nData Description:", data_description, sep="\n")

In [None]:
# Initial data visualization: distribution of the target variable.
#
# 'SalePrice' was split off into `train_target` and dropped from `train` when the
# data was loaded, so train['SalePrice'] raises KeyError here. Plot the preserved
# Series instead.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_target, kde=True, ax=ax)
ax.set_title('Distribution of Sale Prices')
ax.set_xlabel('SalePrice')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# List the dtype of every training column below a heading
print("\nData Types in Training Set:", train.dtypes, sep="\n")


In [None]:
# Impute missing values in the training and test feature frames.
#
# Results are assigned back to the frame rather than written through
# `df[col].fillna(..., inplace=True)`. That chained in-place form works on a
# temporary Series: it emits a FutureWarning on pandas 2.x and leaves the frame
# unchanged under Copy-on-Write (the default from pandas 3.0). Both frames go
# through the same helper so their treatment cannot drift apart.

# Categorical columns and the placeholder label written into their missing entries
CATEGORICAL_FILL_VALUES = {
    'Alley': 'None',
    'MasVnrType': 'None',
    'BsmtQual': 'NA',
    'BsmtCond': 'NA',
    'BsmtExposure': 'NA',
    'BsmtFinType1': 'NA',
    'BsmtFinType2': 'NA',
    'FireplaceQu': 'NA',
    'GarageType': 'NA',
    'GarageFinish': 'NA',
    'GarageQual': 'NA',
    'GarageCond': 'NA',
    'PoolQC': 'NA',
    'Fence': 'NA',
    'MiscFeature': 'None',
}

# Numerical columns whose missing entries are replaced with 0
ZERO_FILL_COLUMNS = [
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
]


def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the missing values of ``df`` imputed.

    The input frame is not modified. Statistics (mode, medians) are computed
    from ``df`` itself, so each frame is imputed from its own values.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame holding the columns named in ``CATEGORICAL_FILL_VALUES``
        and ``ZERO_FILL_COLUMNS`` plus 'Electrical', 'LotFrontage',
        'Neighborhood', 'GarageYrBlt' and 'YearBuilt'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which:
        * the listed categorical columns carry their placeholder label,
        * the listed numerical columns carry 0,
        * 'Electrical' carries its most frequent value,
        * 'LotFrontage' carries the median of the row's 'Neighborhood',
        * 'GarageYrBlt' carries the row's 'YearBuilt'.
    """
    # Constant fills in one pass; DataFrame.fillna with a dict fills column by column
    constant_fills = {**CATEGORICAL_FILL_VALUES, **dict.fromkeys(ZERO_FILL_COLUMNS, 0)}
    filled = df.fillna(value=constant_fills)

    # Most frequent value of the column
    filled['Electrical'] = filled['Electrical'].fillna(filled['Electrical'].mode()[0])

    # Neighborhood-level median, broadcast back onto the rows. If a neighborhood
    # has no observed LotFrontage at all its median is NaN, so fall back to the
    # median of the whole column rather than leaving the gap in place.
    neighborhood_median = filled.groupby('Neighborhood')['LotFrontage'].transform('median')
    filled['LotFrontage'] = (
        filled['LotFrontage']
        .fillna(neighborhood_median)
        .fillna(filled['LotFrontage'].median())
    )

    # Row-wise fill from another column of the same frame
    filled['GarageYrBlt'] = filled['GarageYrBlt'].fillna(filled['YearBuilt'])

    return filled


train = fill_missing_values(train)
test = fill_missing_values(test)


In [None]:
# One-hot encode both frames, then keep only the columns present in both so the
# training and test matrices share an identical layout
train_encoded, test_encoded = pd.get_dummies(train).align(
    pd.get_dummies(test), join='inner', axis=1
)

# Re-attach the target that was set aside before encoding
train_encoded['SalePrice'] = train_target

# Pairwise correlations across every encoded column
corr_matrix = train_encoded.corr()

# Correlation of each column with the target, strongest positive first
sorted_corr = corr_matrix['SalePrice'].sort_values(ascending=False)
print(sorted_corr)

In [None]:
# Hand-picked subset of columns used as model inputs (chosen from the
# correlation ranking with SalePrice)
top_features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
    'YearRemodAdd',
]

# Feature matrix and target for model fitting
X, y = train_encoded[top_features], train_encoded['SalePrice']

# Same columns, in the same order, for the test set
X_test = test_encoded[top_features]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit a linear regression model on the training split (fit() returns the estimator)
lin_reg = LinearRegression().fit(X_train, y_train)

# Score it on the held-out rows: RMSE is the square root of the mean squared error
y_pred = lin_reg.predict(X_val)
mse_lin = mean_squared_error(y_val, y_pred)
rmse_lin = np.sqrt(mse_lin)
print(f'RMSE (Linear Regression): {rmse_lin}')


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest with a fixed seed on the training split
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score it on the held-out rows: RMSE is the square root of the mean squared error
y_pred_rf = rf.predict(X_val)
mse_rf = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print(f'RMSE (Random Forest): {rmse_rf}')


In [None]:
import xgboost as xgb

# Gradient-boosted trees: hyperparameters collected in one place, then unpacked
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

# Score it on the held-out rows: RMSE is the square root of the mean squared error
y_pred_xg = xg_reg.predict(X_val)
mse_xg = mean_squared_error(y_val, y_pred_xg)
rmse_xg = np.sqrt(mse_xg)
print(f'RMSE (XGBoost): {rmse_xg}')


In [None]:
import xgboost as xgb

# Compare the models trained so far on the validation split.
#
# This cell used to repeat the XGBoost training cell directly above it verbatim:
# same hyperparameters, same seed, same printed RMSE. Refitting added runtime and
# no information, so `xg_reg`, `y_pred_xg` and `rmse_xg` are left as that cell
# defined them and the validation scores are shown side by side, lowest RMSE first.
model_rmse = pd.Series(
    {
        'Linear Regression': rmse_lin,
        'Random Forest': rmse_rf,
        'XGBoost': rmse_xg,
    },
    name='Validation RMSE',
).sort_values()
print(model_rmse)


In [None]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter tuning for gradient boosting using grid search:
# every combination of the values below is tried (3 x 3 x 3 = 27 candidates)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)

# 3-fold cross-validation on the training split, scored by negated MSE
# (scikit-learn maximises scores, so a smaller error means a larger score)
base_estimator = xgb.XGBRegressor(objective='reg:squarederror')
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Estimator built from the best-scoring combination
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out validation rows
y_pred_best = best_model.predict(X_val)
mse_best = mean_squared_error(y_val, y_pred_best)
rmse_best = np.sqrt(mse_best)
print(f'Best RMSE: {rmse_best}')


In [None]:
# Score the test set with the tuned model
final_predictions = best_model.predict(X_test)

# Pair each preserved test Id with its predicted price and write the
# two-column CSV without the DataFrame index
submission = pd.DataFrame({'Id': test_id}).assign(SalePrice=final_predictions)
submission.to_csv('submission.csv', index=False)
