# Task 3

## 1. Data loading nad Initial Exploration

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV


In [26]:
df = pd.read_csv(r"C:\Users\patro\Downloads\House Price Prediction - Task3\Housing.csv")
print(df)

        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no   

    hotwaterheating aircond

In [28]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   price                            545 non-null    int64
 1   area                             545 non-null    int64
 2   bedrooms                         545 non-null    int64
 3   bathrooms                        545 non-null    int64
 4   stories                          545 non-null    int64
 5   mainroad                         545 non-null    int64
 6   guestroom                        545 non-null    int64
 7   basement                         545 non-null    int64
 8   hotwaterheating                  545 non-null    int64
 9   airconditioning                  545 non-null    int64
 10  parking                          545 non-null    int64
 11  prefarea                         545 non-null    int64
 12  furnishingstatus_semi-furnished  545 non-null    b

## 2. Data preprocessing

In [30]:
# Convert yes/no columns to binary (1/0)
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                 'airconditioning', 'prefarea']
df[binary_columns] = df[binary_columns].apply(lambda x: x.map({'yes': 1, 'no': 0}))

# One-hot encode furnishingstatus
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

# Separate features and target
X = df.drop('price', axis=1)
y = df['price']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing pipeline
numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_features = []  # Already handled the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Add preprocessing to models
models = {
    'Linear Regression': Pipeline(steps=[('preprocessor', preprocessor),
                                        ('regressor', LinearRegression())]),
    'Ridge Regression': Pipeline(steps=[('preprocessor', preprocessor),
                                       ('regressor', Ridge())]),
    'Lasso Regression': Pipeline(steps=[('preprocessor', preprocessor),
                                       ('regressor', Lasso())]),
    'Decision Tree': Pipeline(steps=[('preprocessor', preprocessor),
                                    ('regressor', DecisionTreeRegressor(random_state=42))]),
    'Random Forest': Pipeline(steps=[('preprocessor', preprocessor),
                                    ('regressor', RandomForestRegressor(random_state=42))]),
    'Gradient Boosting': Pipeline(steps=[('preprocessor', preprocessor),
                                        ('regressor', GradientBoostingRegressor(random_state=42))]),
    'Support Vector': Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', SVR())]),
    'K-Neighbors': Pipeline(steps=[('preprocessor', preprocessor),
                                  ('regressor', KNeighborsRegressor())])
}

KeyError: "None of [Index(['furnishingstatus'], dtype='object')] are in the [columns]"

## 3. Model Training and Evaluation

In [31]:
results = {}

for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
    
    # Print results
    print(f"{name}:")
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R2: {r2:.4f}")
    print("----------")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T
print(results_df.sort_values(by='R2', ascending=False))

Linear Regression:
  MSE: 2292721545725.37
  RMSE: 1514173.55
  MAE: 1127483.35
  R2: 0.5464
----------
Ridge Regression:
  MSE: 2293867305607.65
  RMSE: 1514551.85
  MAE: 1127710.02
  R2: 0.5462
----------
Lasso Regression:
  MSE: 2292722393161.34
  RMSE: 1514173.83
  MAE: 1127483.49
  R2: 0.5464
----------
Decision Tree:
  MSE: 4445642483998.37
  RMSE: 2108469.23
  MAE: 1445408.81
  R2: 0.1205
----------
Random Forest:
  MSE: 2572002873789.32
  RMSE: 1603746.51
  MAE: 1143365.71
  R2: 0.4912
----------
Gradient Boosting:
  MSE: 2396960086090.83
  RMSE: 1548211.90
  MAE: 1122130.09
  R2: 0.5258
----------
Support Vector:
  MSE: 5567921966485.06
  RMSE: 2359644.46
  MAE: 1763886.64
  R2: -0.1016
----------
K-Neighbors:
  MSE: 2690069586534.35
  RMSE: 1640143.16
  MAE: 1178506.26
  R2: 0.4678
----------
                            MSE          RMSE           MAE        R2
Linear Regression  2.292722e+12  1.514174e+06  1.127483e+06  0.546406
Lasso Regression   2.292722e+12  1.514174e+06 

## 4. Hyperparameter Tuning for Top Models

In [32]:
# Tune Random Forest
rf_param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

rf_grid = GridSearchCV(models['Random Forest'], rf_param_grid, cv=5, scoring='neg_mean_squared_error')
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

# Tune Gradient Boosting
gb_param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5]
}

gb_grid = GridSearchCV(models['Gradient Boosting'], gb_param_grid, cv=5, scoring='neg_mean_squared_error')
gb_grid.fit(X_train, y_train)
best_gb = gb_grid.best_estimator_

# Evaluate tuned models
tuned_models = {
    'Tuned Random Forest': best_rf,
    'Tuned Gradient Boosting': best_gb
}

for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{name}:")
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R2: {r2:.4f}")
    print("----------")

Tuned Random Forest:
  MSE: 2462244528644.53
  RMSE: 1569154.08
  MAE: 1128484.02
  R2: 0.5129
----------
Tuned Gradient Boosting:
  MSE: 2623446492447.14
  RMSE: 1619705.68
  MAE: 1204721.39
  R2: 0.4810
----------


## 5. Feature Importance Analysis

In [33]:
# Get feature importances from the best model
best_model = best_gb  # or best_rf if it performed better
feature_importances = best_model.named_steps['regressor'].feature_importances_

# Get feature names
feature_names = numeric_features + list(X.columns[len(numeric_features):])

# Create a DataFrame for visualization
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances')
plt.tight_layout()
plt.show()

ValueError: All arrays must be of the same length