In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the housing CSV into a DataFrame.
# NOTE(review): the path looks like a Colab Drive mount — confirm it exists in your environment.
DATA_PATH = "/content/drive/MyDrive/Dataset/HousingData.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Rename the MEDV column to 'Price', then preview the first rows.
df = df.rename(columns={'MEDV': 'Price'})
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

In [None]:
# NOTE(review): same preview as the earlier df.head() call; kept as-is.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Total number of cells in the frame (rows * columns).
df.size

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

## Handling Null Values

The dataset contains missing (null) values. To handle them, we have two main approaches:  

1. **Drop Rows**: Remove rows containing null values.  
2. **Impute Missing Values**: Replace null values with appropriate statistical measures:  
   - **Numerical Columns**: Use **mean** or **median** for replacement.  
   - **Categorical Columns**: Use **mode** (most frequent value) for replacement.  

Using these strategies ensures that the dataset remains clean and useful for analysis and modeling.

In [None]:
# Median-impute every column whose dtype is not 'object'; object columns are left untouched.
non_object_cols = [name for name in df.columns if df[name].dtype != 'object']
for name in non_object_cols:
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

# Confirm how many missing values remain per column.
df.isna().sum()


In [None]:
# Number of rows that duplicate an earlier row.
df.duplicated().sum()

In [None]:

# Count infinite values per numeric column.
# np.isinf is only defined for numeric data, so restrict the check to numeric
# columns; otherwise a single object column would make this cell raise.
np.isinf(df.select_dtypes(include=np.number)).sum()


In [None]:
# Split the column names by dtype: numeric columns vs. object (string-like) columns.
numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns

print("Number of numeric cols: ", len(numeric_cols))
print("Number of categorical cols: ", len(categorical_cols))

In [None]:
# Per-column overview: name, number of distinct values (NaN counted as a value,
# matching len(unique())), the distinct values themselves, and the dtype.
check_dic = {
    "col_name": list(df.columns),
    "no of unique values": [df[col].nunique(dropna=False) for col in df.columns],
    "unique_value": [df[col].unique() for col in df.columns],
    "col_types": [df[col].dtype for col in df.columns],
}
check_df = pd.DataFrame(check_dic)
check_df


# Data Visualization
## Exploring Categorical Variables

In this dataset, we can consider both **CHAS** and **RAD** as categorical variables:  

- **CHAS**: A binary variable indicating whether the property is next to the Charles River (**1 = Yes, 0 = No**).  
- **RAD**: An index representing accessibility to radial highways, which can be treated as a categorical feature due to its discrete nature.  

Properly handling these categorical variables can improve model performance in predictive tasks.

In [None]:
# One bar chart of value counts per categorical-like column.
cols = ['CHAS', 'RAD']
for col in cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Bar Chart of {col}')
    ax.grid(True)
    plt.show()

In [None]:
# Explore Numerical Values
# df.hist draws one subplot per column. plt.title would only label the last
# subplot, so use suptitle to title the whole figure.
df.hist(figsize=(20, 25))
plt.suptitle('Histograms of features')
plt.show()

## Handling Skewed Data

The dataset exhibits some skewness, which can negatively impact model performance. To address this, we can apply various transformation techniques:  

### **Techniques to Handle Skewness**
- **Log Transformation**: Useful for right-skewed distributions (e.g., `np.log1p(column)`).  
- **Square Root Transformation**: Reduces skewness while preserving relationships.

Applying the appropriate transformation can help normalize the data and improve model performance.


In [None]:
# Two boxplots per row; round the row count up so an odd number of columns
# still gets a slot. (numeric_cols is an Index and has no `.columns` attribute.)
n_numeric = len(numeric_cols)
rows = (n_numeric + 1) // 2

plt.figure(figsize=(10, rows * 4))

for i, col in enumerate(numeric_cols):
    plt.subplot(rows, 2, i + 1)
    sns.boxplot(y=df[col])
    plt.ylabel(col)
    plt.title(f"Boxplot of column  {col}")

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation matrix of the frame, drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Absolute correlations, so strength can be compared regardless of sign.
corr.abs()

In [None]:
# Share of IQR-rule outliers per column. A value is an outlier only when it
# lies strictly outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. With non-strict
# comparisons, a column whose IQR is 0 would report every row as an outlier.
for key, value in df.items():
    qua1 = value.quantile(0.25)
    qua3 = value.quantile(0.75)
    iqr = qua3 - qua1
    lower_fence = qua1 - 1.5 * iqr
    upper_fence = qua3 + 1.5 * iqr
    is_outlier = (value < lower_fence) | (value > upper_fence)
    percentage = is_outlier.sum() * 100.0 / len(df)
    print("Column %s outliers = %.2f%%" % (key, percentage))

## Handling Outliers in the Boston Housing Dataset  

During our analysis, we discovered that **a significant portion of the dataset (~283 rows out of 506)** was identified as outliers using the **Interquartile Range (IQR) method**.  
Since removing such a large portion of data could lead to **data loss and reduced model performance**, we opted for a more robust approach.  

### **Our Approach to Handling Outliers:**  
1. **Log Transformation**:  
   - Many numerical features, such as `CRIM`, `LSTAT`, and `B`, exhibit **high skewness**.  
   - We apply a **log transformation** (`log(1 + x)`) to reduce the impact of extreme values.  

2. **Robust Scaling**:  
   - Instead of using standard normalization methods (like MinMax or StandardScaler), we apply **RobustScaler**, which scales data using the **median and IQR**.  
   - This makes our dataset **more stable** for machine learning models by reducing the influence of outliers.  

### **Why This Works?**  

* Preserves valuable data instead of removing outliers.
* Makes the distribution more **normal-like**, improving model performance.
* Reduces the impact of extreme values, ensuring robust feature scaling.






In [None]:
# Define skewed columns
skewed_cols = ["AGE", "DIS", "CRIM", "LSTAT", "B"]

# Apply log1p, i.e. log(1 + x). It is exact at 0, whereas log(x + 1e-10)
# would map a zero to about -23 and create an artificial extreme value.
# NOTE: this overwrites the columns in place, so re-running the cell
# transforms them again; reload the data before re-running.
df[skewed_cols] = np.log1p(df[skewed_cols])

# Plot histograms of the transformed columns only, matching the figure title
df[skewed_cols].hist(figsize=(18, 10))
plt.suptitle("Histogram of Skewed Columns After Log Transformation", fontsize=16)
plt.show()

In [None]:
# Pair plot of RM against Price.
df_univariate = df.loc[:, ['RM', 'Price']]
sns.pairplot(data=df_univariate)

In [None]:
# Pair plot of a hand-picked subset of features together with Price.
pairplot_columns = ['INDUS', 'NOX', 'RM', 'PTRATIO', 'LSTAT', 'Price']
df_displot = df.loc[:, pairplot_columns]
sns.pairplot(data=df_displot)

In [None]:
# CHAS and RAD are handled as categorical; every other column except the
# target ('Price') is handled as numerical.
categorical_cols = ['CHAS', 'RAD']
non_numerical = ['CHAS', 'RAD', 'Price']
numerical_cols = [name for name in df.columns if name not in non_numerical]


We apply **multiple sequential transformations** to numerical features: median imputation followed by robust scaling (the log transformation of the skewed columns was already applied directly to the DataFrame above). A `Pipeline` is ideal for this because it ensures that each step is executed **in order**, preventing data leakage and maintaining consistency.  

For categorical features, we apply **different transformations to different categorical columns** (e.g., One-Hot Encoding for `CHAS` and Ordinal Encoding for `RAD`). To handle this efficiently, we use a `ColumnTransformer`, which allows us to apply **specific transformations to specific groups of columns** within a single step.

In [None]:
# Numerical pipeline
numerical_pipline = Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",RobustScaler())
])

# Categorical pipeline
categorical_pipeline = ColumnTransformer([
    ("chas",OneHotEncoder(handle_unknown="ignore",drop='first'),['CHAS']),
    ('rad',OrdinalEncoder(),['RAD'])
])

# Combine both pipelines
preprocessor = ColumnTransformer([
    ("cat",categorical_pipeline,categorical_cols),
    ("num",numerical_pipline,numerical_cols)
])

In [None]:
# Column names currently in the frame.
df.columns

In [None]:
# Target is 'Price'; the features are every other column.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessor on the training split only, then apply it to both splits.
# NOTE: X_train / X_test are overwritten with arrays here, so re-run the split
# cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Take the feature names from the fitted preprocessor. The ColumnTransformer
# emits the categorical block before the numerical block, so the original
# DataFrame column order does not match the columns of the transformed arrays.
feature_names = preprocessor.get_feature_names_out()

In [None]:
# Initialize results dictionary for regression (one list per metric, one entry per model)
results = {
    "Model": [],
    "MSE": [],
    "RMSE": [],
    "MAE": [],
    "R2 Score": [],
    "CV R2 Score": []
}

# Function to record regression results
def append_results_regression(model_name, mse, rmse, mae, r2, cv_r2):
    """Record one model's metrics in the module-level ``results`` dictionary.

    If ``model_name`` is already recorded, its existing entry is overwritten
    instead of adding a second row, so re-running a notebook cell does not
    duplicate the model in the comparison table.

    Parameters
    ----------
    model_name : str
        Label stored under "Model".
    mse, rmse, mae, r2 : float
        Metrics stored under "MSE", "RMSE", "MAE" and "R2 Score".
    cv_r2 : float or None
        Value stored under "CV R2 Score".
    """
    row = {
        "Model": model_name,
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "R2 Score": r2,
        "CV R2 Score": cv_r2,
    }
    if model_name in results["Model"]:
        # Same model recorded before: update that row in place.
        position = results["Model"].index(model_name)
        for key, value in row.items():
            results[key][position] = value
    else:
        for key, value in row.items():
            results[key].append(value)

In [None]:
def model_evaluation_regression(y_pred, y_test, model, X_train, y_train):
    """Score regression predictions, print the metrics and preview the first 20 rows.

    Parameters
    ----------
    y_pred : predicted target values for the held-out set.
    y_test : true target values for the held-out set.
    model : the estimator; it is cross-validated only if it has ``get_params``.
    X_train, y_train : training data handed to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2, cross_score)`` where ``cross_score`` is the
        mean 5-fold R2 on the training data, or ``None`` when the model has
        no ``get_params`` attribute.
    """
    # Hold-out metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validate only objects that look like sklearn estimators
    cross_score = None
    if hasattr(model, "get_params"):
        fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
        cross_score = fold_scores.mean()

    # Report
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")
    if cross_score is not None:
        print(f"Cross-Validation R2 Score: {cross_score:.4f}")

    # Side-by-side preview of predictions and actual values
    comparison = pd.DataFrame(
        np.column_stack((y_pred, y_test)),
        columns=["Prediction", "Actual"],
    )
    display(comparison.head(20))

    return mse, rmse, mae, r2, cross_score

In [None]:
def plot_regression_results(y_test, y_pred):
    """Draw three stacked diagnostic plots for a regression model.

    The panels are: true vs. predicted values with a dashed identity line,
    residuals vs. predicted values with a dashed zero line, and a histogram
    (with KDE) of the residuals, where residuals are ``y_test - y_pred``.
    """
    residuals = y_test - y_pred
    lowest, highest = y_test.min(), y_test.max()

    fig, (ax_fit, ax_resid, ax_hist) = plt.subplots(3, 1, figsize=(10, 15))

    # Panel 1: true vs. predicted
    ax_fit.scatter(y_test, y_pred, alpha=0.7, edgecolor="k")
    ax_fit.plot([lowest, highest], [lowest, highest], 'r--')
    ax_fit.set_xlabel("True Values")
    ax_fit.set_ylabel("Predicted Values")
    ax_fit.set_title("True vs. Predicted Values")

    # Panel 2: residuals vs. predicted
    ax_resid.scatter(y_pred, residuals, alpha=0.7, edgecolor="k")
    ax_resid.axhline(0, color="r", linestyle="--")
    ax_resid.set_xlabel("Predicted Values")
    ax_resid.set_ylabel("Residuals")
    ax_resid.set_title("Residuals vs. Predicted Values")

    # Panel 3: residual distribution
    sns.histplot(residuals, kde=True, color="b", bins=20, ax=ax_hist)
    ax_hist.set_xlabel("Residuals")
    ax_hist.set_title("Distribution of Residuals")

    plt.tight_layout()
    plt.show()


# Models We Will Apply:

1. **Linear Regression**  
   A simple and interpretable model that assumes a linear relationship between features and the target variable.

2. **Random Forest Regressor**  
   An ensemble learning method that builds multiple decision trees and averages their predictions to improve accuracy and reduce overfitting.

3. **XGBoost Regressor**  
   An optimized gradient boosting algorithm that is known for high performance and efficiency in structured data.

4. **SVM Regressor**  
   Uses Support Vector Machines for regression, aiming to find the best-fit hyperplane within a certain margin of error.

5. **Ensemble Learning**  
   A combination of multiple models to enhance predictive performance by leveraging the strengths of each approach.


## Linear Regression


In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the preprocessed training data and predict the test set.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Label the coefficients with the names reported by the fitted preprocessor.
# The preprocessor reorders columns (categorical block first), so the original
# DataFrame column order would pair coefficients with the wrong features.
coeffcients = pd.DataFrame({
    'Attribute': preprocessor.get_feature_names_out(),
    'Coefficients': lr.coef_,
})
coeffcients

In [None]:
# Evaluate Linear Regression: prints the metrics and returns
# (mse, rmse, mae, r2, cross-validated r2).
lr_mse, lr_rmse, lr_mae, lr_r2, lr_cross_score=model_evaluation_regression(y_pred_lr, y_test, lr, X_train, y_train)


In [None]:
# Record the Linear Regression metrics in the shared `results` dictionary
append_results_regression('Linear Regression', lr_mse, lr_rmse, lr_mae, lr_r2, lr_cross_score)


In [None]:
# Diagnostic plots for Linear Regression (fit, residual scatter, residual histogram)
plot_regression_results(y_test, y_pred_lr)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the forest (and every metric derived from it) is reproducible
# across runs; 42 matches the seed used for the train/test split.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Feature importances, labelled with the names reported by the fitted
# preprocessor so that each label matches its transformed column.
importance = rf.feature_importances_
importance_df = pd.DataFrame({
    'Column': preprocessor.get_feature_names_out(),
    'Importance': importance,
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df

In [None]:
# Evaluate the Random Forest: prints the metrics and returns
# (mse, rmse, mae, r2, cross-validated r2).
rf_mse, rf_rmse, rf_mae, rf_r2, rf_cross_score=model_evaluation_regression(y_pred_rf, y_test, rf, X_train, y_train)


In [None]:
# Record the Random Forest metrics in the shared `results` dictionary
append_results_regression('Random Forest Regressor', rf_mse, rf_rmse, rf_mae, rf_r2, rf_cross_score)


In [None]:
# Diagnostic plots for the Random Forest predictions
plot_regression_results(y_test, y_pred_rf)


## XGBRegressor

In [None]:
from xgboost import XGBRegressor

# Fit gradient-boosted trees on the preprocessed training data and predict the test set.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Feature importances, labelled with the names reported by the fitted
# preprocessor so that each label matches its transformed column.
importance = xgb.feature_importances_
importance_df = pd.DataFrame({
    'Column': preprocessor.get_feature_names_out(),
    'Importance': importance,
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df

In [None]:
# Evaluate XGBRegressor: prints the metrics and returns
# (mse, rmse, mae, r2, cross-validated r2).
xgb_mse, xgb_rmse, xgb_mae, xgb_r2, xgb_cross_score=model_evaluation_regression(y_pred_xgb, y_test, xgb, X_train, y_train)


In [None]:
# Record the XGBRegressor metrics in the shared `results` dictionary
append_results_regression('XGBRegressor', xgb_mse, xgb_rmse, xgb_mae, xgb_r2, xgb_cross_score)


In [None]:
# Diagnostic plots for the XGBRegressor predictions
plot_regression_results(y_test, y_pred_xgb)

## SVM Regressor

In [None]:
# Import the estimator class directly, so the fitted model below does not
# rebind (shadow) the imported `sklearn.svm` module name.
from sklearn.svm import SVR

# The variable keeps the name `svm` because later cells refer to it.
svm = SVR()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [None]:
# Evaluate the SVM regressor: prints the metrics and returns
# (mse, rmse, mae, r2, cross-validated r2).
svm_mse, svm_rmse, svm_mae, svm_r2, svm_cross_score=model_evaluation_regression(y_pred_svm, y_test, svm, X_train, y_train)


In [None]:
# Record the SVM regressor metrics in the shared `results` dictionary
append_results_regression('SVM Regressor', svm_mse, svm_rmse, svm_mae, svm_r2, svm_cross_score)


In [None]:
# Diagnostic plots for the SVM regressor predictions
plot_regression_results(y_test, y_pred_svm)

In [None]:
# Comparison table of every recorded model, highest test-set R2 first.
results_df = pd.DataFrame(results)
results_df.sort_values(by='R2 Score', ascending=False)

In [None]:
  # Shell commands: set the global git author e-mail and name.
  # NOTE(review): a personal identity is hard-coded in the notebook here —
  # consider keeping this out of the shared analysis.
  !git config --global user.email "noumanyousuf0485@gmail.com"
  !git config --global user.name "NoumanYousaf14"



# !git commit -a -m "Boston house data set and price prediction model is done"
