# Module 9: Regression Modelling

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
import zipfile

# Unpack the downloaded archive into ./extract_folder.
# The path uses a forward slash instead of the Windows-only backslash: forward
# slashes are accepted on Windows as well as on Linux/macOS, so the cell is
# portable. The path is relative to the notebook's working directory.
with zipfile.ZipFile('Downloads/Boston Housing.zip', 'r') as zip_ref:
    zip_ref.extractall('extract_folder')

In [3]:
# Read the extracted CSV into the working DataFrame used by the later cells.
df = pd.read_csv(filepath_or_buffer='extract_folder/HousingData.csv')

In [4]:
# Preview the first five rows (5 is the default for head()).
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [11]:
# Remove every COLUMN that contains at least one missing value.
# NOTE(review): this discards whole features rather than incomplete rows; the
# preview above shows a missing LSTAT value, so LSTAT is among the columns
# removed here. Confirm that column-wise (not row-wise) dropping is intended.
df = df.dropna(axis='columns')

In [14]:
# Target is the MEDV column; every remaining column is used as a feature.
y = df['MEDV']
X = df.drop(columns='MEDV')

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Train an ordinary linear regression on the scaled training features
# (fit() returns the estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict the target for the scaled held-out features.
y_pred = model.predict(X_test_scaled)

In [20]:
import statsmodels.api as sm

In [25]:
def backward_elimination(X, y, significance_level=0.05):
    """Backward feature elimination driven by OLS p-values.

    An OLS model is fitted on all columns of ``X``. While the largest
    coefficient p-value exceeds ``significance_level``, the column with that
    p-value is removed and the model is refitted.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix. A DataFrame is copied (the caller's frame is not
        modified); a 2-D array gets generated column names
        ``feature_0 .. feature_{k-1}``.
    y : array-like or pandas.Series
        Target values. Rows are matched to ``X`` by position, not by index
        label (both indices are reset before fitting).
    significance_level : float, default 0.05
        A feature is removed while its p-value is strictly greater than this.

    Returns
    -------
    statsmodels regression results
        The last fitted model. If even the final remaining feature is above
        the threshold, the single-feature model is still returned, because an
        OLS model cannot be fitted on zero columns.

    Raises
    ------
    ValueError
        If ``X`` is neither a DataFrame nor a NumPy array, or if it has no
        feature columns.

    Notes
    -----
    The model is fitted on exactly the columns supplied: no intercept column
    is added here. Include a constant column in ``X`` if an intercept is
    wanted (it then takes part in elimination like any other column).
    """
    if isinstance(X, pd.DataFrame):
        X_df = X.copy()
    elif isinstance(X, np.ndarray):
        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
    else:
        raise ValueError("X must be either a Pandas DataFrame or a NumPy array.")

    # Without this guard a zero-column input would never fit a model and the
    # function would fail with an unrelated "unbound variable" error.
    if X_df.shape[1] == 0:
        raise ValueError("X must contain at least one feature column.")

    # Align X and y by position once, up front: dropping columns later never
    # changes the row index, so there is no need to reset it on every pass.
    X_df = X_df.reset_index(drop=True)
    y_df = pd.Series(y, name='MEDV').reset_index(drop=True)

    model = sm.OLS(y_df, X_df).fit()
    # Series.max() skips NaN p-values, matching idxmax() below; the builtin
    # max() gives order-dependent results when NaNs are present.
    while model.pvalues.max() > significance_level:
        exclude_feature = model.pvalues.idxmax()
        reduced = X_df.drop(exclude_feature, axis=1)
        if reduced.shape[1] == 0:
            # Nothing left to fit on; keep the last model that could be fitted.
            break
        X_df = reduced
        model = sm.OLS(y_df, X_df).fit()

    return model
