In [1]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the Boston housing data and assemble one frame: feature columns plus 'Price'.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn -- confirm the pinned version.
boston = load_boston()
data = pd.DataFrame(data=boston.data, columns=boston.feature_names)
target = pd.DataFrame(data=boston.target, columns=['Price'])
df = pd.concat(objs=[data, target], axis='columns')

In [10]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [11]:
# Persist the assembled frame next to the notebook.
# The row index is written as well (to_csv default), so the file gains a leading unnamed column.
df.to_csv(path_or_buf='boston.csv')

In [13]:
# Separate the predictors (every column except 'Price') from the response column.
X = df.drop(columns='Price')
y = df['Price']

In [14]:
# Split the data into train/test: 20% of the rows are held out, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Linear Regression

In [19]:
# Baseline: ordinary least-squares regression fitted on the training split.
# fit() is left as the final expression so the cell still displays the estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [20]:
# Score the baseline model: R2 of its predictions on the held-out test split.
y_pred = model.predict(X=X_test)
r2_model = r2_score(y_true=y_test, y_pred=y_pred)
print('R2 Score of the base model is: ', r2_model)

R2 Score of the base model is:  0.5892223849182503


### Ridge & Lasso

In [22]:
# Apply Ridge Regression and report its R2 on the held-out test split.
# The regularisation strength is defined once and reused in the message, so the
# printed alpha can never drift from the one actually used to fit the model.
ridge_alpha = 1.0
ridge_model = Ridge(alpha=ridge_alpha)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print(f'R2 Score of the ridge model with alpha {ridge_alpha} is: ', r2_model_ridge)

R2 Score of the ridge model with alpha 1.0 is:  0.5796111714164923


In [23]:
# Apply Ridge Regression with a weaker penalty and report its R2 on the test split.
# The regularisation strength is defined once and reused in the message, so the
# printed alpha can never drift from the one actually used to fit the model.
ridge_alpha = 0.5
ridge_model = Ridge(alpha=ridge_alpha)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print(f'R2 Score of the ridge model with alpha {ridge_alpha} is: ', r2_model_ridge)

R2 Score of the ridge model with alpha 0.5 is:  0.5836498210820439


In [24]:
# Apply Ridge Regression with alpha 0.3 and report its R2 on the test split.
# The regularisation strength is defined once and reused in the message, so the
# printed alpha can never drift from the one actually used to fit the model.
ridge_alpha = 0.3
ridge_model = Ridge(alpha=ridge_alpha)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print(f'R2 Score of the ridge model with alpha {ridge_alpha} is: ', r2_model_ridge)

R2 Score of the ridge model with alpha 0.3 is:  0.5856952293627944


In [29]:
# Apply Ridge Regression with alpha 0.1 and report its R2 on the test split.
# The regularisation strength is defined once and reused in the message, so the
# printed alpha can never drift from the one actually used to fit the model.
ridge_alpha = 0.1
ridge_model = Ridge(alpha=ridge_alpha)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print(f'R2 Score of the ridge model with alpha {ridge_alpha} is: ', r2_model_ridge)

R2 Score of the ridge model with alpha 0.1 is:  0.5880003503393505


In [30]:
# Apply Lasso Regression and report its R2 on the held-out test split.
# The regularisation strength is defined once and reused in the message, so the
# printed alpha can never drift from the one actually used to fit the model.
# lasso_model is reused by later cells to inspect its coefficients.
lasso_alpha = 1.0
lasso_model = Lasso(alpha=lasso_alpha)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
r2_model_lasso = r2_score(y_test, y_pred_lasso)
print(f'R2 Score of the lasso model with alpha {lasso_alpha} is: ', r2_model_lasso)

R2 Score of the lasso model with alpha 1.0 is:  0.48789271561192604


In [31]:
# Re-display the first five rows (head() defaults to n=5) for reference.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [34]:
# Find the features whose Lasso coefficient is exactly zero.
is_zero_coef = lasso_model.coef_ == 0
bad_features = np.where(is_zero_coef)[0]  # positional indices into X.columns
print('Features with bad slope results', list(X.columns[bad_features]))

Features with bad slope results ['INDUS', 'CHAS', 'NOX']


In [35]:
# Drop the zero-coefficient features from both splits.
X_train_filtered = X_train.drop(columns=X_train.columns[bad_features])
X_test_filtered = X_test.drop(columns=X_test.columns[bad_features])

# Linear regression on the reduced feature set: fit, predict, score, report.
model_filtered_lr = LinearRegression()
model_filtered_lr.fit(X_train_filtered, y_train)
y_pred_lrfiltered = model_filtered_lr.predict(X_test_filtered)
r2_model_lrfiltered = r2_score(y_test, y_pred_lrfiltered)
print('R2 Score of the new Linear regression model is: ', r2_model_lrfiltered)

# Lasso (alpha=0.1) on the same reduced feature set: fit, predict, score, report.
model_filtered_lasso = Lasso(alpha=0.1)
model_filtered_lasso.fit(X_train_filtered, y_train)
y_pred_lassofiltered = model_filtered_lasso.predict(X_test_filtered)
r2_model_lassofiltered = r2_score(y_test, y_pred_lassofiltered)
print('R2 Score of the new Lasso regression model is: ', r2_model_lassofiltered)

R2 Score of the new Linear regression model is:  0.5558354547335729
R2 Score of the new Lasso regression model is:  0.5542304065052207


In [40]:
# Positional indices of Lasso coefficients whose magnitude is strictly below 0.05
# (this includes the coefficients that are exactly zero).
small_coefficients = np.where(np.abs(lasso_model.coef_) < 0.05)[0]

In [41]:
# Display the positional indices of the low-magnitude coefficients.
small_coefficients

array([ 2,  3,  4,  6,  9, 11], dtype=int64)

In [42]:
# Translate the positional indices into column names and report them.
low_coef_feature_names = X.columns[small_coefficients].tolist()
print("Features with very low coefficients: ", low_coef_feature_names)

Features with very low coefficients:  ['INDUS', 'CHAS', 'NOX', 'AGE', 'TAX', 'B']


In [45]:
# Build train/test frames without the low-coefficient features.
X_train_new = X_train.drop(columns=X_train.columns[small_coefficients])
X_test_new = X_test.drop(columns=X_test.columns[small_coefficients])

In [47]:
# Fit Lasso (alpha=0.1) on the reduced feature set and score it on the test split.
model_filtered_small = Lasso(alpha=0.1).fit(X_train_new, y_train)

y_pred_new = model_filtered_small.predict(X_test_new)
r2_model_new = r2_score(y_true=y_test, y_pred=y_pred_new)
print('R2 Score of the new Lasso regression model is: ', r2_model_new)

R2 Score of the new Lasso regression model is:  0.5163287074198906


In [55]:
# Exhaustive search: for every non-empty subset of the low-coefficient features,
# drop that subset, refit a plain linear regression on the training split and keep
# the subset that yields the highest R2 on the test split.
#
# NOTE(review): the subset is chosen by its test-split score, so the reported
# "best" R2 is optimistically biased; the untouched feature set (dropping nothing)
# is also not part of the comparison.

# Start from -inf rather than 0: R2 can be negative, and with a 0 floor a run in
# which no subset scores above 0 would leave best_column_combination as None and
# crash on the final print.
best_r2_score = float("-inf")
best_column_combination = None

# Iterate through all possible combinations of columns to drop
for r in range(1, len(small_coefficients) + 1):
    for subset in combinations(small_coefficients, r):
        # Positional indices -> column labels to drop from each split
        positions = list(subset)
        X_train_subset = X_train.drop(columns=X_train.columns[positions])
        X_test_subset = X_test.drop(columns=X_test.columns[positions])

        # Apply linear regression on the remaining columns
        model_subset = LinearRegression()
        model_subset.fit(X_train_subset, y_train)

        # R2 on the test split for this candidate
        r2_subset = model_subset.score(X_test_subset, y_test)

        # Keep the candidate only when it strictly improves on the best so far
        if r2_subset > best_r2_score:
            best_r2_score = r2_subset
            best_column_combination = subset

if best_column_combination is None:
    # Reached when there were no candidate columns (or no score was comparable).
    print("Best R2 score: n/a (no column combination was selected)")
else:
    print("Best R2 score:", best_r2_score)
    # Cast to plain ints so the output does not depend on numpy's scalar repr.
    print("Best column combination:", [int(i) for i in best_column_combination])

Best R2 score: 0.5896890888383526
Best column combination: [6]
