In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import matplotlib.pyplot as plt

# Upload Real Estate Data

In [None]:
# Load the Real_estate.csv dataset into the notebook.
# The relative path means the file must sit in the current working directory.
df = pd.read_csv('Real_estate.csv')
# Preview only the first rows instead of dumping the whole frame into the output.
df.head()

# Make Edits to Data & Check for missing Values

In [None]:
# Remove unwanted columns
# The 'No' column holds a different value in every row, so it would not help
# the training process.

print('Shape before edit:', df.shape)
# drop(..., errors='ignore') rather than `del df['No']`: the cell can then be
# re-run without raising KeyError once the column has already been removed.
df = df.drop(columns=['No'], errors='ignore')
print('Shape after edit:', df.shape)

# Check for missing values (count of NaN entries per column)
print('\nChecking for any missing values in columns')
df.isna().sum()



# Define predictor & target variables

In [None]:
# Separate the frame into the target column (Y) and every other column (X),
# both as plain NumPy arrays.
print('Dataset shape: ', df.shape)
Y = df['Y house price of unit area'].to_numpy()
X = df.drop(columns=['Y house price of unit area']).to_numpy()

# Report the shapes of the two new arrays
print('X (predictor variables) shape:       ', X.shape)
print('Y (target variable)     shape:       ', Y.shape)

# Regression Model Performance Function

In [None]:
def get_score(model, X_train, X_test, Y_train, Y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
    X_train, Y_train : data passed to ``model.fit``
    X_test, Y_test : data passed to ``model.score``

    Returns
    -------
    Whatever ``model.score`` returns (for scikit-learn regressors this is
    the R^2 coefficient of determination).
    """
    model.fit(X_train, Y_train)
    test_score = model.score(X_test, Y_test)
    return test_score

# Plotting model for graphs

In [None]:
def plot_model(X_train, X_test, Y_train, Y_test ):
    """Fit four linear models on one split and plot actual vs. predicted targets.

    LinearRegression, Lasso, Ridge and ElasticNet are created with their
    default hyper-parameters and fitted on the training data. A 2x4 grid of
    scatter plots is shown (top row: training split, bottom row: test split,
    one column per model), followed by a printed line with each model's
    mean squared error on the test split.

    Parameters
    ----------
    X_train, X_test : predictor arrays for the training / test split
    Y_train, Y_test : target values for the training / test split

    Returns
    -------
    None
    """
    # (panel title prefix, unfitted estimator, scatter marker) - one grid column each
    model_specs = (
        ('LinReg', LinearRegression(), 'o'),
        ('Lasso', linear_model.Lasso(), '^'),
        ('Ridge', linear_model.Ridge(), 's'),
        ('ElNet', linear_model.ElasticNet(), 'x'),
    )

    fig, ax = plt.subplots(2, len(model_specs), figsize=(15, 10), sharex=True, sharey=True)

    test_mse = {}
    for col, (short_name, estimator, marker) in enumerate(model_specs):
        estimator.fit(X_train, Y_train)
        y_train_pred = estimator.predict(X_train)
        y_test_pred = estimator.predict(X_test)

        ax[0, col].set_title(f'{short_name} Train')
        ax[0, col].scatter(Y_train, y_train_pred, marker=marker,
                           label=f'{short_name} Train', alpha=0.5)

        ax[1, col].set_title(f'{short_name} Test')
        ax[1, col].scatter(Y_test, y_test_pred, marker=marker, color='g',
                           label=f'{short_name} Test', alpha=0.5)
        # Axes are shared, so labelling the bottom row is enough for the x axis
        ax[1, col].set_xlabel('Actual')

        test_mse[short_name] = np.mean((y_test_pred - Y_test) ** 2)

    # ... and the left-most column is enough for the y axis
    for row in range(2):
        ax[row, 0].set_ylabel('Predicted')

    fig.suptitle('Actual vs Predicted - New Fold')
    plt.show()
    # Release the figure explicitly: this function is called once per fold
    plt.close(fig)

    lin_mse = test_mse['LinReg']
    lasso_mse = test_mse['Lasso']
    ridge_mse = test_mse['Ridge']
    elNet_mse = test_mse['ElNet']
    print(f'\n  LinearReg MSE: {lin_mse:0.3}\t\t  Lasso MSE: {lasso_mse:0.3}\t\t Ridge MSE: {ridge_mse:0.3}\t\tElasticNet MSE: {elNet_mse:0.3}\n\n')



# MSE function

In [None]:
def mse(X_train,  X_test, Y_train, Y_test, model ):
    """Fit ``model`` on the training split and return its test-split MSE.

    Parameters
    ----------
    X_train, Y_train : data passed to ``model.fit``
    X_test : data passed to ``model.predict``
    Y_test : true target values the predictions are compared against
    model : object exposing ``fit(X, y)`` and ``predict(X)``

    Returns
    -------
    Mean of the squared differences between predictions and ``Y_test``.
    """
    model.fit(X_train, Y_train)
    residuals = model.predict(X_test) - Y_test
    return np.mean(residuals ** 2)

# Split Data

In [None]:
# Hold out 20% of the rows as a test split; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=100,
)

# Algorithm for all regression models

In [None]:
# Per-fold scores returned by get_score, one list per model
ml_scores = []
lasso_scores = []
ridge_scores = []
elastic_net_scores = []

# Per-fold test MSE returned by mse, one list per model
lin_mse = []
lasso_mse = []
ridge_mse = []
elasticNet_mse = []

# (estimator class, score list, MSE list); a fresh estimator is built for each fit
fold_models = [
    (LinearRegression, ml_scores, lin_mse),
    (linear_model.Lasso, lasso_scores, lasso_mse),
    (linear_model.Ridge, ridge_scores, ridge_mse),
    (linear_model.ElasticNet, elastic_net_scores, elasticNet_mse),
]

kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):

    # Fold-local names: rebinding X_train/X_test/Y_train/Y_test here would
    # silently overwrite the hold-out split created by train_test_split.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    Y_fold_train, Y_fold_test = Y[train_index], Y[test_index]

    # Scores for models
    for make_model, score_log, _ in fold_models:
        score_log.append(get_score(make_model(), X_fold_train, X_fold_test, Y_fold_train, Y_fold_test))

    # Graphs for models
    plot_model(X_fold_train, X_fold_test, Y_fold_train, Y_fold_test)

    # MSE for models
    for make_model, _, mse_log in fold_models:
        mse_log.append(mse(X_fold_train, X_fold_test, Y_fold_train, Y_fold_test, make_model()))


print("\033[31;1;4mAccuracy measurement each Kfold iteration:\033[0m\n")
print('Linear Regression:', ml_scores, '\nLasso:            ', lasso_scores, '\nRidge:            ', ridge_scores, '\nElastic Net:      ', elastic_net_scores )
print('\n')
print("\033[31;1;4mModel Accuracy after conducting Kfold:\033[0m\n\n")
print('Linear Regression:', np.mean(ml_scores), '\nLasso Regression:', np.mean(lasso_scores),\
      '\nRidge Regression:', np.mean(ridge_scores), '\nElasticNet Regression', np.mean(elastic_net_scores))

print("\n\033[31;1;4mModel MSE after conducting Kfold:\033[0m\n\n")
print('Linear Regression MSE:', np.mean(lin_mse), '\nLasso Regression MSE:', np.mean(lasso_mse),\
      '\nRidge Regression MSE :', np.mean(ridge_mse), '\nElasticNet Regression MSE:', np.mean(elasticNet_mse))
print('\n')

In [None]:
from sklearn.model_selection import cross_val_score

print("\n\033[31;1;4mAccuracies for models verified by the sklearn.model_selection.cross_val_score function\033[0m\n\n")

# (printed label, estimator with default hyper-parameters); each label carries
# its own padding so the printed values line up.
cv_checks = (
    ("Linear Regression:\t  ", LinearRegression()),
    ("Lasso Regression:\t  ", linear_model.Lasso()),
    ("Ridge Regression:\t  ", linear_model.Ridge()),
    ("Elastic Net Regression:   ", linear_model.ElasticNet()),
)
for label, estimator in cv_checks:
    # cross_val_score returns one score per fold; report their mean
    print(label, cross_val_score(estimator, X, Y).mean())


# Tuning Parameters for Individual Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
best_acc_model = []

def model_param_tuning(X,Y):
    """Grid-search hyper-parameters of four linear models and report the best.

    For LinearRegression, Lasso, Ridge and ElasticNet a GridSearchCV is run
    over the candidate values listed below, using 5 ShuffleSplit iterations
    with a 20% test share. The best mean cross-validated score and the
    matching parameters are collected per model; the overall winner is
    printed and the full table is returned.

    Candidate parameters that the installed estimator does not expose are
    skipped. This matters for ``normalize``, which scikit-learn removed from
    these estimators in version 1.2; passing it to GridSearchCV there fails
    with an "Invalid parameter" error.

    Parameters
    ----------
    X : predictor array passed to ``GridSearchCV.fit``
    Y : target values passed to ``GridSearchCV.fit``

    Returns
    -------
    pandas.DataFrame with columns ``model``, ``best_score`` and
    ``best_params``, one row per model.
    """
    # One seed for the CV splitter and for the coordinate-descent models, so
    # that selection='random' gives repeatable results.
    seed = 100

    model_algos = {
        'Linear Regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'Lasso Regression': {
            'model': linear_model.Lasso(random_state=seed),
            'params': {
                'alpha': np.logspace(-10,1,10),
                'normalize': [True,False],
                'selection': ['random','cyclic']
            }
        },
        'Ridge Regression': {
            'model': linear_model.Ridge(),
            'params': {
                'alpha': np.logspace(-10,1,10),
                'normalize': [True,False]
            }
        },
        'ElasticNet Regression': {
            'model': linear_model.ElasticNet(random_state=seed),
            'params': {
                'alpha': np.logspace(-10,1,10),
                'normalize': [True,False],
                'selection': ['random','cyclic']
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=seed)

    for algo_name, config in model_algos.items():
        estimator = config['model']
        # Keep only the parameters this estimator version actually accepts;
        # an empty grid is valid and evaluates the estimator as configured.
        supported = estimator.get_params()
        param_grid = {
            name: values
            for name, values in config['params'].items()
            if name in supported
        }

        grid_search = GridSearchCV(estimator, param_grid, cv=cv, return_train_score=False)
        grid_search.fit(X,Y)
        scores.append({
            'model':algo_name,
            'best_score': grid_search.best_score_,
            'best_params': grid_search.best_params_
        })

    tuned = pd.DataFrame(scores,columns=['model','best_score','best_params'])
    # Row of the model whose best cross-validated score is highest
    best_model = tuned.loc[tuned['best_score'].idxmax()]
    print("\n\033[31;1;4mModel with highest accuracy\033[0m\n\n")
    print("Model name:\t\t     " , best_model['model'])
    print("Best accuracy score:\t     ", best_model['best_score'])
    print("Best parameters:\t     ", best_model['best_params'])
    print('\n')
    print("\033[31;1;4mTable with model scores after hyperparameter tuning\033[0m\n\n")
    return tuned

In [None]:
# Run the hyper-parameter search; the returned table is the cell's output.
tuning_results = model_param_tuning(X, Y)
tuning_results