# Model Building and Training
## Here:
- Choose appropriate features for the model.
- Train a linear regression model.
- Perform hyperparameter tuning (if applicable).
- Script: scripts/train_model.py (this file contains the reusable code and functions that we use below)

#### Settings to make scripts in scripts/ accessible.  
- This creates a `.vscode` directory in the project root that includes the root path, as well as a `__pycache__` directory in the `scripts` directory.

In [1]:
import sys
import os

# Make the helper modules in ../scripts (e.g. train_model) importable.
# The relative path is resolved against the current working directory.
scripts_dir = os.path.abspath(os.path.join("..", "scripts"))
sys.path.append(scripts_dir)

#### Necessary imports

In [2]:
import pandas as pd
import train_model as tm

#### Load data

In [3]:
# Read the pre-split train/test features and targets from ../data.
# NOTE(review): the target files differ in casing ("y_train.csv" vs
# "Y_test.csv"); verify both names on a case-sensitive filesystem.
X_train, Y_train, X_test, Y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/y_train.csv',
        '../data/X_test.csv',
        '../data/Y_test.csv',
    )
)

In [4]:
# Show the (rows, columns) shape of the training features and the training target.

print(*(frame.shape for frame in (X_train, Y_train)))

(170, 13) (170, 1)


#### Feature Selection

In [5]:
# Pairwise Pearson correlation between the training features.
# NOTE(review): the CHAS row/column is entirely NaN in the output below;
# presumably CHAS is constant in this training split, confirm before relying on it.
correlation_matrix = X_train.corr(method="pearson")
correlation_matrix

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
CRIM,1.0,-0.26977,0.277244,,0.436256,-0.125216,0.323604,-0.229179,0.090253,0.215644,0.246936,-0.124268,0.222207
ZN,-0.26977,1.0,-0.362244,,-0.516714,0.308285,-0.316202,0.524076,0.172267,-0.104303,-0.354895,-0.01537,-0.297832
INDUS,0.277244,-0.362244,1.0,,0.579277,-0.233399,0.352507,-0.48808,-0.167613,0.330952,0.301719,-0.05373,0.468492
CHAS,,,,,,,,,,,,,
NOX,0.436256,-0.516714,0.579277,,1.0,-0.233678,0.666089,-0.742936,-0.055803,0.356689,0.371624,0.12301,0.508528
RM,-0.125216,0.308285,-0.233399,,-0.233678,1.0,-0.108743,0.1235,0.045398,-0.115962,-0.194973,-0.102193,-0.601831
AGE,0.323604,-0.316202,0.352507,,0.666089,-0.108743,1.0,-0.533785,-0.047036,0.197536,0.2674,0.069782,0.605447
DIS,-0.229179,0.524076,-0.48808,,-0.742936,0.1235,-0.533785,1.0,0.113598,-0.311967,-0.270991,-0.161431,-0.31414
RAD,0.090253,0.172267,-0.167613,,-0.055803,0.045398,-0.047036,0.113598,1.0,0.303777,-0.067178,-0.200184,-0.032842
TAX,0.215644,-0.104303,0.330952,,0.356689,-0.115962,0.197536,-0.311967,0.303777,1.0,0.158029,-0.042828,0.288644


### Create a model

In [6]:
# Build the regression estimator with the project helper from scripts/train_model.py.
# NOTE(review): presumably an unfitted linear regression model; confirm in train_model.py.
model = tm.create_regression_model()
model  # last expression in the cell, so the notebook displays the model

#### Train a linear regression model

In [7]:
# Fit `model` on the training features and target using the project helper.
# NOTE(review): the return value is not captured and later cells reuse `model`,
# so the helper is assumed to fit the estimator in place; confirm in train_model.py.
tm.train_regression_model(X_train, Y_train,  model)

#### Predict on testing set

In [None]:
# Predict the target variable (MEDV in our case) from the test-set features.
# Once we have the predictions, we can compare them to the actual target values.

y_pred = tm.y_pred(X_test, model)
y_pred

array([[21.9822989 ],
       [18.43332959],
       [22.45322728],
       [23.23217163],
       [21.27813189],
       [20.52682879],
       [20.28426499],
       [21.65225849],
       [20.81707966],
       [23.14057591],
       [23.19478308],
       [21.86818814],
       [21.55746421],
       [24.12809855],
       [23.15171842],
       [21.75274604],
       [23.50908524],
       [21.87559201],
       [24.90508266],
       [23.39280735],
       [22.75858298],
       [22.55508504],
       [22.89461787],
       [23.15432987],
       [23.96813581],
       [22.66042179],
       [23.17424548],
       [21.05472726],
       [22.24477164],
       [21.65971741],
       [22.78290271],
       [22.07932202],
       [24.29865774],
       [26.02703655],
       [21.55946288],
       [21.29729559],
       [22.41103445],
       [22.07828677],
       [25.73600971],
       [21.78969814],
       [23.48675587],
       [20.99489392]])

#### Saving the y_pred to csv file

In [11]:
# Wrap the predictions in a DataFrame and write them to ../data without the row index.
# No column names are supplied, so pandas assigns its default column labels.
y_pred_df = pd.DataFrame(data=y_pred)
y_pred_df.to_csv(path_or_buf='../data/y_pred_df.csv', index=False)

#### Note: For basic linear regression, there are no hyperparameters to tune, since the model simply fits the data with a least-squares solution.