# Model Building and Training
## Here:
- Choose appropriate features for the model.
- Train a linear regression model.
- Perform hyperparameter tuning (if applicable).
- Script: scripts/train_model.py (this file contains the reusable code and functions used below)

#### Settings to make scripts in scripts/ accessible.  
- This creates a `.vscode` folder in the project root that includes the root path, as well as a `__pycache__` folder in the `scripts` directory.

In [25]:
import sys
import os

# Add the project's scripts directory (not the project root itself) to the
# module search path so that `train_model` can be imported below. The path is
# resolved relative to the current working directory, so this assumes the
# notebook is run from its own folder.
scripts_dir = os.path.abspath(os.path.join("..", "scripts"))
# Guard the append so re-running this cell does not pile up duplicate entries.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

#### Necessary imports

In [26]:
import pandas as pd
import train_model as tm

#### Load data

In [42]:
# Read the pre-split tables from ../data in a fixed order:
# training features, training target, test features, test target.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/y_train.csv',
        '../data/X_test.csv',
        '../data/y_test.csv',
    )
)

In [40]:
# Shape (rows, columns) of the training and test feature sets

print(X_train.shape, X_test.shape)

(171, 13) (43, 13)


In [43]:
# Shape (rows, columns) of the training and test targets
y_train.shape, y_test.shape

((171, 1), (43, 1))

#### Feature Selection

In [30]:
# Pairwise correlation between the feature columns of the training set.
# NOTE(review): the CHAS row and column are empty (NaN) in the output below,
# presumably because CHAS is constant in this training split — confirm.
correlation_matrix = X_train.corr()
correlation_matrix

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
CRIM,1.0,-0.272114,0.314866,,0.459055,-0.115189,0.338929,-0.24523,0.079764,0.243835,0.267449,-0.103918,0.223126
ZN,-0.272114,1.0,-0.363453,,-0.516376,0.306855,-0.318935,0.525464,0.173337,-0.109726,-0.357401,-0.018155,-0.298856
INDUS,0.314866,-0.363453,1.0,,0.593583,-0.223665,0.365387,-0.495283,-0.169954,0.35061,0.317632,-0.039629,0.465009
CHAS,,,,,,,,,,,,,
NOX,0.459055,-0.516376,0.593583,,1.0,-0.226348,0.671047,-0.745127,-0.059918,0.372675,0.383829,0.131604,0.506462
RM,-0.115189,0.306855,-0.223665,,-0.226348,1.0,-0.105569,0.120686,0.044702,-0.111312,-0.190824,-0.10054,-0.600597
AGE,0.338929,-0.318935,0.365387,,0.671047,-0.105569,1.0,-0.538667,-0.049921,0.210306,0.276917,0.076064,0.605092
DIS,-0.24523,0.525464,-0.495283,,-0.745127,0.120686,-0.538667,1.0,0.115835,-0.321346,-0.279301,-0.166402,-0.315683
RAD,0.079764,0.173337,-0.169954,,-0.059918,0.044702,-0.049921,0.115835,1.0,0.296053,-0.07014,-0.20153,-0.03377
TAX,0.243835,-0.109726,0.35061,,0.372675,-0.111312,0.210306,-0.321346,0.296053,1.0,0.172705,-0.033079,0.290194


### Create a model

In [31]:
# Build the model with the helper from scripts/train_model.py and display it.
# NOTE(review): presumably an unfitted linear regression estimator — confirm
# against the script.
model = tm.create_regression_model()
model

#### Train a linear regression model

In [35]:
# Fit the model on the training features and target.
# NOTE(review): the return value is not assigned; the prediction cell reuses
# `model`, so the helper presumably fits it in place — confirm in the script.
tm.train_regression_model(X_train, y_train,  model)

#### Predict on testing set

In [33]:
# y_pred holds the predicted values of the target variable (MEDV in our case)
# for the test-set features.
# Once we have the predictions, we can compare them with the actual target values.

y_pred = tm.y_pred(X_test, model)
y_pred

array([[18.94879653],
       [22.28686746],
       [22.09515447],
       [20.43750805],
       [28.68657242],
       [19.61359802],
       [14.75950649],
       [24.88812451],
       [22.49840462],
       [15.57355709],
       [25.28810821],
       [22.34516241],
       [23.87815945],
       [24.85277989],
       [23.66837384],
       [20.75106603],
       [21.61223602],
       [15.94651571],
       [25.21058547],
       [21.14965488],
       [14.134029  ],
       [22.82151877],
       [24.82727994],
       [20.88029553],
       [26.01018554],
       [14.68782167],
       [24.98278028],
       [20.82959094],
       [23.38571835],
       [17.93157639],
       [22.5115658 ],
       [26.86388113],
       [22.43763651],
       [19.62683794],
       [28.95771731],
       [19.4933237 ],
       [23.67663403],
       [21.4062352 ],
       [19.7183789 ],
       [19.74268942],
       [18.9663878 ],
       [20.48077052],
       [26.85675559]])

#### Saving y_pred to a CSV file

In [None]:
# Wrap the predictions in a DataFrame and write them to ../data as CSV,
# leaving out the row index.
y_pred_df = pd.DataFrame(data=y_pred)
y_pred_df.to_csv(path_or_buf='../data/y_pred_df.csv', index=False)

#### Note: for basic linear regression there are no hyperparameters to tune, since the model is simply fit to the data with a least-squares solution.