# Linear Regression

To predict our rental housing prices, we will begin with a linear regression model.

In [6]:
"""
A simple linear regression model with all predictors and target variable of
price of unit area.
"""

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os
import pandas as pd
import pickle

In [2]:
# importing csv dataset
df = pd.read_csv("../../data/processed/tpdata.csv")
dummies = pd.get_dummies(df,drop_first=True)
dummies.isna().sum()
dummies.dropna(inplace=True)

# split data into "X" and "y" set
X = dummies.drop('price_unit_area', axis=1)
y = dummies["price_unit_area"]

# splitting our data into a training and testing set,
#  with 25% of data in the test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 42)


In [3]:
# making linear regression object
reg = LinearRegression()

# fitting model to data
reg.fit(X_train,y_train)

In [4]:
# TODO: score model on X_train & y_train
r2 = (y_test, y_train)
print("R^2:", r2)

# TODO: generate predictions on X_test
y_pred = reg.predict(X_test)
# score MSE 
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

R^2: (73     20.0
287    32.9
289    44.5
33     49.3
43     34.1
       ... 
46     42.0
94     41.0
254    51.8
343    46.6
399    37.3
Name: price_unit_area, Length: 81, dtype: float64, 359    24.7
239    29.7
3      54.8
18     42.3
137    47.4
       ... 
74     54.4
110    51.6
276    34.0
354    30.1
106    47.1
Name: price_unit_area, Length: 322, dtype: float64)
MSE: 283.20611544589667


In [7]:
# Define the directory
dir_name = "code/model/saved_models/"

# Create the directory if it does not exist
if not os.path.exists(dir_name):
    os.makedirs(dir_name)

# Now you can save the model
pickle.dump(reg, open(dir_name + "linreg.sav", 'wb'))