# Linear Regression

## Import Libraries

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import *


## Import dataset

In [2]:
ds = pd.read_csv("../ProcessedDatasets/pro_dataset.csv")

In [3]:
ds['Year']

0         1841
1         1841
2         1841
3         1841
4         1841
          ... 
209846    2022
209847    2022
209848    2022
209849    2022
209850    2022
Name: Year, Length: 209851, dtype: int64

## Predict Population

### Set Variables
<p>Set population as the dependent variable

In [4]:
#ds.iloc[:,5] #= Year -> Independent Variable
#ds.iloc[:,[1,6]] #= County, Population -> Dependent Variables
X= ds.iloc[:,[1,5]]   
y = ds.iloc[:,6]
X

Unnamed: 0,County,Year
0,DONEGAL,1841
1,CAVAN,1841
2,SLIGO,1841
3,ROSCOMMON,1841
4,MAYO,1841
...,...,...
209846,KILDARE,2022
209847,DUBLIN,2022
209848,CARLOW,2022
209849,MONAGHAN,2022


In [None]:
###Note Linear Regression takes a 2d array so reshape x
# -1 indicates length of the second dimension based on the length of the array,  1 indicates  one column.
#X = x.reshape(-1,1)
#x

### Need to encode text values
<p>As county is a categorical variable, one hot encoding will be used to change it to a numerical value

In [5]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import make_column_transformer
one = OneHotEncoder()
#labelencoder = LabelEncoder()
#X[:, 0] = labelencoder.fit_transform(X[:, 0])


In [7]:
#Sets column transformer to apply encoder to county and pass every other column
column_trans = make_column_transformer(
    (one, ['County']),  
    remainder='passthrough'
)

In [8]:
X = column_trans.fit_transform(X)

<p>These where experiments with how to reshape array to pass to algorithm

In [None]:
#X = X[:, 1:]

In [None]:
#X = X.toarray()

In [None]:
#X = np.array(X, dtype=np.float32)

In [None]:
#X[15].astype(int)

### Import SKLEARN train_test_split and split dataset
<p>Split data into training and test sets 80/20

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
from sklearn.linear_model import LinearRegression
regress = LinearRegression()
regress.fit(x_train, y_train)

## Build a pipeline
<p>Build a pipeline to encode columns and apply algorithm</p>
<p>https://www.analyticsvidhya.com/blog/2021/05/understanding-column-transformer-and-machine-learning-pipelines/</p>

In [11]:
from sklearn.pipeline import make_pipeline

In [12]:
pipe = make_pipeline(column_trans,regress)

### Try predict some populations

In [13]:
y_pred = regress.predict(x_test)
y_pred

array([  67850.35420571,  219729.4378891 ,  121157.01078669, ...,
        193507.49532321,  221283.51692474, 1339059.68512294])

In [14]:
x_test[0]

<1x27 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [15]:
#x_test.shape
print(column_trans.transformers_)

[('onehotencoder', OneHotEncoder(), ['County']), ('remainder', 'passthrough', [1])]


<p>

In [16]:
#Getting coefficient m slope
regress.coef_

array([-1.25736435e+05, -1.06261764e+05, -6.33007975e+04,  3.53184291e+05,
       -2.18861216e+04,  1.15408385e+06,  7.18219166e+04, -3.46231204e+04,
        3.57896560e+04, -8.37323802e+04, -9.97447609e+04, -1.47376398e+05,
        1.13836235e+04, -1.39309624e+05, -5.64507185e+04, -4.78057380e+04,
        1.00857398e+04, -1.17125480e+05, -1.02747902e+05, -1.14923870e+05,
       -1.14795513e+05, -2.10099945e+04, -6.66579772e+04, -9.44367207e+04,
       -3.52824015e+04, -4.31413610e+04,  5.18026345e+02])

In [17]:
#Getting the intercept c where x is 0
regress.intercept_

-859365.2773672838

### Metrics R2 Value
<p>Small R squared would suggest good match to the data

In [19]:
#Metrics
# Calculating the R squared value
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9969393509210698

<p>This didn't work, will retry in the next iteration

In [None]:
#from sklearn.model_selection import cross_val_score
#cross_val_score(regress, X,y,cv=5, scoring='accuracy').mean()

## Make predictions on new data

In [394]:
#Make a dataframe
data = {'County': ['DUBLIN'], 'Year': [2010]} 
prediction_input =pd.DataFrame(data)
#prediction_input = prediction_input[:, 0:]

In [395]:
pipe.predict(prediction_input)

array([1335951.88045234])

<p>This prediction is pretty close to the actual figure.

In [25]:
ds[(ds['County']=='DUBLIN') & (ds['Year'] ==2010)]

Unnamed: 0.1,Unnamed: 0,County,Price,Property Size Description,Inflation,Year,Population,Average,AREA,Population Density
19070,19070,DUBLIN,567046.00,greater than 125 sq metres,0.633777,2010,1255890,251976.25,925.760879,1356.6
19071,19071,DUBLIN,255375.00,less than 38 sq metres,0.633777,2010,1255890,251976.25,925.760879,1356.6
19072,19072,DUBLIN,300999.73,greater than or equal to 38 sq metres and less...,0.633777,2010,1255890,251976.25,925.760879,1356.6
19073,19073,DUBLIN,368875.00,greater than or equal to 38 sq metres and less...,0.633777,2010,1255890,251976.25,925.760879,1356.6
19074,19074,DUBLIN,329093.25,greater than or equal to 38 sq metres and less...,0.633777,2010,1255890,251976.25,925.760879,1356.6
...,...,...,...,...,...,...,...,...,...,...
24277,24277,DUBLIN,450000.26,greater than 125 sq metres,0.633777,2010,1255890,251976.25,925.760879,1356.6
24278,24278,DUBLIN,285000.77,greater than or equal to 38 sq metres and less...,0.633777,2010,1255890,251976.25,925.760879,1356.6
24279,24279,DUBLIN,201462.50,greater than or equal to 38 sq metres and less...,0.633777,2010,1255890,251976.25,925.760879,1356.6
24280,24280,DUBLIN,465350.00,greater than or equal to 38 sq metres and less...,0.633777,2010,1255890,251976.25,925.760879,1356.6


In [26]:
#Make a dataframe
data = {'County': ['DUBLIN'], 'Year': [1960]} 
prediction_input =pd.DataFrame(data)
pipe.predict(prediction_input)

array([1310050.20979096])

<p>This prediction is not close at all. This model is trying to fit a line to an exponential curve and is not a good model for solving a question like exponential population growth.</p>
<p>Will try an exponential curve fitting model in next iteration</p>

In [27]:
ds[(ds['County']=='DUBLIN') & (ds['Year'] ==1960)]

Unnamed: 0.1,Unnamed: 0,County,Price,Property Size Description,Inflation,Year,Population,Average,AREA,Population Density
3117,3117,DUBLIN,,,,1960,715821,,925.760879,773.22


## Now prediction of Price based on population density

In [396]:
ds.head()

Unnamed: 0.1,Unnamed: 0,County,Price,Property Size Description,Inflation,Year,Population,Average,AREA,Population Density,Pop Density - Normal,Price-No Inflation,Price by House Size
0,0,DONEGAL,,,,1841,296448.0,,4860.754357,60.99,0.038722,,
1,1,CAVAN,,,,1841,243158.0,,1931.224564,125.91,0.079938,,
2,2,SLIGO,,,,1841,180886.0,,1836.083206,98.52,0.062549,,
3,3,ROSCOMMON,,,,1841,253591.0,,2547.136217,99.56,0.063209,,
4,4,MAYO,,,,1841,388887.0,,5587.525209,69.6,0.044188,,


In [404]:
#Get rid of NaN rows
not_nan = ds[ds['Price by House Size'].notnull()]

In [407]:
I= not_nan.loc[:,['Pop Density - Normal']]   
j = not_nan.loc[:,'Price by House Size']


In [408]:
I_train, I_test, j_train, j_test = train_test_split(I, j, test_size=0.2, random_state=0)

In [411]:
regressDensity = LinearRegression()
regressDensity.fit(I_train, j_train)

In [412]:
price_pred = regressDensity.predict(I_test)

<p> This r squared value seems very far off, however this is only using on independent variable and also there is a large gap between the Dublin area and all other areas. May have to split Dublin off and or include more variable is next iteration </p>

In [413]:
r2_score(I_test, price_pred)

-70541919.08787636