# Linear Regression

## Import Libraries

In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import *


## Import dataset

In [372]:
ds = pd.read_csv("../ProcessedDatasets/pro_dataset.csv")

In [373]:
ds['Year']

0         1841
1         1841
2         1841
3         1841
4         1841
          ... 
209846    2022
209847    2022
209848    2022
209849    2022
209850    2022
Name: Year, Length: 209851, dtype: int64

## Predict Population

### Set Variables

In [374]:
#ds.iloc[:,5] #= Year -> Independent Variable
#ds.iloc[:,[1,6]] #= County, Population -> Dependent Variables
X= ds.iloc[:,[1,5]]   
y = ds.iloc[:,6]
X

Unnamed: 0,County,Year
0,DONEGAL,1841
1,CAVAN,1841
2,SLIGO,1841
3,ROSCOMMON,1841
4,MAYO,1841
...,...,...
209846,KILDARE,2022
209847,DUBLIN,2022
209848,CARLOW,2022
209849,MONAGHAN,2022


In [None]:
###Note Linear Regression takes a 2d array so reshape x
# -1 indicates length of the second dimension based on the length of the array,  1 indicates  one column.
#X = x.reshape(-1,1)
#x

### Need to encode text values

In [375]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import make_column_transformer
one = OneHotEncoder()
#labelencoder = LabelEncoder()
#X[:, 0] = labelencoder.fit_transform(X[:, 0])


In [376]:
column_trans = make_column_transformer(
    (one, ['County']),  
    remainder='passthrough'
)

In [377]:
X = column_trans.fit_transform(X)

In [None]:
#X = X[:, 1:]

In [None]:
#X = X.toarray()

In [None]:
#X = np.array(X, dtype=np.float32)

In [None]:
#X[15].astype(int)

### Import SKLEARN train_test_split and split dataset

In [378]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [379]:
from sklearn.linear_model import LinearRegression
regress = LinearRegression()
regress.fit(x_train, y_train)

## Build a pipeline

In [380]:
from sklearn.pipeline import make_pipeline

In [381]:
pipe = make_pipeline(column_trans,regress)

### Try predict some populations

In [382]:
y_pred = regress.predict(x_test)
y_pred

array([  67850.81116736,  219729.85540032,  121157.39045331, ...,
        193507.61832661,  221283.93643027, 1339060.04251225])

In [383]:
x_test[0]

<1x27 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [384]:
#x_test.shape
print(column_trans.transformers_)

[('onehotencoder', OneHotEncoder(), ['County']), ('remainder', 'passthrough', [1])]


In [385]:
#Getting coefficient c
regress.coef_

array([-1.25736562e+05, -1.06261744e+05, -6.33007418e+04,  3.53184391e+05,
       -2.18862669e+04,  1.15408388e+06,  7.18218395e+04, -3.46233146e+04,
        3.57897501e+04, -8.37322333e+04, -9.97447428e+04, -1.47376280e+05,
        1.13834257e+04, -1.39309559e+05, -5.64507713e+04, -4.78056782e+04,
        1.00855401e+04, -1.17125348e+05, -1.02747804e+05, -1.14923834e+05,
       -1.14795542e+05, -2.10100400e+04, -6.66580143e+04, -9.44366267e+04,
       -3.52823349e+04, -4.31413902e+04,  5.18027010e+02])

In [386]:
#Getting the intercept
regress.intercept_

-859366.2928352329

### Metrics R2 Value

In [387]:
#Metrics
# Calculating the R squared value
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9969393310341512

In [None]:
#from sklearn.model_selection import cross_val_score
#cross_val_score(regress, X,y,cv=5, scoring='accuracy').mean()

## Make predictions on new data

In [394]:
#Make a dataframe
data = {'County': ['DUBLIN'], 'Year': [2010]} 
prediction_input =pd.DataFrame(data)
#prediction_input = prediction_input[:, 0:]

In [395]:
pipe.predict(prediction_input)

array([1335951.88045234])

## Now prediction of Price based on population density

In [396]:
ds.head()

Unnamed: 0.1,Unnamed: 0,County,Price,Property Size Description,Inflation,Year,Population,Average,AREA,Population Density,Pop Density - Normal,Price-No Inflation,Price by House Size
0,0,DONEGAL,,,,1841,296448.0,,4860.754357,60.99,0.038722,,
1,1,CAVAN,,,,1841,243158.0,,1931.224564,125.91,0.079938,,
2,2,SLIGO,,,,1841,180886.0,,1836.083206,98.52,0.062549,,
3,3,ROSCOMMON,,,,1841,253591.0,,2547.136217,99.56,0.063209,,
4,4,MAYO,,,,1841,388887.0,,5587.525209,69.6,0.044188,,


In [404]:
#Get rid of NaN rows
not_nan = ds[ds['Price by House Size'].notnull()]

In [407]:
I= not_nan.loc[:,['Pop Density - Normal']]   
j = not_nan.loc[:,'Price by House Size']


In [408]:
I_train, I_test, j_train, j_test = train_test_split(I, j, test_size=0.2, random_state=0)

In [411]:
regressDensity = LinearRegression()
regressDensity.fit(I_train, j_train)

In [412]:
price_pred = regressDensity.predict(I_test)

In [413]:
r2_score(I_test, price_pred)

-70541919.08787636