# Linear regression:
### In this notebook, we will experiment with various regression approaches.

In [1]:
# load the necessary packages
import numpy as np
import pandas as pd
import operator
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.model_selection import train_test_split 
import pylab as pl
# Apply the 'ggplot' matplotlib style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Load the communities data set; '?' entries in the CSV are parsed as NaN.
communities_csv = '../../data/communities/communities.csv'
df_communities = pd.read_csv(communities_csv, sep=',', na_values=['?'])
df_communities.head()

Unnamed: 0,state,communityname,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,ViolentCrimesPerPop
0,8,Lakewoodcity,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,...,0.0,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.2
1,53,Tukwilacity,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,...,0.0,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.67
2,24,Aberdeentown,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,...,0.0,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.43
3,34,Willingborotownship,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,...,0.0,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.12
4,42,Bethlehemtownship,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,...,0.0,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.03


In [3]:
# List every column with its non-null count and dtype.
df_communities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 100 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   state                  1994 non-null   int64  
 1   communityname          1994 non-null   object 
 2   population             1994 non-null   float64
 3   householdsize          1994 non-null   float64
 4   racepctblack           1994 non-null   float64
 5   racePctWhite           1994 non-null   float64
 6   racePctAsian           1994 non-null   float64
 7   racePctHisp            1994 non-null   float64
 8   agePct12t21            1994 non-null   float64
 9   agePct12t29            1994 non-null   float64
 10  agePct16t24            1994 non-null   float64
 11  agePct65up             1994 non-null   float64
 12  numbUrban              1994 non-null   float64
 13  pctUrban               1994 non-null   float64
 14  medIncome              1994 non-null   float64
 15  pct

<p>The data set has a total of 100 columns of which 1 is categorical and the rest are numerical attributes. </p>

In [4]:
# (number of rows, number of columns)
df_communities.shape

(1994, 100)

<p>So, we have 1994 data points with 100 columns.</p>

#### Let's find out whether the data set has any missing values by counting the missing values in each column

In [5]:
# Count the missing entries in every column.
df_na_sum = df_communities.isna().sum()

# Keep only the columns with at least one missing value and express the
# count as a fraction of the total number of rows.
print(df_na_sum.loc[df_na_sum.gt(0)].div(len(df_communities)))

OtherPerCap    0.000502
dtype: float64


<p>There is one attribute, OtherPerCap, with 0.05% missing values.</p>

### Handling missing values by imputing those values based on the variable means.

In [6]:
# Impute the missing OtherPerCap values with the column mean
# (Series.mean() skips NaN entries by default).
# NOTE(review): the mean is computed on the full data set, i.e. before the
# train/test split performed later in this notebook — confirm this is intended.
otherPerCap_mean = df_communities['OtherPerCap'].mean()

# Assign the filled column back to the frame. Calling fillna(..., inplace=True)
# on a column selected with df[...] is chained assignment: recent pandas
# versions warn about it, and under Copy-on-Write (the default from pandas 3.0)
# it no longer updates df_communities at all.
df_communities['OtherPerCap'] = df_communities['OtherPerCap'].fillna(otherPerCap_mean)
df_communities.head()

Unnamed: 0,state,communityname,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,ViolentCrimesPerPop
0,8,Lakewoodcity,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,...,0.0,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.2
1,53,Tukwilacity,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,...,0.0,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.67
2,24,Aberdeentown,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,...,0.0,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.43
3,34,Willingborotownship,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,...,0.0,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.12
4,42,Bethlehemtownship,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,...,0.0,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.03


In [7]:
# Re-run the missing-value check to confirm that the OtherPerCap gaps
# were replaced by the column mean (an empty Series means nothing is missing).
df_na_sum = df_communities.isna().sum()
print(df_na_sum.loc[df_na_sum.gt(0)].div(len(df_communities)))

Series([], dtype: float64)


<p>There are no missing values left in the data set.</p>

### Compute and display basic statistics for the variables in the data set.

In [8]:
# NumPy printing preferences.
np.set_printoptions(
    precision=2,
    linewidth=120,
    suppress=True,
    edgeitems=7,
)

# Summary statistics for the numerical columns.
df_communities.describe()

Unnamed: 0,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,ViolentCrimesPerPop
count,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,...,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0
mean,28.683551,0.057593,0.463395,0.179629,0.753716,0.153681,0.144022,0.424218,0.493867,0.336264,...,0.022778,0.215552,0.608892,0.53505,0.626424,0.65153,0.065231,0.232854,0.161685,0.237979
std,16.397553,0.126906,0.163717,0.253442,0.244039,0.208877,0.232492,0.155196,0.143564,0.166505,...,0.1004,0.231134,0.204329,0.181352,0.200521,0.198221,0.109459,0.203092,0.229055,0.232985
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,0.01,0.35,0.02,0.63,0.04,0.01,0.34,0.41,0.25,...,0.0,0.06,0.47,0.42,0.52,0.56,0.02,0.1,0.02,0.07
50%,34.0,0.02,0.44,0.06,0.85,0.07,0.04,0.4,0.48,0.29,...,0.0,0.13,0.63,0.54,0.67,0.7,0.04,0.17,0.07,0.15
75%,42.0,0.05,0.54,0.23,0.94,0.17,0.16,0.47,0.54,0.36,...,0.0,0.28,0.7775,0.66,0.77,0.79,0.07,0.28,0.19,0.33
max,56.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


<p>Here, we get the count, mean, standard deviation, min, max, 25th percentile (first quartile), median, and 75th percentile (third quartile) for all the numerical attributes of our data set.</p>

### Separate the target attribute for regression from the rest of the attributes of the data set

In [9]:
# Keep the regression target in its own Series.
target_column = 'ViolentCrimesPerPop'
vs_target = df_communities[target_column]
vs_target.head()  # first five target values

0    0.20
1    0.67
2    0.43
3    0.12
4    0.03
Name: ViolentCrimesPerPop, dtype: float64

In [10]:
# Preview the first five rows of the data set.
df_communities.head()

Unnamed: 0,state,communityname,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,ViolentCrimesPerPop
0,8,Lakewoodcity,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,...,0.0,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.2
1,53,Tukwilacity,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,...,0.0,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.67
2,24,Aberdeentown,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,...,0.0,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.43
3,34,Willingborotownship,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,...,0.0,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.12
4,42,Bethlehemtownship,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,...,0.0,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.03


### Let's drop the communityname and ViolentCrimesPerPop columns from the data set.

In [11]:
# Columns to remove from the feature set: the regression target
# and the community name.
fieldsToDrop = [
    'ViolentCrimesPerPop',
    'communityname',
]

In [12]:
# Remove every column listed in fieldsToDrop (the ViolentCrimesPerPop target
# and communityname) from the feature set.
# Re-binding the name instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: columns that are already
# gone are skipped instead of raising a KeyError.
df_communities = df_communities.drop(columns=fieldsToDrop, errors='ignore')

In [13]:
# Show the shape of the target vector, then the shape of the remaining feature set.
print(vs_target.shape)
df_communities.shape

(1994,)


(1994, 98)

<p>The data set now has 1994 data points with 98 attributes. </p>

In [14]:
# First five records of the data set after the two columns were dropped
df_communities.head() 

Unnamed: 0,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans
0,8,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,...,0.04,0.0,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2
1,53,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,...,0.0,0.0,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45
2,24,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,...,0.0,0.0,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02
3,34,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,...,0.0,0.0,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28
4,42,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,...,0.0,0.0,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02


### Create a randomized 80% - 20% split of the data set by using the train_test_split function from the sklearn.model_selection package.
#### Set aside a 20% test portion and keep the remaining 80% as the training partition

In [15]:
# train_test_split shuffles the rows and partitions the features and the
# target into an 80% training part and a 20% test part; random_state fixes
# the shuffle.
(vs_train_communities,
 vs_test_communities,
 vs_target_train,
 vs_target_test) = train_test_split(
    df_communities,
    vs_target,
    test_size=0.2,
    random_state=33,
)
print(vs_test_communities.shape)
vs_test_communities.head()  # first five records

(399, 98)


Unnamed: 0,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans
1158,4,0.0,0.46,0.01,0.97,0.09,0.04,0.4,0.26,0.2,...,0.0,0.0,0.18,0.13,0.73,0.76,0.55,0.04,0.06,0.03
1079,25,0.01,0.35,0.01,0.96,0.04,0.04,0.32,0.45,0.29,...,0.0,0.0,0.11,0.86,0.66,0.88,0.8,0.03,0.11,0.01
1633,36,0.51,0.31,0.6,0.46,0.06,0.09,0.42,0.54,0.39,...,0.17,0.01,0.13,0.8,0.61,0.87,0.84,0.12,0.68,0.75
1700,48,0.02,0.52,0.25,0.63,0.03,0.31,0.54,0.62,0.44,...,0.01,0.0,0.13,0.72,0.35,0.5,0.7,0.13,0.04,0.01
1956,37,0.03,0.37,0.4,0.68,0.03,0.01,0.39,0.47,0.34,...,0.01,0.01,0.03,0.78,0.59,0.71,0.69,0.06,0.1,0.0


In [16]:
# Shape and first five records of the training features.
print(vs_train_communities.shape)
vs_train_communities.head()

(1595, 98)


Unnamed: 0,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans
1184,34,0.01,0.54,0.02,0.91,0.27,0.04,0.37,0.41,0.25,...,0.01,0.0,0.29,0.42,0.72,0.74,0.62,0.01,0.2,0.47
401,55,0.99,0.42,0.59,0.44,0.11,0.11,0.44,0.56,0.37,...,0.3,0.12,0.14,0.71,0.52,0.79,0.75,0.28,0.55,0.62
1864,34,0.01,0.53,0.02,0.95,0.15,0.03,0.27,0.37,0.18,...,0.01,0.0,0.17,0.54,0.38,0.24,0.51,0.11,0.03,0.09
1390,25,0.07,0.41,0.02,0.97,0.05,0.02,0.32,0.49,0.31,...,0.0,0.0,0.13,0.88,0.76,0.74,0.84,0.05,0.27,0.4
1066,37,0.05,0.41,0.11,0.85,0.23,0.03,0.32,0.52,0.26,...,0.0,0.0,0.17,0.33,0.2,0.23,0.05,0.09,0.12,0.01


In [17]:
# Shape and first five records of the training target.
print(vs_target_train.shape)
vs_target_train.head()

(1595,)


1184    0.08
401     0.40
1864    0.02
1390    0.18
1066    0.07
Name: ViolentCrimesPerPop, dtype: float64

In [18]:
# Shape and first five records of the test target.
print(vs_target_test.shape)
print(vs_target_test.head())

(399,)
1158    0.06
1079    0.14
1633    0.80
1700    0.50
1956    0.30
Name: ViolentCrimesPerPop, dtype: float64
