In [41]:
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# import the ML algorithm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
#from pandas.core import datetools

# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics
from statsmodels.tools.eval_measures import rmse

In [42]:
# location = r"E:\MYLEARN\2-ANALYTICS-DataScience\datasets\Advertising.csv"

In [44]:
# Source of the Advertising data; the first CSV column holds the row labels and becomes the index.
# NOTE(review): this is a remote plain-HTTP URL, so the cell needs network access — confirm the host is still reachable.
ADVERTISING_CSV_URL = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'
df_training = pd.read_csv(ADVERTISING_CSV_URL, index_col=0)
df_training.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [6]:
# Sanity-check the loaded DataFrame: .shape is a (number of rows, number of columns) tuple
df_training.shape

(200, 4)

# Handling Categorical Features with 2 Categories

Up to now, all of our features have been numeric. What if one of our features was categorical?

Let's create a new feature called __Size__, and randomly assign observations to be __small or large__:

The Size column indicates the market size of each observation: either 'large' or 'small'.

In [47]:
# one uniform random draw per observation (exploratory: no seed is set in this cell)
nums = np.random.rand(df_training.shape[0])

In [49]:
# boolean array: True wherever the random draw exceeds 0.5
mask_large = np.greater(nums, 0.5)
mask_large

array([False, False, False, False, False, False,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True, False,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False, False, False,
       False, False,  True, False, False,  True, False,  True, False,
        True, False,  True, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True, False, False,
        True,  True,  True, False,  True,  True, False, False,  True,
       False,  True, False,  True, False,  True, False,  True,  True,
        True,  True,  True,  True, False,  True, False,  True, False,
       False,  True, False, False, False,  True,  True, False, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True, False, False, False,  True,  True,  True,  True,
        True, False,  True,  True, False, False, False,  True, False,
       False, False,

In [50]:
# Fix the RNG state so the random Size assignment is identical on every run
np.random.seed(12345)

# One uniform draw per row; draws above 0.5 flag the row as a large market
nums = np.random.rand(len(df_training))
mask_large = nums > 0.5

# Label flagged rows 'large' and every remaining row 'small' in a single pass
df_training['Size'] = np.where(mask_large, 'large', 'small')
df_training.head()

Unnamed: 0,TV,radio,newspaper,sales,Size
1,230.1,37.8,69.2,22.1,large
2,44.5,39.3,45.1,10.4,small
3,17.2,45.9,69.3,9.3,small
4,151.5,41.3,58.5,18.5,small
5,180.8,10.8,58.4,12.9,large


In [51]:
# Encode Size numerically for the regression: 'small' maps to 0, 'large' maps to 1
size_codes = {'small': 0, 'large': 1}
df_training['Size_large'] = df_training['Size'].map(size_codes)
df_training.head()

Unnamed: 0,TV,radio,newspaper,sales,Size,Size_large
1,230.1,37.8,69.2,22.1,large,1
2,44.5,39.3,45.1,10.4,small,0
3,17.2,45.9,69.3,9.3,small,0
4,151.5,41.3,58.5,18.5,small,0
5,180.8,10.8,58.4,12.9,large,1


In [52]:
# predictors used by the model
feature_cols = ['TV', 'radio', 'newspaper', 'Size_large']

# design matrix: the DataFrame restricted to the predictor columns
X = df_training.loc[:, feature_cols]

# response: the sales column as a Series
y = df_training.loc[:, 'sales']

#### Statsmodel package

In [53]:
# Specify sales as a linear function of the three ad budgets plus the Size_large dummy
# (four predictors in total), then fit the specification
sm_spec = smf.ols(formula='sales ~ TV + radio + newspaper + Size_large', data=df_training)
sm_model = sm_spec.fit()

In [54]:
# display the estimated coefficients: the intercept plus one entry per term in the formula
sm_model.params

Intercept     2.911701
TV            0.045720
radio         0.188728
newspaper    -0.001098
Size_large    0.057424
dtype: float64

#### Scikit learn package

In [55]:
# Build the scikit-learn estimator and learn the coefficients from X and y in one step.
# fit() returns the estimator itself, so the bare name below displays the fitted model.
linreg = LinearRegression().fit(X, y)
linreg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [56]:
# show the fitted intercept, then the array of slopes, each on its own line
print(linreg.intercept_, linreg.coef_, sep='\n')

2.9117013282568642
[ 0.04571982  0.18872814 -0.00109768  0.05742385]


In [23]:
# label each fitted slope with the name of its feature
[(name, coef) for name, coef in zip(feature_cols, linreg.coef_)]

[('TV', 0.04571982092436275),
 ('radio', 0.18872814313427855),
 ('newspaper', -0.0010976794483516517),
 ('Size_large', 0.05742385085482757)]

#### Interpreting the Size_large coefficient

For a given amount of TV/radio/newspaper ad spending (in $1000s), being a large market is associated with an average increase in sales of 57.42 widgets (as compared to a small market, which is called the baseline level).

# Handling Categorical Features with more than 2 values

Let's create a new feature called __Area__, and randomly assign observations to be __rural, suburban, or urban__:

We __cannot__ code it as

    0 = rural, 
    1 = suburban, 
    2 = urban 

because that would imply an ordered relationship between the categories: urban would somehow be "twice" the suburban category.

So we have to create additional __dummy variables__. 

**`np.random.rand(len(df_training))`**

Random values in a given shape.

Create an array of the given shape and populate it with random samples from a uniform distribution over [0, 1).

In [57]:
# set a seed for reproducibility
np.random.seed(123456)

# one uniform draw in [0, 1) per observation
nums = np.random.rand(len(df_training))

# Split the draws into three bands of roughly one third each. The urban band is
# closed at 0.66 (>=) so the bands leave no gap: with a strict `>` here and a
# strict `<` above, a draw of exactly 0.66 matched neither mask and silently
# stayed 'rural'.
mask_suburban = (nums > 0.33) & (nums < 0.66)
mask_urban    = nums >= 0.66

# every row starts as 'rural'; the masks below overwrite the other two groups
df_training['Area'] = 'rural'

# DataFrame.loc with a boolean mask assigns only to the rows where the mask is True
df_training.loc[mask_suburban, 'Area'] = 'suburban'
df_training.loc[mask_urban,    'Area'] = 'urban'

# number of observations that landed in each group
df_training.Area.value_counts()

suburban    78
urban       63
rural       59
Name: Area, dtype: int64

In [58]:
# Preview the dummy encoding: one indicator column per Area level (three in total).
# dtype=int pins the indicators to 0/1 integers; without it, newer pandas releases
# return True/False columns instead.
pd.get_dummies(df_training.Area, prefix='Area', dtype=int).head()

Unnamed: 0,Area_rural,Area_suburban,Area_urban
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
5,0,1,0


We actually only need 2 dummy variables, not 3.

Why? 

Because 2 dummies capture all of the "information" about the Area feature, and implicitly define rural as the "baseline level".

In [35]:
# Dummy-encode Area, leaving out the column of the first level so that level acts
# as the baseline. drop_first=True states this directly instead of slicing columns
# by position. dtype=int pins the indicators to 0/1 integers; without it, newer
# pandas releases return True/False columns instead.
area_dummies = pd.get_dummies(df_training.Area, prefix='Area', drop_first=True, dtype=int)
area_dummies.head()

Unnamed: 0,Area_suburban,Area_urban
1,0,0
2,0,1
3,0,0
4,0,1
5,1,0


Here is how we interpret the coding:


    - rural    is coded as Area_suburban=0 and Area_urban=0
    - suburban is coded as Area_suburban=1 and Area_urban=0
    - urban    is coded as Area_suburban=0 and Area_urban=1

In [36]:
# Append the dummy variable columns to the DataFrame (axis=0 means rows, axis=1 means columns).
# Any existing copies of those columns are dropped first, so re-running this cell
# replaces them instead of stacking up duplicate Area_* columns.
df_training = pd.concat(
    [df_training.drop(columns=area_dummies.columns, errors='ignore'), area_dummies],
    axis=1,
)
df_training.head()

Unnamed: 0,TV,radio,newspaper,sales,Size,Size_large,Area,Area_suburban,Area_urban
1,230.1,37.8,69.2,22.1,large,1,rural,0,0
2,44.5,39.3,45.1,10.4,small,0,urban,0,1
3,17.2,45.9,69.3,9.3,small,0,rural,0,0
4,151.5,41.3,58.5,18.5,small,0,urban,0,1
5,180.8,10.8,58.4,12.9,large,1,suburban,1,0


In [37]:
# predictors: the three ad budgets, the Size dummy and the two Area dummies
feature_cols = ['TV', 'radio', 'newspaper', 'Size_large', 'Area_suburban', 'Area_urban']

# X is the DataFrame restricted to the predictor columns; y is the sales Series
X, y = df_training[feature_cols], df_training['sales']

In [38]:
# Regress sales on the three ad budgets plus the Size and Area dummy variables
# (six predictors in total)
sm_spec = smf.ols(
    formula='sales ~ TV + radio + newspaper + Size_large + Area_suburban + Area_urban',
    data=df_training,
)
sm_model = sm_spec.fit()

# display the estimated intercept and slopes
sm_model.params

Intercept        2.874191
TV               0.045744
radio            0.187867
newspaper       -0.001088
Size_large       0.077397
Area_suburban   -0.106563
Area_urban       0.268138
dtype: float64

In [39]:
# Build the scikit-learn estimator and fit it to the expanded feature set;
# fit() returns the estimator itself, so construction and fitting chain into one statement
linreg = LinearRegression().fit(X, y)

# show the fitted intercept, then the array of slopes, each on its own line
print(linreg.intercept_, linreg.coef_, sep='\n')

2.8741909890879125
[ 0.04574401  0.1878667  -0.0010877   0.07739661 -0.10656299  0.26813802]


In [40]:
# pair each feature name with its fitted slope
[(name, coef) for name, coef in zip(feature_cols, linreg.coef_)]

[('TV', 0.04574401036331372),
 ('radio', 0.18786669552525823),
 ('newspaper', -0.0010876977267108567),
 ('Size_large', 0.07739660749747905),
 ('Area_suburban', -0.10656299015958608),
 ('Area_urban', 0.2681380216522013)]

#### interpret the coefficients

- Holding all other variables fixed, 

> - Being a suburban area is associated with an average decrease in Sales of 106.56 widgets (as compared to the baseline level, which is rural).
> - Being an urban area is associated with an average increase in Sales of 268.14 widgets (as compared to rural).