# DAT-NYC-37 | Codealong 07 | Introduction to Regression and Model Fit, Part 2 | Answer Key

In [1]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')

## Activity | Model's F-statistic

In [2]:
df = pd.read_csv(os.path.join('..', '..', '07', 'datasets', 'zillow-07-start.csv'), index_col = 'ID')

IOError: File ../../07/datasets/zillow-07-start.csv does not exist

In [None]:
model = smf.ols(formula = 'SalePrice ~ IsAStudio', data = df).fit()

model.summary()

## Accessing the model's F-value and its p-value

### F-value (with significance level of `5%`)

In [None]:
model.fvalue

### Corresponding p-value

In [None]:
model.f_pvalue

## Part A - Linear Modeling with `sklearn`

In [None]:
subset_df = df.dropna(axis = 'index', subset = ['Size', 'LotSize', 'IsAStudio'])

In [None]:
def linear_modeling_with_sklearn(X, y):
    model = linear_model.LinearRegression(fit_intercept = True)
    model.fit(X, y)

    print 'F-statistic (performed for each regressor independently)'
    print '- F-value', feature_selection.f_regression(X, y)[0]
    print '- p-value', feature_selection.f_regression(X, y)[1]
    print 'R^2 =', model.score(X, y)
    print 'Coefficients'
    print '- beta_0 (intercept) =', model.intercept_
    print '- beta_n (n > 0)     =', model.coef_

### SalePrice ~ IsAStudio with `statsmodels`

In [None]:
smf.ols(formula = 'SalePrice ~ IsAStudio', data = subset_df).fit().summary()

### SalePrice ~ IsAStudio with `sklearn` (Simple Linear Modeling)

In [None]:
X = subset_df[ ['IsAStudio'] ]
y = subset_df.SalePrice

linear_modeling_with_sklearn(X, y)

### SalePrice ~ Size + LotSize with `statsmodels`

In [None]:
smf.ols(formula = 'SalePrice ~ Size + LotSize', data = subset_df).fit().summary()

### SalePrice ~ IsAStudio with `sklearn` (Multiple Linear Modeling)

In [None]:
X = subset_df[ ['Size', 'LotSize'] ]
y = subset_df.SalePrice

linear_modeling_with_sklearn(X, y)

# Advertising dataset

In [None]:
df = pd.read_csv(os.path.join('..', 'datasets', 'advertising.csv'))

In [None]:
df

## Plots

### Sales ~ TV

In [None]:
sns.lmplot('TV', 'Sales', df)

### Sales ~ Radio

In [None]:
sns.lmplot('Radio', 'Sales', df)

### Sales ~ Newspaper

In [None]:
sns.lmplot('Newspaper', 'Sales', df)

## Simple linear regressions

### Sales ~ TV

In [None]:
model_tv = smf.ols(formula = 'Sales ~ TV', data = df).fit()

In [None]:
model_tv.summary()

### Sales ~ Radio

In [None]:
model_radio = smf.ols(formula = 'Sales ~ Radio', data = df).fit()

In [None]:
model_radio.summary()

### Sales ~ Newspaper

In [None]:
model_newspaper = smf.ols(formula = 'Sales ~ Newspaper', data = df).fit()

In [None]:
model_newspaper.summary()

## Residuals

### Sales ~ TV

In [None]:
figure = sm.qqplot(model_tv.resid, line = 's')

In [None]:
figure = sm.graphics.plot_regress_exog(model_tv, 'TV')

### Sales ~ Radio

In [None]:
figure = sm.qqplot(model_radio.resid, line = 's')

In [None]:
figure = sm.graphics.plot_regress_exog(model_radio, 'Radio')

### Sales ~ Newspaper

In [None]:
figure = sm.qqplot(model_newspaper.resid, line = 's')

In [None]:
figure = sm.graphics.plot_regress_exog(model_newspaper, 'Newspaper')

### Sales ~ TV + Radio + Newspaper

In [None]:
model = smf.ols(formula = 'Sales ~ TV + Radio + Newspaper', data = df).fit()

In [None]:
model.summary()

### Sales ~ TV + Radio

In [None]:
model = smf.ols(formula = 'Sales ~ TV + Radio', data = df).fit()

In [None]:
model.summary()

In [None]:
figure = sm.qqplot(model.resid, line = 's')

In [None]:
figure = sm.graphics.plot_regress_exog(model, 'TV')

In [None]:
figure = sm.graphics.plot_regress_exog(model, 'Radio')

## Part B - Interaction Effects

### Sales ~ TV + Radio + TV * Radio

In [None]:
model = smf.ols(formula = 'Sales ~ TV + Radio + TV * Radio', data = df).fit()

In [None]:
model.summary()

In [None]:
figure = sm.qqplot(model.resid, line = 's')

In [None]:
figure = sm.graphics.plot_regress_exog(model, 'TV')

In [None]:
figure = sm.graphics.plot_regress_exog(model, 'Radio')

In [None]:
df

In [None]:
figure = sm.graphics.plot_regress_exog(model, 'TV:Radio')

## Part C - Binary/Dummy Variables

In [None]:
df = pd.read_csv(os.path.join('..', '..', '07', 'datasets', 'zillow-07-start.csv'), index_col = 'ID')

In [None]:
df.drop(df[df.IsAStudio == 1].index, inplace = True)

In [None]:
smf.ols(formula = 'SalePrice ~ BathCount', data = df).fit().summary()

### What's the bathrooms' distribution in the dataset?

In [None]:
print np.nan, df.BathCount.isnull().sum()
for bath_count in np.sort(df.BathCount.dropna().unique()):
    print bath_count, len(df[df.BathCount == bath_count])

### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [None]:
df = df[df.BathCount.isin([1, 2, 3, 4])]

In [None]:
print np.nan, df.BathCount.isnull().sum()
df.BathCount.value_counts()

### We can create the binary variables manually

In [None]:
df['Bath_1'] = 0
df.loc[df.BathCount == 1, 'Bath_1'] = 1

df['Bath_2'] = 0
df.loc[df.BathCount == 2, 'Bath_2'] = 1

df['Bath_3'] = 0
df.loc[df.BathCount == 3, 'Bath_3'] = 1

df['Bath_4'] = 0
df.loc[df.BathCount == 4, 'Bath_4'] = 1

In [None]:
df.columns

### But we can also use `get_dummies` from `pandas` as well (on `BedCount` for the sake of variety)

In [None]:
beds_df = pd.get_dummies(df.BedCount, prefix = 'Bed')

In [None]:
beds_df

In [None]:
beds_df.rename(columns={'Bed_1.0': 'Bed_1',
                        'Bed_2.0': 'Bed_2',
                        'Bed_3.0': 'Bed_3',
                        'Bed_4.0': 'Bed_4',
                        'Bed_5.0': 'Bed_5',
                        'Bed_6.0': 'Bed_6',
                        'Bed_7.0': 'Bed_7',
                        'Bed_8.0': 'Bed_8',
                        'Bed_9.0': 'Bed_9'}, inplace = True)

In [None]:
beds_df

In [None]:
df = df.join([beds_df])

In [None]:
df.columns

### `SalesPrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [None]:
smf.ols(formula = 'SalePrice ~ Bath_2 + Bath_3 + Bath_4', data = df).fit().summary()

### `SalesPrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [None]:
smf.ols(formula = 'SalePrice ~ Bath_1 + Bath_3 + Bath_4', data = df).fit().summary()

### `SalesPrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [None]:
smf.ols(formula = 'SalePrice ~ Bath_1 + Bath_2 + Bath_4', data = df).fit().summary()

### `SalesPrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [None]:
smf.ols(formula = 'SalePrice ~ Bath_1 + Bath_2 + Bath_3', data = df).fit().summary()