In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Define the problem

**Given**: describe the data  
**Goal**: what will you try to infer from that data  

%%% Example %%%  
**Given**: historical data of how many dollars a company spent on ads in different media (TV, radio, newspapers) and the respective sales.  

**Goal**: predict what the sales will be for a given ad spending.
    

## Prepare the data (Data Preprocessing)

### Load the data

In [2]:
# Dataset location, relative to the notebook's working directory.
# NOTE(review): the saved output of this cell shows a FileNotFoundError for this
# path - confirm where the CSV actually lives before re-running.
path_to_file = "./datasets/Advertising/Advertising.csv"

# index_col=0: use the first CSV column as the row index instead of as a feature.
df = pd.read_csv(filepath_or_buffer=path_to_file, index_col=0)

FileNotFoundError: File b'./datasets/Advertising/Advertising.csv' does not exist

### Get insight of the data (Prepare and clean)

In [None]:
# Report the dimensions as plain text, then leave the first rows as the cell's
# last expression so Jupyter renders them as a rich table instead of a printed string.
print(df.shape)
df.head(5)

**TV**:  dollars spent on TV ads for a single product (in thousands)  
**radio**: dollars spent on radio ads (in thousands)  
**newspaper**: dollars spent on newspaper ads (in thousands)    

**sales**: sales of a single product in a given market (in thousands)

In [None]:
# Collapse rows sharing the same sales value (column-wise max of the other
# columns), order the groups from highest to lowest sales, and show the top five.
(
    df.groupby('sales')
      .max()
      .sort_index(ascending=False)
      .head(5)
)

#### Clean

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
df.isna().sum()

#### Visualize with Seaborn

In [None]:
# Regression pair plots: one panel per advertising channel plotted against sales.
sns.set(style="ticks", color_codes=True)
sns.pairplot(
    df,
    x_vars=['TV', 'radio', 'newspaper'],
    y_vars=['sales'],
    palette=sns.hls_palette(6, l=.6, s=.8),
    markers=["o"],
    # With kind='reg', plot_kws is forwarded to the regression plotter; the
    # nested dicts colour the fitted line and the scatter points respectively.
    plot_kws={'line_kws': {'color': '#FFAAAA'},
              'scatter_kws': {'facecolors': '#AAFFAA'}},
    # `size` was renamed to `height` in seaborn 0.9; the old keyword is deprecated
    # and only went unnoticed because warnings are silenced globally.
    height=5, aspect=1, kind='reg')


In [None]:
# Pairwise Pearson correlation between all columns (Pearson is pandas' default method).
df.corr(method='pearson')

In [None]:
# Annotated heat map of the correlation matrix on a red colour scale.
sns.heatmap(
    data=df.corr(),
    cmap="Reds",
    annot=True,
)


## Select features

Based on the data inspection above, I'll use the spending on all three channels ('TV', 'newspaper' and 'radio') as features; 'TV' shows the strongest relationship with sales.    

In [None]:
# Feature matrix: ad spend per channel. Target vector: sales.
X = df.loc[:, ['TV', 'newspaper', 'radio']]
y = df.loc[:, 'sales']
print(y.shape)

## Separate the training data from the test data

In [None]:
# Hold out a test set; test_size=0.25 is scikit-learn's default when neither
# size is given, spelled out here for the reader. The fixed random_state makes
# the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Peek at the first five rows of each partition.
print(
    f'X_train: {X_train[:5]}\n',
    f'y_train: {y_train[:5]}\n',
)
print(
    f'X_test: {X_test[:5]}\n',
    f'y_test: {y_test[:5]}\n',
)

## Choose the model

We saw a linear correlation between 'TV' ads and sales, which is why we are going to start with LinearRegression.

In [None]:
from sklearn.linear_model import LinearRegression

## Train the model (fit the model)

In [None]:
# Create an unfitted ordinary-least-squares estimator
# (fit_intercept=True is the library default, written out for clarity).
model = LinearRegression(fit_intercept=True)

In [None]:
# Learn the coefficients from the training partition. fit() returns the
# estimator itself, so `fitted` and `model` refer to the same object.
fitted = model.fit(X=X_train, y=y_train)

In [None]:
# Inspect the learned parameters: the intercept first, then the coefficients
# (one per feature column, in the column order of X_train).
print(fitted.intercept_, fitted.coef_, sep='\n')

### Interpret the coefficients

## Predict (estimate the target for unseen input samples)

In [None]:
# Predicted sales for the held-out test features.
y_pred = fitted.predict(X=X_test)

## Evaluate the model

In [None]:
from sklearn import metrics

In [None]:
# Regression metrics computed on the held-out test set.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # reuse the MSE computed above instead of recomputing it
r2 = metrics.r2_score(y_test, y_pred)

# Print one aligned "label     : value" row per metric. The labels carry no
# colon of their own: the format string already adds the separator, so the
# original labels ('MAE: ', ...) produced a doubled "MAE:      : ..." output.
for label, value in (('MAE', mae), ('MSE', mse), ('RMSE', rmse), ('r2_score', r2)):
    print('{:10s}: {}'.format(label, value))