In [27]:
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
from sklearn.metrics import accuracy_score

%matplotlib inline

In [10]:
# for now, df is created from original dataset
# should be replaced by cleaned version once quality plan is complete
df = pd.read_csv('OnlineNewsPopularity.csv', skipinitialspace=True)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [11]:
# shuffle rows of data set and split into training and testing sets
# because we have large dataset, cross-validation should not be needed
# will use a 75:25 split
df = shuffle(df)
df_train = df[:750]
df_test = df[750:]

In [12]:
df.columns

Index(['url', 'timedelta', 'n_tokens_title', 'n_tokens_content',
       'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
       'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
     

In [17]:
# for now, just a random selection of features. just to lay out the skeleton for a fit
lm = sm.ols(formula="shares ~ n_tokens_title + num_keywords + kw_avg_avg + title_sentiment_polarity", data=df_train).fit()

In [18]:
lm.params

Intercept                  -1422.532301
n_tokens_title               -21.150326
num_keywords                 214.079386
kw_avg_avg                     1.189886
title_sentiment_polarity    1585.407156
dtype: float64

In [21]:
# keep in mind the value range for shares when interpreting coefficients, i suppose.
# quite different to the 0 or 1 we were dealing with before.
df['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [22]:
lm.summary()
# from these random features, n_tokens_title has a notably high p-value. 
# the r-squared value is low, as might be expected from random features.
# we need to develop a good understanding of these metrics, especially p-values,
# r-squared and coef

0,1,2,3
Dep. Variable:,shares,R-squared:,0.024
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,4.643
Date:,"Sun, 22 Apr 2018",Prob (F-statistic):,0.00104
Time:,10:58:04,Log-Likelihood:,-7971.9
No. Observations:,750,AIC:,15950.0
Df Residuals:,745,BIC:,15980.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1422.5323,2460.650,-0.578,0.563,-6253.166,3408.101
n_tokens_title,-21.1503,173.498,-0.122,0.903,-361.754,319.454
num_keywords,214.0794,192.597,1.112,0.267,-164.018,592.177
kw_avg_avg,1.1899,0.306,3.886,0.000,0.589,1.791
title_sentiment_polarity,1585.4072,1357.351,1.168,0.243,-1079.282,4250.096

0,1,2,3
Omnibus:,1160.615,Durbin-Watson:,2.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,370126.259
Skew:,9.058,Prob(JB):,0.0
Kurtosis:,110.312,Cond. No.,23000.0


In [25]:
# test accuracy of the model
# to assess accuracy for a regression model (in addition to the r-squared value auto-generated above),
# we can find the mean squared error (or root mean squared error)
# we could also convert to a classification problem (by setting threshold of popularity and dividing 
# target into 1 for popular and 0 for unpopular) and then getting the accuracy score from predictions
# on a sample from the training set.
train_predictions_sample = lm.predict(df_train[:100])
train_predictions_sample
# calculate predictions for 'shares' on a small sample of the set

7466     4600.134541
34904    3689.425692
2374     2586.939639
4984     5416.731725
6625     4233.134402
27684    4191.055560
25535    2475.832400
2275     3287.025294
7481     4213.155944
4293     5689.855926
30741    3967.930427
12002    3921.965412
24740    3321.479245
24877    3091.522475
12677    2493.215465
33505    2372.767244
31317    3453.349246
31295    2587.860645
18901    5674.256428
37994    3084.118586
5244     3025.138295
4520     3334.498217
8483     6411.737306
2230     3353.717545
9667     4118.818230
35496    4100.439230
30547    3752.927774
5731     4301.618814
19390    2671.187352
7640     5672.493546
8726     3180.086296
25600    6288.264303
14382    3133.507895
28525    3774.255372
17160    2514.883155
17025    2372.348868
3907     2822.497077
3438     3102.075674
27731    4027.761339
33386    2084.973363
27052    6393.645513
16107    2385.720072
17184    4590.791004
17716    8655.767735
17898    3459.369713
3430     2978.685910
39338    4250.583294
2316     5430

In [28]:
accuracy_score(df_train['shares'][:100], train_predictions_sample) 

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets