In [79]:
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
from sklearn.metrics import accuracy_score

%matplotlib inline

In [80]:
# for now, df is created from original dataset
# should be replaced by cleaned version once quality plan is complete
df = pd.read_csv('OnlineNewsPopularity.csv', skipinitialspace=True)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [81]:
# calculate 75:25 split
df.shape[0] * .75

29733.0

In [82]:
# shuffle rows of data set and split into training and testing sets
# because we have large dataset, cross-validation should not be needed
# will use a 75:25 split
df = shuffle(df)
df_train = df[:29733]
df_test = df[29733:]

In [83]:
df.columns

Index(['url', 'timedelta', 'n_tokens_title', 'n_tokens_content',
       'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
       'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
     

In [84]:
# for now, just a random selection of features. just to lay out the skeleton for a fit
lm = sm.ols(formula="shares ~ n_tokens_title + num_keywords + kw_avg_avg + title_sentiment_polarity", data=df_train).fit()

In [85]:
lm.params

Intercept                  -896.607270
n_tokens_title               37.557711
num_keywords                114.278407
kw_avg_avg                    0.955637
title_sentiment_polarity    360.535896
dtype: float64

In [86]:
# keep in mind the value range for shares when interpreting coefficients, i suppose.
# quite different to the 0 or 1 we were dealing with before.
df['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [87]:
lm.summary()
# from these random features, there are no notably high p-values. 
# the r-squared value is low, as might be expected from random features.
# we need to develop a good understanding of these metrics, especially p-values,
# r-squared and coef

0,1,2,3
Dep. Variable:,shares,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,115.1
Date:,"Sun, 22 Apr 2018",Prob (F-statistic):,1.2900000000000002e-97
Time:,11:41:15,Log-Likelihood:,-317640.0
No. Observations:,29733,AIC:,635300.0
Df Residuals:,29728,BIC:,635300.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-896.6073,409.022,-2.192,0.028,-1698.308,-94.907
n_tokens_title,37.5577,28.958,1.297,0.195,-19.202,94.318
num_keywords,114.2784,32.069,3.563,0.000,51.421,177.135
kw_avg_avg,0.9556,0.046,20.892,0.000,0.866,1.045
title_sentiment_polarity,360.5359,231.464,1.558,0.119,-93.143,814.215

0,1,2,3
Omnibus:,80465.111,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4809788605.395
Skew:,33.131,Prob(JB):,0.0
Kurtosis:,1972.262,Cond. No.,22900.0


In [88]:
# test accuracy of the model
# to assess accuracy for a regression model (in addition to the r-squared value auto-generated above),
# we can find the mean squared error (or root mean squared error)
# we could also convert to a classification problem (by setting threshold of popularity and dividing 
# target into 1 for popular and 0 for unpopular) and then getting the accuracy score from predictions
# on a sample from the training set.

In [89]:
df_train.shape

(29733, 61)

In [90]:
# for reference
df_train['shares'].describe()

count     29733.000000
mean       3345.254196
std       10635.041227
min          22.000000
25%         945.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [91]:
train_predictions_sample = lm.predict(df_train[:1000])
train_predictions_sample.describe()
# these predictions are obviously very poor with the current random features model.

count     1000.000000
mean      3354.371556
std       1317.541472
min        346.412348
25%       2618.749461
50%       3088.129370
75%       3813.469746
max      23333.810725
dtype: float64

In [92]:
# turn into classification problem and test accuracy score
# Use a 4542 shares threshold for classification as popular (top 25 %). 
# or could only could extremely high share values as being 'viral' (val 1) and the rest being 
# 'not viral' (val 0)
train_predictions_sample = train_predictions_sample.apply(lambda res: 1 if res > 3784 else 0)
train_predictions_sample.value_counts()
# calculate predictions for 'shares' on a sample of the set

0    742
1    258
dtype: int64

In [95]:
df_train['shares_binary'] = df_train['shares'].apply(lambda res: 1 if res > 3784 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [94]:
accuracy_score(df_train['shares_binary'][:1000], train_predictions_sample) 

0.0