In [119]:
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from math import sqrt

%matplotlib inline

In [120]:
# for now, df is created from original dataset
# should be replaced by cleaned version once quality plan is complete
df = pd.read_csv('OnlineNewsPopularity.csv', skipinitialspace=True)
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df = df.drop(['url', 'timedelta'], axis=1)
# drop non-predictive features

In [121]:
# calculate 75:25 split
df.shape[0] * .75

29733.0

In [122]:
# shuffle rows of data set and split into training and testing sets
# because we have large dataset, cross-validation should not be needed
# will use a 75:25 split
df = shuffle(df)
df_train = df[:29733]
df_test = df[29733:]

In [123]:
df.columns

Index(['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
       'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs',
       'num_self_hrefs', 'num_imgs', 'num_videos', 'average_token_length',
       'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_rate_negat

In [124]:
# for now, just a random selection of features. just to lay out the skeleton for a fit
lm = sm.ols(formula="shares ~ n_tokens_title + num_keywords + kw_avg_avg + title_sentiment_polarity", data=df_train).fit()

In [125]:
lm.params

Intercept                  -929.786797
n_tokens_title               41.121569
num_keywords                120.820045
kw_avg_avg                    0.965756
title_sentiment_polarity    359.476498
dtype: float64

In [126]:
# keep in mind the value range for shares when interpreting coefficients, i suppose.
# quite different to the 0 or 1 we were dealing with before. 
df['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [127]:
lm.summary()
# from these random features, there are no notably high p-values. 
# the r-squared value is low, as might be expected from random features.
# we need to develop a good understanding of these metrics, especially p-values,
# r-squared and coef

0,1,2,3
Dep. Variable:,shares,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,88.38
Date:,"Sun, 22 Apr 2018",Prob (F-statistic):,8.52e-75
Time:,14:59:17,Log-Likelihood:,-321000.0
No. Observations:,29733,AIC:,642000.0
Df Residuals:,29728,BIC:,642000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-929.7868,461.388,-2.015,0.044,-1834.128,-25.445
n_tokens_title,41.1216,32.445,1.267,0.205,-22.472,104.715
num_keywords,120.8200,35.905,3.365,0.001,50.444,191.196
kw_avg_avg,0.9658,0.053,18.273,0.000,0.862,1.069
title_sentiment_polarity,359.4765,258.395,1.391,0.164,-146.990,865.943

0,1,2,3
Omnibus:,81598.269,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4325522322.611
Skew:,34.508,Prob(JB):,0.0
Kurtosis:,1870.278,Cond. No.,22900.0


In [128]:
# test accuracy of the model
# to assess accuracy for a regression model (in addition to the r-squared value auto-generated above),
# we can find the mean squared error (or root mean squared error)
# we could also convert to a classification problem (by setting threshold of popularity and dividing 
# target into 1 for popular and 0 for unpopular) and then getting the accuracy score from predictions
# on a sample from the training set.

In [129]:
df_train.shape

(29733, 59)

In [130]:
# for reference
df_train['shares'].describe()

count     29733.000000
mean       3417.181818
std       11883.883700
min           1.000000
25%         944.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [131]:
train_predictions_sample = df_train[:1000]
predictions = lm.predict(train_predictions_sample)
predictions.describe()
# these predictions are obviously very poor with the current random features model.

count     1000.000000
mean      3414.867094
std       1470.394487
min        -32.661669
25%       2644.326393
50%       3153.067442
75%       3912.186534
max      26580.254524
dtype: float64

In [132]:
# get mean squared error
mse = mean_squared_error(df_train['shares'][:1000], predictions)
mse
# something has gone wrong here, obviously. mse is gigantic

44570515.599714607

In [133]:
# root mean squared error for comparison
sqrt(mse)
# need to understand more about these numbers and discuss

6676.115307550837

In [134]:
# turn into classification problem and test accuracy score
# for now, just using a 3395 shares threshold for classification as popular (above the mean value).  
# or could only could extremely high share values as being 'viral' (val 1) and the rest being 
# 'not viral' (val 0)
predictions = predictions.apply(lambda res: 1 if res > 3395 else 0)
# calculate predictions for 'shares' on a sample of the set
predictions.value_counts()

0    604
1    396
dtype: int64

In [135]:
#df_train['shares_binary'] = df_train['shares'].apply((lambda res: 1 if res > 2800 else 0), axis = 1)
# above doesn't work because the single col 'shares' is a series, not a dataframe. axis arg makes no sense
# maybe use df_train[['shares']] instead, which should be a dataframe with just one col

#df_train['shares_binary'] = df_train[['shares']].apply(lambda res: 1 if res > 3395 else 0)
df_train['shares_binary'] = np.where(df_train['shares'] >=3395, 1, 0)
df_train['shares_binary'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    23679
1     6054
Name: shares_binary, dtype: int64

In [136]:
accuracy_score(df_train['shares_binary'][:1000], predictions) 
# this figure looks not too bad, but it's only predicting membership of the top 25% (if i've dont it right)
# look at other thresholds etc. 

0.64100000000000001

In [137]:
cm = confusion_matrix(df_train['shares_binary'][:1000], predictions)
cm

array([[521, 276],
       [ 83, 120]])

In [138]:
tn, fp, fn, tp = cm.ravel()
print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ',  fn, '\nTrue positives: ', tp)

True negatives:  521 
False positives:  276 
False negatives:  83 
True positives:  120


In [139]:
print(classification_report(df_train['shares_binary'][:1000], predictions))

             precision    recall  f1-score   support

          0       0.86      0.65      0.74       797
          1       0.30      0.59      0.40       203

avg / total       0.75      0.64      0.67      1000



In [140]:
# discuss above metrics. repeat with hold-out set

In [141]:
test_predictions = lm.predict(df_test)
test_predictions.describe()

count     9911.000000
mean      3447.463792
std       1371.482008
min         47.391188
25%       2668.745764
50%       3167.094040
75%       3926.766970
max      35178.736616
dtype: float64

In [149]:
# get mean squared error
mse = mean_squared_error(df_test['shares'][:1000], predictions)
mse

56544275.872000001

In [151]:
rmse = sqrt(mse)
rmse
# again, this seems like a more realistic figure. look into what's going on

7519.592799613553

In [142]:
test_predictions = test_predictions.apply(lambda res: 0 if res<3395 else 1)

In [144]:
df_test['shares_binary'] = np.where(df_test['shares'] >=3395, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [146]:
accuracy_score(df_test['shares_binary'], test_predictions)

0.63706992230854609

In [147]:
confusion_matrix(df_test['shares_binary'], test_predictions)

array([[5090, 2796],
       [ 801, 1224]])

In [148]:
print(classification_report(df_test['shares_binary'][:1000], predictions))

             precision    recall  f1-score   support

          0       0.81      0.61      0.70       806
          1       0.20      0.42      0.27       194

avg / total       0.69      0.57      0.61      1000

