In [79]:
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from math import sqrt

%matplotlib inline

In [80]:
# for now, df is created from original dataset
# should be replaced by cleaned version once quality plan is complete
df = pd.read_csv('OnlineNewsPopularity.csv', skipinitialspace=True)
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df = df.drop(['url', 'timedelta'], axis=1)
# drop non-predictive features

In [81]:
# calculate 75:25 split
df.shape[0] * .75

29733.0

In [82]:
# shuffle rows of data set and split into training and testing sets
# because we have large dataset, cross-validation should not be needed
# will use a 75:25 split
df = shuffle(df)
df_train = df[:29733]
df_test = df[29733:]

In [83]:
df.columns

Index(['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
       'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs',
       'num_self_hrefs', 'num_imgs', 'num_videos', 'average_token_length',
       'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_rate_negat

In [84]:
# for now, just a random selection of features. just to lay out the skeleton for a fit
lm = sm.ols(formula="shares ~ n_tokens_title + num_keywords + kw_avg_avg + title_sentiment_polarity", data=df_train).fit()

In [85]:
lm.params

Intercept                  -951.332141
n_tokens_title               55.540904
num_keywords                109.103743
kw_avg_avg                    0.921724
title_sentiment_polarity    311.143841
dtype: float64

In [86]:
# keep in mind the value range for shares when interpreting coefficients, i suppose.
# quite different to the 0 or 1 we were dealing with before. 
df['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [87]:
lm.summary()
# from these random features, there are no notably high p-values. 
# the r-squared value is low, as might be expected from random features.
# we need to develop a good understanding of these metrics, especially p-values,
# r-squared and coef

0,1,2,3
Dep. Variable:,shares,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,114.7
Date:,"Sun, 22 Apr 2018",Prob (F-statistic):,2.96e-97
Time:,14:51:39,Log-Likelihood:,-316630.0
No. Observations:,29733,AIC:,633300.0
Df Residuals:,29728,BIC:,633300.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-951.3321,395.488,-2.405,0.016,-1726.505,-176.159
n_tokens_title,55.5409,27.987,1.985,0.047,0.685,110.397
num_keywords,109.1037,31.042,3.515,0.000,48.260,169.948
kw_avg_avg,0.9217,0.044,20.820,0.000,0.835,1.008
title_sentiment_polarity,311.1438,222.243,1.400,0.162,-124.461,746.749

0,1,2,3
Omnibus:,81983.961,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6297599780.016
Skew:,34.731,Prob(JB):,0.0
Kurtosis:,2256.552,Cond. No.,22900.0


In [88]:
# test accuracy of the model
# to assess accuracy for a regression model (in addition to the r-squared value auto-generated above),
# we can find the mean squared error (or root mean squared error)
# we could also convert to a classification problem (by setting threshold of popularity and dividing 
# target into 1 for popular and 0 for unpopular) and then getting the accuracy score from predictions
# on a sample from the training set.

In [89]:
df_train.shape

(29733, 59)

In [90]:
# for reference
df_train['shares'].describe()

count     29733.000000
mean       3330.728887
std       10278.859645
min           4.000000
25%         943.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [91]:
train_predictions_sample = df_train[:1000]
predictions = lm.predict(train_predictions_sample)
predictions.describe()
# these predictions are obviously very poor with the current random features model.

count     1000.000000
mean      3362.932680
std       1298.718982
min        427.850407
25%       2637.468755
50%       3098.993884
75%       3794.033132
max      19439.053711
dtype: float64

In [92]:
# get mean squared error
mse = mean_squared_error(df_train['shares'][:1000], predictions)
mse
# something has gone wrong here, obviously. mse is gigantic

37383938.013037324

In [93]:
# root mean squared error for comparison
sqrt(mse)
# need to understand more about these numbers and discuss

6114.240591687354

In [94]:
# turn into classification problem and test accuracy score
# Use a 2800 shares threshold for classification as popular (top 25 %). 
# or could only could extremely high share values as being 'viral' (val 1) and the rest being 
# 'not viral' (val 0)
predictions = predictions.apply(lambda res: 1 if res > 3813 else 0)
# calculate predictions for 'shares' on a sample of the set
predictions.value_counts()

0    754
1    246
dtype: int64

In [95]:
#df_train['shares_binary'] = df_train['shares'].apply((lambda res: 1 if res > 2800 else 0), axis = 1)
# above doesn't work because the single col 'shares' is a series, not a dataframe. axis arg makes no sense
# maybe use df_train[['shares']] instead, which should be a dataframe with just one col

#df_train['shares_binary'] = df_train[['shares']].apply(lambda res: 1 if res > 2800 else 0)
df_train['shares_binary'] = np.where(df_train['shares'] >=3784, 1, 0)
df_train['shares_binary'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    24427
1     5306
Name: shares_binary, dtype: int64

In [96]:
accuracy_score(df_train['shares_binary'][:1000], predictions) 
# this figure looks not too bad, but it's only predicting membership of the top 25% (if i've dont it right)
# look at other thresholds etc. 

0.73399999999999999

In [97]:
cm = confusion_matrix(df_train['shares_binary'][:1000], predictions)
cm

array([[652, 164],
       [102,  82]])

In [101]:
tn, fp, fn, tp = cm.ravel()
print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ',  fn, '\nTrue positives: ', tp)

True negatives:  652 
False positives:  164 
False negatives:  102 
True positives:  82


In [102]:
print(classification_report(df_train['shares_binary'][:1000], predictions))

             precision    recall  f1-score   support

          0       0.86      0.80      0.83       816
          1       0.33      0.45      0.38       184

avg / total       0.77      0.73      0.75      1000



In [100]:
# discuss above metrics. repeat with hold-out set

In [64]:
test_predictions = lm.predict(df_test)

In [None]:
test_predictions = test_predictions.apply(lambda res: 0 if res)