In [13]:
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

In [28]:
# for now, df is created from original dataset
# should be replaced by cleaned version once quality plan is complete
df = pd.read_csv('OnlineNewsPopularity.csv', skipinitialspace=True)
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df = df.drop(['url', 'timedelta'], axis=1)
# drop non-predictive features

In [29]:
# calculate 75:25 split
df.shape[0] * .75

29733.0

In [30]:
# shuffle rows of data set and split into training and testing sets
# because we have large dataset, cross-validation should not be needed
# will use a 75:25 split
df = shuffle(df)
df_train = df[:29733]
df_test = df[29733:]

In [31]:
df.columns

Index(['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
       'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs',
       'num_self_hrefs', 'num_imgs', 'num_videos', 'average_token_length',
       'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_rate_negat

In [32]:
# for now, just a random selection of features. just to lay out the skeleton for a fit
lm = sm.ols(formula="shares ~ n_tokens_title + num_keywords + kw_avg_avg + title_sentiment_polarity", data=df_train).fit()

In [33]:
lm.params

Intercept                  -732.114947
n_tokens_title               38.181469
num_keywords                 86.309166
kw_avg_avg                    0.989290
title_sentiment_polarity    286.973203
dtype: float64

In [34]:
# keep in mind the value range for shares when interpreting coefficients, i suppose.
# quite different to the 0 or 1 we were dealing with before.
df['shares'].describe()

count     39644.000000
mean       3395.380184
std       11626.950749
min           1.000000
25%         946.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [35]:
lm.summary()
# from these random features, there are no notably high p-values. 
# the r-squared value is low, as might be expected from random features.
# we need to develop a good understanding of these metrics, especially p-values,
# r-squared and coef

0,1,2,3
Dep. Variable:,shares,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,94.45
Date:,"Sun, 22 Apr 2018",Prob (F-statistic):,5.65e-80
Time:,13:42:53,Log-Likelihood:,-321480.0
No. Observations:,29733,AIC:,643000.0
Df Residuals:,29728,BIC:,643000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-732.1149,465.541,-1.573,0.116,-1644.595,180.365
n_tokens_title,38.1815,32.932,1.159,0.246,-26.367,102.730
num_keywords,86.3092,36.594,2.359,0.018,14.584,158.034
kw_avg_avg,0.9893,0.052,19.114,0.000,0.888,1.091
title_sentiment_polarity,286.9732,262.731,1.092,0.275,-227.991,801.937

0,1,2,3
Omnibus:,82134.936,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4438681697.575
Skew:,35.128,Prob(JB):,0.0
Kurtosis:,1894.533,Cond. No.,22900.0


In [36]:
# test accuracy of the model
# to assess accuracy for a regression model (in addition to the r-squared value auto-generated above),
# we can find the mean squared error (or root mean squared error)
# we could also convert to a classification problem (by setting threshold of popularity and dividing 
# target into 1 for popular and 0 for unpopular) and then getting the accuracy score from predictions
# on a sample from the training set.

In [37]:
df_train.shape

(29733, 59)

In [38]:
# for reference
df_train['shares'].describe()

count     29733.000000
mean       3412.690008
std       12082.429534
min           1.000000
25%         949.000000
50%        1400.000000
75%        2800.000000
max      843300.000000
Name: shares, dtype: float64

In [39]:
train_predictions_sample = df_train[:1000]
predictions = lm.predict(train_predictions_sample)
predictions.describe()
# these predictions are obviously very poor with the current random features model.

count     1000.000000
mean      3366.799345
std       1292.265821
min        -23.814679
25%       2610.434354
50%       3117.366939
75%       3777.690125
max      18109.299364
dtype: float64

In [41]:
# get mean squared error
mean_squared_error(df_train['shares'][:1000], predictions)
# something has gone wrong here, obviously. mse is gigantic

757701279.57806432

In [44]:
# turn into classification problem and test accuracy score
# Use a 3813 shares threshold for classification as popular (top 25 %). 
# or could only could extremely high share values as being 'viral' (val 1) and the rest being 
# 'not viral' (val 0)
predictions = predictions.apply(lambda res: 1 if res > 3813 else 0)
# calculate predictions for 'shares' on a sample of the set
predictions.value_counts()

0    1000
dtype: int64

In [53]:
#df_train['shares_binary'] = df_train['shares'].apply((lambda res: 1 if res > 3784 else 0), axis = 1)
# above doesn't work because the single col 'shares' is a series, not a dataframe. axis arg makes no sense
# maybe use df_train[['shares']] instead, which should be a dataframe with just one col

#df_train['shares_binary'] = df_train[['shares']].apply(lambda res: 1 if res > 3784 else 0)
df_train['shares_binary'] = np.where(df_train['shares'] >=3784, 1, 0)
df_train['shares_binary'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    24399
1     5334
Name: shares_binary, dtype: int64

In [54]:
accuracy_score(df_train['shares_binary'][:1000], predictions) 

0.81999999999999995

In [55]:
confusion_matrix(df_train['shares_binary'][:1000], predictions)

array([[820,   0],
       [180,   0]])

In [56]:
print(classification_report(df_train['shares_binary'][:1000], predictions))

             precision    recall  f1-score   support

          0       0.82      1.00      0.90       820
          1       0.00      0.00      0.00       180

avg / total       0.67      0.82      0.74      1000



  'precision', 'predicted', average, warn_for)


In [None]:
# discuss above metrics