In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

from sklearn import metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix,
                             f1_score, precision_score, recall_score)
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('OnlineNewsPopularity.csv')

In [3]:
df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


In [None]:
df.shape

In [None]:
df.dtypes

# Clean The Data

In [None]:
# Collect every row that contains at least one missing (NaN) value.

rows_with_nan = df.isna().any(axis="columns")
missing_values = df.loc[rows_with_nan]
missing_values.head()

In [None]:
# Largest per-column NaN count among the rows collected above (0 means nothing is missing).
missing_values.isna().sum().max()

##### There are no missing values, and the data set appears to be very clean, with dummy (one-hot) variables already included.

In [4]:
# "Reverse-clean" copy used for visualisation: same data as `df`, with all
# spaces removed from the column names.
df_rev = df.rename(columns=lambda name: name.replace(' ', ''))

In [None]:
df_rev.columns

In [None]:
df_rev['Channel'] = 'Other'

In [None]:
# Collapse the one-hot data_channel_is_* flags into the single 'Channel' label column.
col_channel = {
    'data_channel_is_lifestyle': 'Lifestyle',
    'data_channel_is_entertainment': 'Entertainment',
    'data_channel_is_bus': 'Business',
    'data_channel_is_socmed': 'Social_Media',
    'data_channel_is_tech': "Tech",
    'data_channel_is_world': 'World',
}

# Rows whose flag equals 1.0 get that channel's label; all other rows keep
# whatever label they already had.
for flag_column, channel_name in col_channel.items():
    is_flagged = df_rev[flag_column] == 1.0
    df_rev.loc[is_flagged, 'Channel'] = channel_name


In [None]:
# URLs of the articles that matched none of the channel flags.
# The default label assigned before the flag loop is 'Other' (not '0'), so
# that is the value to filter on.
df_rev.loc[df_rev['Channel'] == 'Other', 'url']

In [None]:
df_rev['Channel'].value_counts()

In [None]:
# Collapse the one-hot weekday_is_* flags into a single 'Day_of_Week' label column.
day_channel = {
    'weekday_is_monday': 'Monday',
    'weekday_is_tuesday': 'Tuesday',
    'weekday_is_wednesday': 'Wednesday',
    'weekday_is_thursday': 'Thursday',
    'weekday_is_friday': 'Friday',
    'weekday_is_saturday': 'Saturday',
    'weekday_is_sunday': 'Sunday',
}

# Start with no label; rows whose flag equals 1 receive that day's name.
df_rev['Day_of_Week'] = None

for flag_column, day_name in day_channel.items():
    is_flagged = df_rev[flag_column] == 1
    df_rev.loc[is_flagged, 'Day_of_Week'] = day_name

In [None]:
# The one-hot weekday / channel flags are now redundant with the
# 'Day_of_Week' and 'Channel' label columns, so remove them from df_rev.
cols_drop = [
    'weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday',
    'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday',
    'weekday_is_sunday',
    'data_channel_is_lifestyle', 'data_channel_is_entertainment',
    'data_channel_is_bus', 'data_channel_is_socmed',
    'data_channel_is_tech', 'data_channel_is_world',
]

df_rev = df_rev.drop(columns=cols_drop)

In [None]:
df_rev.shape



# Visualize

In [None]:
# Pairwise Pearson correlations of the numeric columns.
# `df` still contains the string `url` column, which recent pandas versions
# refuse to correlate, so restrict to numeric dtypes explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Heat map of the correlation matrix of the numeric columns.
# The string `url` column is excluded explicitly: recent pandas versions raise
# on non-numeric columns in DataFrame.corr().
plt.figure(figsize=(16,8))
sns.heatmap(df.select_dtypes(include='number').corr(), cmap="coolwarm")
plt.title('Correlation matrix',fontsize=24)

plt.show()

In [None]:
df_rev.shape

In [None]:
df_rev['Channel'].isna().sum()

In [None]:
# Per-channel share statistics: mean, total, and number of articles ('size').
# The string alias 'size' is used instead of np.size: it yields the same
# 'size' column without pandas' warning about NumPy callables passed to agg().
df_agg_C = df_rev.groupby('Channel')['shares'].agg(['mean', 'sum', 'size']).reset_index()

In [None]:
# Index the aggregate by channel name so the bar charts can read labels from the index.
df_agg_C = df_agg_C.set_index('Channel')

In [None]:
# Order channels from highest to lowest average shares, then display the table.
df_agg_C = df_agg_C.sort_values(by='mean', ascending=False)
df_agg_C

In [None]:
cross = pd.crosstab(df_rev.Channel,df_rev.Day_of_Week)


In [None]:
cross.sum()

In [None]:
# Side-by-side bar charts per channel: average shares per article (left) and
# number of articles (right). Articles without a channel ('Other') are excluded.
df_agg_C = df_agg_C[df_agg_C.index != 'Other']

# Palettes are sized AFTER the filter so there is exactly one colour per bar.
colors1 = sns.color_palette("GnBu_d", len(df_agg_C.index))
colors2 = sns.color_palette("BuGn_d", len(df_agg_C.index))

fig, [ax1, ax2] = plt.subplots(1,2, figsize=(20,5))

sns.barplot(y = df_agg_C.index, x = df_agg_C['mean'], orient='h', palette = colors1, ax=ax1)
sns.barplot(y = df_agg_C.index, x = df_agg_C['size'], orient='h', palette = colors2, ax=ax2)

ax1.set_xlabel(xlabel='Average Number of Shares per Article', fontsize=16)
ax1.set_ylabel(ylabel="Channel", fontsize=16)
ax1.set_title(label="Average Number of Shares on Social per Article, by Channel", fontsize=22)

ax2.set_xlabel(xlabel='Number of Articles', fontsize=16)
ax2.set_ylabel(ylabel="Channel", fontsize=16)
ax2.set_title(label="Count of Articles", fontsize=22)


plt.show()

In [None]:
# Per-weekday share statistics (mean, total, article count), ordered from the
# highest to the lowest average. The string alias 'size' replaces np.size: it
# produces the same 'size' column without pandas' warning about NumPy
# callables passed to agg().
df_agg2 = (
    df_rev.groupby('Day_of_Week')['shares']
    .agg(['mean', 'sum', 'size'])
    .sort_values('mean', ascending=False)
)

In [None]:
df_agg2

In [None]:
sns.set()

In [None]:
# Side-by-side bar charts per weekday: average shares per article (left) and
# number of articles (right).
colors1 = sns.color_palette("Blues_d", len(df_agg2.index))
colors2 = sns.color_palette("BuGn_d", len(df_agg2.index))

# A single two-panel figure; no additional plt.figure() call, which would
# only render an extra empty figure below the charts.
fig, [ax1, ax2] = plt.subplots(1,2, figsize=(20,5))

sns.barplot(y = df_agg2.index, x = df_agg2['mean'], orient='h', palette = colors1, ax=ax1)
sns.barplot(y = df_agg2.index, x = df_agg2['size'], orient='h', palette = colors2, ax=ax2)

ax1.set_xlabel(xlabel='Average Number of Shares per Article, by Day of Week Released', fontsize=16)
ax1.set_ylabel(ylabel="Day", fontsize=16)
ax1.set_title(label="Average Number of Shares on Social per Article", fontsize=24)

ax2.set_xlabel(xlabel='Number of Articles by Day Released', fontsize=16)
ax2.set_ylabel(ylabel="Day", fontsize=16)
ax2.set_title(label="Count of Articles", fontsize=24)


plt.show()

In [None]:
# Heat map of article counts per channel (rows) and weekday (columns),
# with the weekday columns put into calendar order.
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
cross = cross[weekday_order]

plt.figure(figsize=(16,8))
heat_ax = sns.heatmap(cross, annot=True, fmt="d", cmap="viridis")
heat_ax.set_title('Total Number of Articles', fontsize=24)

plt.show()

In [None]:
# Box plot of shares per channel, split by weekday/weekend, annotated with each
# channel's overall median and mean. Articles without a channel are excluded.
df_rev1 = df_rev[df_rev['Channel']!='Other']

# Left-to-right channel order, shared by the plot and by the annotations.
ch = ['Entertainment','Business','Tech','Lifestyle','World','Social_Media']

shares_by_channel = df_rev1.groupby(['Channel'])['shares']

medians = shares_by_channel.median()[ch].values
median_labels = [str(np.round(s, 2)) for s in medians]

means = shares_by_channel.mean()[ch].values
mean_labels = [str(np.round(s, 2)) for s in means]

plt.figure(figsize=(18,9))

# order=ch pins box i to channel ch[i]; without it the boxes follow the order
# in which channels happen to appear in the data, and the median/mean
# annotations below could be drawn over the wrong channel.
ax = sns.boxplot(x='Channel', y='shares', data=df_rev1, hue='is_weekend', order=ch,
                 palette='RdPu', showfliers=False, showmeans=True, linewidth=3.)
ax.set_ylabel(ylabel='# Shares on Social', fontsize=20)
ax.set_title(label='Article Shares by Channel', fontsize=20)

# Relabel the hue legend with readable entries.
leg = ax.get_legend()
leg.set_title('Weekend?')
for legend_text, new_label in zip(leg.get_texts(), ['No', 'Yes']):
    legend_text.set_text(new_label)

plt.setp(leg.get_texts(), fontsize='18') # for legend text
plt.setp(leg.get_title(), fontsize='22') # for legend title

# Annotate each channel at the height of its median and mean.
# Only `fontsize` is passed: `size` and `fontsize` are aliases of the same
# text property and must not both be given.
for tick in range(len(medians)):
    ax.text(tick, medians[tick], f'med={median_labels[tick]}',
            horizontalalignment='center', color='c', weight='semibold', fontsize=20)
    ax.text(tick, means[tick], f'mean={mean_labels[tick]}',
            horizontalalignment='center', color='g', weight='bold', fontsize=20)

plt.show()

In [None]:
# Scatter plot: number of shares (x) against article word count (y),
# coloured by the is_weekend flag.
fig, ax = plt.subplots(figsize=(16,8))

sns.scatterplot(data=df_rev, x='shares', y='n_tokens_content', hue='is_weekend', ax=ax)
ax.set_title(label='Shares by Length of Article', fontsize=20)
ax.set_xlabel(xlabel='Number of Shares', fontsize=16)
ax.set_ylabel(ylabel='Article Word Count', fontsize=16)

plt.show()

In [None]:
# Scatter plot: number of shares (x) against number of videos (y),
# coloured by the is_weekend flag.
fig, ax = plt.subplots(figsize=(16,8))

sns.scatterplot(data=df_rev, x='shares', y='num_videos', hue='is_weekend', ax=ax)
ax.set_title(label='Shares by Number of Videos', fontsize=20)
ax.set_xlabel(xlabel='Number of Shares', fontsize=16)
ax.set_ylabel(ylabel='Article Video Count', fontsize=16)

plt.show()

In [None]:
# Scatter plot: number of shares (x) against number of images (y),
# coloured by the is_weekend flag.
fig, ax = plt.subplots(figsize=(16,8))

sns.scatterplot(data=df_rev, x='shares', y='num_imgs', hue='is_weekend', ax=ax)
ax.set_title(label='Shares by Number of Images', fontsize=20)
ax.set_xlabel(xlabel='Number of Shares', fontsize=16)
ax.set_ylabel(ylabel='Article Image Count', fontsize=16)

plt.show()

In [None]:
df_rev['num_imgs'].describe()

# Remove Outliers

In [None]:
# Outlier fences for 'shares': quartiles -/+ 1.5 * IQR, with the lower fence
# floored at zero. sMinQ / sMaxQ are reused by later cells.
Q25 = df_rev['shares'].quantile(0.25)
Q75 = df_rev['shares'].quantile(0.75)
IQR = Q75 - Q25

sMinQ = max(Q25 - 1.5 * IQR, 0.0)
sMaxQ = Q75 + 1.5 * IQR
print(sMinQ, sMaxQ)

In [None]:
#df_revO = df_rev[(df_rev['shares']<MaxQ) & (df_rev['shares']>MinQ)  ]

In [None]:
df_rev.shape

In [None]:
df_rev['shares'].describe()

In [None]:
# Histogram of 'shares' for the articles strictly inside the (sMinQ, sMaxQ) fences.
plt.figure(figsize=(16,8))

share_counts = df_rev['shares']
df1 = df_rev.loc[(share_counts > sMinQ) & (share_counts < sMaxQ)]

ax = df1['shares'].hist(bins=54)

ax.set_xlabel('Number of Shares', fontsize=20)
ax.set_ylabel('Articles', fontsize=20)

plt.show()

#### Number of Words in Article

In [None]:
# Outlier fences for the article word count: quartiles -/+ 1.5 * IQR,
# with the lower fence floored at zero.
Q25 = df_rev['n_tokens_content'].quantile(0.25)
Q75 = df_rev['n_tokens_content'].quantile(0.75)
IQR = Q75 - Q25

MinQ = max(Q25 - 1.5 * IQR, 0.0)
MaxQ = Q75 + 1.5 * IQR
print(MinQ, MaxQ)

In [None]:
# Keep the articles whose word count lies strictly between the two fences.
word_counts = df_rev['n_tokens_content']
df_revO = df_rev.loc[(word_counts > MinQ) & (word_counts < MaxQ)]

In [None]:
# Shape after the word-count filter. The filtered frame is df_revO;
# df_rev itself is unchanged by the filter, so its shape would show nothing new.
df_revO.shape

In [None]:
# Histogram of article word counts after the word-count outlier filter.
plt.figure(figsize=(16,8))

ax = df_revO['n_tokens_content'].hist(bins=54)

ax.set_xlabel('Number of Words', fontsize=20)
ax.set_ylabel('Articles', fontsize=20)

plt.show()

In [None]:
# Outlier fences for the number of videos per article.
# NOTE(review): the upper fence uses 2 * IQR here, unlike the 1.5 * IQR used
# for the other columns — confirm this is intended.
Q25 = df_rev['num_videos'].quantile(0.25)
Q75 = df_rev['num_videos'].quantile(0.75)
IQR = Q75 - Q25

MinQ = max(Q25 - 1.5 * IQR, 0.0)
MaxQ = Q75 + 2 * IQR
print(MinQ, MaxQ)

In [None]:
# Additionally keep only articles whose video count is within the fences (inclusive).
video_counts = df_revO['num_videos']
df_revO = df_revO.loc[(video_counts >= MinQ) & (video_counts <= MaxQ)]

In [None]:
df_revO.shape

In [None]:
# Histogram of video counts after both outlier filters.
plt.figure(figsize=(16,8))

ax = df_revO['num_videos'].hist(bins=4)

ax.set_xlabel('Number of Videos', fontsize=20)
ax.set_ylabel('Articles', fontsize=20)

plt.show()

# Develop Model for Analysis

In [None]:
# Remove all spaces from the column names of the raw frame as well, so the
# modelling cells can refer to columns by their bare names.
df.columns = df.columns.str.replace(' ', '', regex=False)

In [None]:
df.columns

In [None]:
dff = df.copy()

def iqr_fun(i,dff):
    """Print and return widened IQR outlier fences for one column.

    Parameters
    ----------
    i : column label to look up in ``dff``.
    dff : DataFrame holding that column.

    Returns
    -------
    list
        ``[lower, upper]``: the 1.5 * IQR fences, each rounded to a whole
        number and then pushed one unit further out (lower - 1, upper + 1).
    """
    lower_quartile, upper_quartile = (dff[i].quantile(p) for p in (0.25, 0.75))
    spread = upper_quartile - lower_quartile

    fences = [
        round(lower_quartile - 1.5 * spread, 0) - 1,
        round(upper_quartile + 1.5 * spread, 0) + 1,
    ]
    print(fences[0], fences[1])
    return fences

In [None]:
dff.shape

In [None]:
#dff['kw_min_min'].value_counts()

In [None]:
# Columns that get IQR-based outlier removal.
# ("self_reference_avg_sharess" is spelled with a double "s" on purpose here:
# it is used as a column label of dff in the loop below.)
columns_out = [
    'n_tokens_title', 'n_tokens_content', 'num_hrefs', 'num_self_hrefs',
    'num_imgs', 'num_videos', 'num_keywords',
    'kw_min_min', 'kw_max_min', "kw_avg_min",
    "kw_min_max", "kw_max_max", "kw_avg_max",
    "kw_min_avg", "kw_max_avg", "kw_avg_avg",
    "self_reference_min_shares", "self_reference_max_shares",
    "self_reference_avg_sharess",
]

# Filter one column at a time; each column's fences are computed on the rows
# that survived the previous columns' filters.
for column in columns_out:
    print(column)

    fences = iqr_fun(column, dff)
    print(fences)

    lower, upper = fences
    dff = dff[dff[column].between(lower, upper)]
    
    #print(dff.shape)

In [None]:
dff.shape

# Categorize Shares into 3 Bins. 

In [None]:
# Cap 'shares' at the upper fence computed during the exploration above, plus a margin of 1000.
share_cap = sMaxQ + 1000
dff = dff.loc[dff['shares'] <= share_cap]

In [None]:
dff.shares.describe()

In [None]:
# Renumber the surviving rows 0..n-1, discarding the old index.
dff = dff.reset_index(drop=True)

In [None]:
# Remove the 'url' and 'timedelta' columns before modelling.
dff = dff.drop(columns=['url', 'timedelta'])

In [None]:
# Split 'shares' into three quantile bins (poor / good / viral) and keep the
# bin edges, which the next cell reuses to build the numeric target.
share_quantiles = [0, 0.333, 0.6667, 1]
ser, bins = pd.qcut(dff["shares"], share_quantiles,
                    labels=['poor', 'good', 'viral'], retbins=True)

In [None]:
# Encode the share bin as the target column: 0, 1, 2 in increasing order of shares.
# assign() returns a new frame, so the column is not written into a frame that
# was produced by boolean filtering (which can raise SettingWithCopyWarning).
dff = dff.assign(
    share_cat=pd.cut(dff["shares"], bins=bins, labels=[0,1,2], include_lowest=True)
)

In [None]:
dff.head()

In [None]:
# Drop the raw share count: 'share_cat' is derived from it, so it must not be used as a feature.
dff = dff.drop(columns=['shares'])

In [None]:
# 80/20 train/test split of all remaining features against the share category.
# random_state is fixed so the split — and every score computed from it — is
# reproducible, matching the seeded splits used later in the notebook.
X_tr1, X_t1, y_tr1, y_t1 = train_test_split(
    dff.drop('share_cat', axis=1), dff['share_cat'], test_size=0.2, random_state=0
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression (liblinear solver) on every feature.
log2_model = LogisticRegression(solver='liblinear')
log2_model.fit(X_tr1, y_tr1)

# Predictions for both splits, used by the scoring cells that follow.
y_p_tr1, y_p_t1 = (log2_model.predict(features) for features in (X_tr1, X_t1))

In [None]:
# Plain and class-balanced accuracy of the baseline model on both splits.
acc_score_tr1 = accuracy_score(y_tr1, y_p_tr1)
acc_score_t1 = accuracy_score(y_t1, y_p_t1)

print("Acc Score on training set: {}".format(acc_score_tr1))
print("Acc Score on testing set: {}".format(acc_score_t1))
print("")

bacc_score_tr1 = balanced_accuracy_score(y_tr1, y_p_tr1)
bacc_score_t1 = balanced_accuracy_score(y_t1, y_p_t1)

print("Balanced Acc Score on training set: {}".format(bacc_score_tr1))
print("Balanced Acc Score on testing set: {}".format(bacc_score_t1))


In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_t1, y_p_t1)

In [None]:
from sklearn import metrics


### The model performs poorly. Try doing some feature selection first.

In [None]:
# Multinomial logistic regression; used as the estimator inside RFE and then refit on the selected features.
# NOTE(review): newer scikit-learn releases may deprecate the `multi_class` argument — confirm against the installed version.
log2_model2 = LogisticRegression(multi_class='multinomial', solver='newton-cg') #lbfgs/liblinear is default

In [None]:
from sklearn.feature_selection import RFE

In [None]:
rfe1 = RFE(log2_model2,n_features_to_select=6)

In [None]:
# Run recursive feature elimination on the training split only, then show
# the per-feature ranking and the boolean mask of the selected features.
rfe1.fit(X_tr1,y_tr1)
print(rfe1.ranking_)
print(rfe1.support_)

In [None]:
cols = X_tr1.columns[rfe1.support_]
cols

In [None]:
# Restrict the ORIGINAL split to the RFE-selected columns instead of
# re-splitting dff. RFE was fitted on X_tr1; a fresh split with a different
# seed would move rows that took part in feature selection into the test set
# (leakage) and make the test scores optimistic.
X_tr2, X_t2 = X_tr1[cols], X_t1[cols]
y_tr2, y_t2 = y_tr1, y_t1

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Refit the multinomial model on the selected features and report
# train/test metrics.
log2_model2.fit(X_tr2,y_tr2)

y_p_tr2= log2_model2.predict(X_tr2)
y_p_t2= log2_model2.predict(X_t2)

acc_score_tr2 = accuracy_score(y_tr2, y_p_tr2)
acc_score_t2 = accuracy_score(y_t2, y_p_t2)

print("Acc Score on training set: {}".format(acc_score_tr2))
print("Acc Score on testing set: {}".format(acc_score_t2))
print("")

bacc_score_tr2 = balanced_accuracy_score(y_tr2, y_p_tr2)
bacc_score_t2 = balanced_accuracy_score(y_t2, y_p_t2)

print("Balanced Acc Score on training set: {}".format(bacc_score_tr2))
print("Balanced Acc Score on testing set: {}".format(bacc_score_t2))
print("")

# Precision / recall / F1 are macro-averaged (unweighted mean over the three
# classes). With average='micro' on a single-label multiclass target all three
# collapse to the accuracy printed above and add no information.
precision_tr2 = precision_score(y_tr2, y_p_tr2, average='macro')
precision_t2 = precision_score(y_t2, y_p_t2, average='macro')

print("Precision Score on training set: {}".format(precision_tr2))
print("Precision Score on testing set: {}".format(precision_t2))
print("")

recall_tr2 = recall_score(y_tr2, y_p_tr2, average='macro')
recall_t2 = recall_score(y_t2, y_p_t2, average='macro')

print("Recall Score on training set: {}".format(recall_tr2))
print("Recall Score on testing set: {}".format(recall_t2))
print("")

f1_score_tr = f1_score(y_tr2, y_p_tr2, average='macro')
f1_score_t = f1_score(y_t2, y_p_t2, average='macro')

print("F1 Score on training set: {}".format(f1_score_tr))
print("F1 Score on testing set: {}".format(f1_score_t))

In [None]:
confusion_matrix(y_t2, y_p_t2)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_t2, y_p_t2))

In [None]:
# What if we eliminated NO outliers...

In [None]:
# Same target construction as before, but on the full data set (no outlier removal).
df1 = df.drop(columns=['url', 'timedelta'])

share_quantiles = [0, 0.3333, 0.6667, 1.0]
ser, bins = pd.qcut(df1["shares"], share_quantiles,
                    labels=['poor', 'good', 'viral'], retbins=True)

df1 = df1.assign(
    share_cat=pd.cut(df1["shares"], bins=bins, labels=[0,1,2], include_lowest=True)
)
df1.head()

In [None]:
# Drop the raw share count: 'share_cat' is derived from it, so it must not be used as a feature.
df1 = df1.drop(columns=['shares'])

In [None]:
# Seeded 80/20 split of every feature against the share category.
features_3a = df1.drop('share_cat', axis=1)
target_3a = df1['share_cat']

X_tr3a, X_t3a, y_tr3a, y_t3a = train_test_split(
    features_3a, target_3a, test_size=0.20, random_state=0
)

In [None]:
# Multinomial logistic regression (lbfgs solver) wrapped in recursive feature
# elimination that keeps 6 features; fitted on the training split only.
# NOTE(review): newer scikit-learn releases may deprecate the `multi_class` argument — confirm against the installed version.
log_model3 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
rfe3 = RFE(log_model3,n_features_to_select=6)
rfe3.fit(X_tr3a,y_tr3a)
# Per-feature ranking and boolean mask of the selected features.
print(rfe3.ranking_)
print(rfe3.support_)

In [None]:
# Names of the features selected by RFE.
cols = X_tr3a.columns[rfe3.support_]

# Reuse the split RFE was fitted on, restricted to the selected columns,
# rather than calling train_test_split a second time. This guarantees that
# feature selection and evaluation use the same partition regardless of how
# the earlier split is seeded.
X_tr3, X_t3 = X_tr3a[cols], X_t3a[cols]
y_tr3, y_t3 = y_tr3a, y_t3a

cols

In [None]:
log_model3.fit(X_tr3,y_tr3)

In [None]:
# Train/test metrics for the model fitted without outlier removal.
y_p_tr3= log_model3.predict(X_tr3)
y_p_t3= log_model3.predict(X_t3)

acc_score_tr3 = accuracy_score(y_tr3, y_p_tr3)
acc_score_t3 = accuracy_score(y_t3, y_p_t3)

print("Acc Score on training set: {}".format(acc_score_tr3))
print("Acc Score on testing set: {}".format(acc_score_t3))
print("")

bacc_score_tr3 = balanced_accuracy_score(y_tr3, y_p_tr3)
bacc_score_t3 = balanced_accuracy_score(y_t3, y_p_t3)

print("Balanced Acc Score on training set: {}".format(bacc_score_tr3))
print("Balanced Acc Score on testing set: {}".format(bacc_score_t3))
print("")

# Precision / recall / F1 are macro-averaged (unweighted mean over the three
# classes). With average='micro' on a single-label multiclass target all three
# collapse to the accuracy printed above and add no information.
precision_tr3 = precision_score(y_tr3, y_p_tr3, average='macro')
precision_t3 = precision_score(y_t3, y_p_t3, average='macro')

print("Precision Score on training set: {}".format(precision_tr3))
print("Precision Score on testing set: {}".format(precision_t3))
print("")

recall_tr3 = recall_score(y_tr3, y_p_tr3, average='macro')
recall_t3 = recall_score(y_t3, y_p_t3, average='macro')

print("Recall Score on training set: {}".format(recall_tr3))
print("Recall Score on testing set: {}".format(recall_t3))
print("")

f1_score_tr = f1_score(y_tr3, y_p_tr3, average='macro')
f1_score_t = f1_score(y_t3, y_p_t3, average='macro')

print("F1 Score on training set: {}".format(f1_score_tr))
print("F1 Score on testing set: {}".format(f1_score_t))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_t3, y_p_t3))

In [None]:
confusion_matrix(y_t3, y_p_t3)

# RANDOM FOREST

In [None]:
# Random forest on the RFE-selected features (no outlier removal).
# Seeded so the fitted forest, and the confusion matrix computed from it,
# are reproducible from run to run.
ks_rf = RandomForestClassifier(random_state=0).fit(X_tr3, y_tr3)

In [None]:
y_p_t_RF = ks_rf.predict(X_t3)
confusion_matrix(y_t3,y_p_t_RF)

## Random Forest with Outliers Removed

In [None]:
# Fresh copy of the data with IQR-based outlier removal applied column by column.
df2 = df.copy()

# ("self_reference_avg_sharess" is spelled with a double "s" on purpose here:
# it is used as a column label of df2 in the loop below.)
columns_out = [
    'n_tokens_title', 'n_tokens_content', 'num_hrefs', 'num_self_hrefs',
    'num_imgs', 'num_videos', 'num_keywords',
    'kw_min_min', 'kw_max_min', "kw_avg_min",
    "kw_min_max", "kw_max_max", "kw_avg_max",
    "kw_min_avg", "kw_max_avg", "kw_avg_avg",
    "self_reference_min_shares", "self_reference_max_shares",
    "self_reference_avg_sharess",
]

# Each column's fences are computed on the rows that survived the previous filters.
for column in columns_out:
    print(column)

    lower, upper = iqr_fun(column, df2)
    df2 = df2[df2[column].between(lower, upper)]

In [None]:
df2.columns

In [None]:
# Cap 'shares' at the exploration-stage upper fence plus a margin of 1000 and
# renumber the surviving rows.
df2 = df2[df2.shares<=sMaxQ+1000].reset_index(drop=True)

# Three quantile bins of 'shares'; the returned edges are reused to build the
# numeric target (0, 1, 2 in increasing order of shares).
ser, bins = pd.qcut(df2["shares"], [0,0.333,0.6667,1], retbins=True, labels=['poor','good','viral'])

# assign() returns a new frame, so the column is not written into a frame that
# was produced by boolean filtering (which can raise SettingWithCopyWarning).
df2 = df2.assign(
    share_cat=pd.cut(df2["shares"], bins=bins, labels=[0,1,2], include_lowest=True)
)
df2.head()

In [None]:
# Drop the raw share count (the target is derived from it) together with
# 'url' and 'timedelta', as the other two modelling pipelines do. 'url' holds
# strings, so leaving it in would make the RFE / logistic-regression fit below
# fail when the features are converted to floats.
df2 = df2.drop(columns=['url', 'timedelta', 'shares'])

In [None]:
# 80/20 split of every feature against the share category; seeded so the
# selected features and all downstream scores are reproducible.
X_tr4a, X_t4a, y_tr4a, y_t4a = train_test_split(
    df2.drop('share_cat', axis=1), df2['share_cat'], test_size=0.2, random_state=0
)

In [None]:
# Multinomial logistic regression (lbfgs solver) used as the estimator for
# recursive feature elimination; this run keeps 8 features.
# NOTE(review): newer scikit-learn releases may deprecate the `multi_class` argument — confirm against the installed version.
log_model4 = LogisticRegression(multi_class='multinomial', solver='lbfgs')

rfe4 = RFE(log_model4,n_features_to_select=8)
# Fitted on the training split only.
rfe4.fit(X_tr4a,y_tr4a)
# Per-feature ranking and boolean mask of the selected features.
print(rfe4.ranking_)
print(rfe4.support_)

In [None]:
# Names of the features selected by RFE.
cols = X_tr4a.columns[rfe4.support_]

# Restrict the split RFE was fitted on to the selected columns instead of
# re-splitting df2: a second split with a different seed would put rows that
# took part in feature selection into the test set (leakage).
X_tr4, X_t4 = X_tr4a[cols], X_t4a[cols]
y_tr4, y_t4 = y_tr4a, y_t4a

cols

In [None]:


# Random forest on the RFE-selected features of the outlier-filtered data.
# Seeded so the fitted forest, and the confusion matrix computed from it,
# are reproducible from run to run.
ks_rf4 = RandomForestClassifier(random_state=0).fit(X_tr4, y_tr4)

In [None]:
y_p_t_RF4 = ks_rf4.predict(X_t4)
confusion_matrix(y_t4,y_p_t_RF4)

# Support Vector Machines

In [None]:
from sklearn import svm

In [None]:
# Support vector classifier trained on the same training split as the
# baseline logistic regression (all features).
# NOTE(review): the features are not scaled before fitting — confirm this is intended.
clf = svm.SVC(gamma='auto')
clf.fit(X_tr1, y_tr1)

In [None]:
clf.score(X_t1, y_t1)