In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt


# Predicting article popularity on Facebook, Google+, and LinkedIn

### Download data
Source: http://archive.ics.uci.edu/ml/datasets/News+Popularity+in+Multiple+Social+Media+Platforms

In [None]:
%%bash

# Fetch each dataset unless a local copy already exists (-nc).
# Every download runs in the foreground (no trailing `&`) so that all four
# files are guaranteed to be complete before later cells read them.
wget -nc -O news-final.csv http://archive.ics.uci.edu/ml/machine-learning-databases/00432/Data/News_Final.csv
wget -nc -O facebook-microsoft.csv http://archive.ics.uci.edu/ml/machine-learning-databases/00432/Data/Facebook_Microsoft.csv
wget -nc -O googleplus-microsoft.csv http://archive.ics.uci.edu/ml/machine-learning-databases/00432/Data/GooglePlus_Microsoft.csv
wget -nc -O linkedin-microsoft.csv http://archive.ics.uci.edu/ml/machine-learning-databases/00432/Data/LinkedIn_Microsoft.csv


### Load data and set index

In [None]:
# Article-level table: keep only rows whose Topic is 'microsoft', drop the
# now-constant Topic column, and index by an integer IDLink.
# `drop(columns=...)` is used because the positional `axis` argument of
# DataFrame.drop is rejected by pandas >= 2.0, and the whole chain builds a new
# frame instead of writing a column into a filtered slice.
news_raw = pd.read_csv('news-final.csv')
news = (
    news_raw[news_raw['Topic'] == 'microsoft']
    .drop(columns='Topic')
    .astype({'IDLink': int})
    .set_index('IDLink')
)


# Per-platform tables, each indexed by IDLink.
facebook = pd.read_csv('facebook-microsoft.csv', index_col='IDLink')

google = pd.read_csv('googleplus-microsoft.csv', index_col='IDLink')

linkedin = pd.read_csv('linkedin-microsoft.csv', index_col='IDLink')


### Restrict to articles appearing on all 3 sites

In [None]:
# A row is kept only when the smallest of its three platform values is
# greater than -1.
# NOTE(review): presumably -1 marks an article absent from a platform —
# confirm against the dataset documentation.
platform_columns = ['Facebook', 'GooglePlus', 'LinkedIn']
lowest_platform_value = news[platform_columns].min(axis=1)
posted_everywhere = lowest_platform_value.gt(-1)

news = news.loc[posted_everywhere]

### Restrict to sources with at least 50 articles

In [None]:
# Flag sources by article count. The section heading and the message printed
# below both say "at least 50", so the comparison is inclusive; a strict `> 50`
# would silently exclude sources with exactly 50 articles.
# (The variable name `greater_than_50` is kept so other cells that may refer
# to it keep working.)
greater_than_50 = (news['Source'].value_counts() >= 50)

# The mask is already boolean, so it can index itself directly.
sources_50 = greater_than_50[greater_than_50].index

print("Number of sources with at least 50 articles: {}".format(greater_than_50.sum()))

news = news[news['Source'].isin(sources_50)]


### Process dates

In [None]:
# Parse the publish timestamp once and derive the calendar features from it.
# `assign` returns a new frame, which (a) avoids writing into a filtered slice
# (SettingWithCopyWarning) and (b) guarantees the PublishDate column really
# becomes datetime64: under pandas 2.x, `news.loc[:, col] = ...` writes into
# the existing object-dtype column in place, after which `.dt` fails.
# The cell is also safe to re-run, since parsing an already-parsed column is a
# no-op.
publish_date = pd.to_datetime(news['PublishDate'])

news = news.assign(
    PublishDate=publish_date,
    Year=publish_date.dt.year,
    Month=publish_date.dt.month,
    DayOfWeek=publish_date.dt.weekday,
    Hour=publish_date.dt.hour,
)

In [None]:
import datetime

In [None]:
# Split on a fixed cutoff: articles published on or before 1 May 2016 go to
# `news_historical`, later ones to `news_new`.
may_1_2016 = datetime.datetime(2016, 5, 1)

news_historical = news[news['PublishDate'] <= may_1_2016]
news_new = news[news['PublishDate'] > may_1_2016]

# Latest publish date in the data. Placed last so the notebook actually
# displays it; as a bare expression in the middle of the cell its value was
# computed and thrown away.
news['PublishDate'].max()

In [None]:
# Write the post-cutoff articles (index included) to a CSV file.
new_articles_csv = 'news-new.csv'
news_new.to_csv(new_articles_csv)

## Descriptive Analytics

In [None]:
# Peek at the first two rows of the prepared article table.
news.iloc[:2]

**Year published**

In [None]:
# Number of articles per publication year, drawn as a bar chart
# (value_counts orders the bars by descending count).
articles_per_year = news['PublishDate'].dt.year.value_counts()
articles_per_year.plot(kind='bar')

**Day of week**

In [None]:
# Number of articles per weekday (0 = Monday), drawn as a bar chart
# (value_counts orders the bars by descending count).
articles_per_weekday = news['PublishDate'].dt.weekday.value_counts()
articles_per_weekday.plot(kind='bar')

**Sentiment**

In [None]:
# Constant column used as the y-coordinate of the one-dimensional sentiment
# strip plot below. `assign` builds a new frame rather than writing into
# `news` in place, which avoids SettingWithCopyWarning on the filtered frame
# and keeps the cell idempotent.
news = news.assign(ones=1)

In [None]:
# One-dimensional strip of title sentiment: every article is a translucent
# point on a flat, wide axis, with a dashed line at the mean sentiment.
sentiment_ax = news.plot.scatter(x='SentimentTitle', y='ones', alpha=0.1, figsize=(14, 0.5))

mean_title_sentiment = news['SentimentTitle'].mean()
sentiment_ax.axvline(mean_title_sentiment, alpha=0.4, ls='--')

In [None]:
# Show the first five rows, now including the derived columns.
news.iloc[:5]

In [None]:
# Title sentiment (x) against the Facebook column (y), one point per article.
news.plot(kind='scatter', x='SentimentTitle', y='Facebook')

## ML

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import base

In [None]:
class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that pulls a fixed set of columns out of its input.

    Parameters
    ----------
    col_names : list
        Column labels to select in ``transform`` (e.g. ``['Source']``).
    """

    def __init__(self, col_names):
        # Kept as-is on the instance; read again in transform().
        self.col_names = col_names

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as an array.

        The result has the same number of rows as ``X`` and one column per
        entry in ``self.col_names``.
        """
        selected = X[self.col_names]
        return selected.values

In [None]:
# Training frame: the historical articles without the raw timestamp and the
# Headline text. `columns=` replaces the positional `axis` argument of
# DataFrame.drop, which pandas >= 2.0 no longer accepts.
df = news_historical.drop(columns=['PublishDate', 'Headline'])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Small demonstration of a bag-of-words encoding on the first five titles.
# Terms are kept only if their document frequency lies between 5% and 95%.
sample_titles = df['Title'].head()

bag_of_words_vectorizer = CountVectorizer(min_df=0.05, max_df=0.95)

counts = bag_of_words_vectorizer.fit_transform(sample_titles)

In [None]:
# Column labels for the count matrix. `get_feature_names()` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out()`; use the new method
# when it exists and fall back to the old one on older installations.
_get_vocabulary = (
    getattr(bag_of_words_vectorizer, 'get_feature_names_out', None)
    or bag_of_words_vectorizer.get_feature_names
)
cols = _get_vocabulary()

# Dense view of the (tiny) demo matrix: one row per title, one column per term.
pd.DataFrame(counts.toarray(), columns=cols)

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# One-hot encode the article source. handle_unknown='ignore' makes a category
# that was not present during fit encode as an all-zero row at transform time
# instead of raising, so scoring or predicting on later articles cannot fail
# because of a value the training data never contained.
source_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['Source'])),
    ('oh', OneHotEncoder(handle_unknown='ignore'))
])


# TF-IDF features of the title text. The column selector returns a 2-D array
# with a single column; the vectorizer wants a 1-D sequence of strings, hence
# the flattening step in between.
counts_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['Title'])),
    ('fn', FunctionTransformer(lambda x: x.reshape(-1))),
    ('counts', TfidfVectorizer(min_df = 0.01, max_df = 0.9))
])

# One-hot encode the publication hour, tolerating hours unseen during fit for
# the same reason as the source encoder above.
hour_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['Hour'])),
    ('oh', OneHotEncoder(handle_unknown='ignore'))
])

# The two sentiment scores are passed through unchanged.
sentiment_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['SentimentTitle', 'SentimentHeadline']))
])

# Concatenate all feature groups side by side.
features = FeatureUnion([
    ('source', source_pipe),
    ('hour', hour_pipe),
    ('counts', counts_pipe),
    ('sentiment', sentiment_pipe)
])


# Smoke test: fit the union on the training frame and display the result.
features.fit_transform(df)




In [None]:
# Each model gets its own clone of its feature extractor. Without cloning, the
# three pipelines would hold the very same transformer objects (`counts_pipe`
# is itself a member of `features`), so fitting one model would refit the
# feature steps inside the others.

# The Facebook model uses only the TF-IDF title features; the validation cell
# relies on this to pair each regression coefficient with a word.
facebook_model = Pipeline([
    ('features', base.clone(counts_pipe)),
    ('model', LinearRegression())
])

facebook_model.fit(df, df['Facebook'])

# The Google+ and LinkedIn models use the full feature union.
google_model = Pipeline([
    ('features', base.clone(features)),
    ('model', LinearRegression())
])

google_model.fit(df, df['GooglePlus'])

linkedin_model = Pipeline([
    ('features', base.clone(features)),
    ('model', LinearRegression())
])

linkedin_model.fit(df, df['LinkedIn'])

## Validate models

In [None]:
# Independent (deep) copy of the post-cutoff articles, so columns added later
# do not touch `news_new`.
df_new = news_new.copy(deep=True)

In [None]:
# Pair every vocabulary term of the Facebook model's TF-IDF step with the
# coefficient the linear model learned for it.
facebook_vectorizer = facebook_model.named_steps['features'].named_steps['counts']
facebook_regressor = facebook_model.named_steps['model']

# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back to the old name on older versions.
_get_vocabulary = (
    getattr(facebook_vectorizer, 'get_feature_names_out', None)
    or facebook_vectorizer.get_feature_names
)

word_features = list(zip(_get_vocabulary(), facebook_regressor.coef_))

# Ascending by coefficient: most negative words first, most positive last.
sorted(word_features, key=lambda x: x[1])

In [None]:
# Report the Facebook model's score() on the training frame and on the
# post-cutoff articles.
print('Facebook Model')
facebook_train_score = facebook_model.score(df, df['Facebook'])
print('training: {}'.format(facebook_train_score))
facebook_test_score = facebook_model.score(df_new, df_new['Facebook'])
print('testing: {}'.format(facebook_test_score))

In [None]:
# Report the Google+ model's score() on the training frame and on the
# post-cutoff articles.
print('Google Model')
google_train_score = google_model.score(df, df['GooglePlus'])
print('training: {}'.format(google_train_score))
google_test_score = google_model.score(df_new, df_new['GooglePlus'])
print('testing: {}'.format(google_test_score))

In [None]:
# Report the LinkedIn model's score() on the training frame and on the
# post-cutoff articles.
print('LinkedIn Model')
linkedin_train_score = linkedin_model.score(df, df['LinkedIn'])
print('training: {}'.format(linkedin_train_score))
linkedin_test_score = linkedin_model.score(df_new, df_new['LinkedIn'])
print('testing: {}'.format(linkedin_test_score))

## Use models

In [None]:
# Run each fitted model on the post-cutoff articles, in the order
# Facebook, Google+, LinkedIn.
facebook_predict, google_predict, linkedin_predict = (
    fitted_model.predict(df_new)
    for fitted_model in (facebook_model, google_model, linkedin_model)
)

In [None]:
# Store each model's predictions in its own column of df_new.
prediction_columns = (
    ('FacebookPrediction', facebook_predict),
    ('GooglePrediction', google_predict),
    ('LinkedInPrediction', linkedin_predict),
)
for prediction_column, predicted_values in prediction_columns:
    df_new[prediction_column] = predicted_values

### Compare predicted results

In [None]:
# First rows of the post-cutoff articles: identifying columns next to the
# three predictions.
comparison_columns = [
    'Title', 'Source', 'PublishDate',
    'FacebookPrediction', 'GooglePrediction', 'LinkedInPrediction',
]
df_new.loc[:, comparison_columns].head()