In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn_pandas.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

In [2]:
df = pd.read_json('prepped_data.json').sort_index()

In [3]:
df.head()

Unnamed: 0,age,num_comments,score,text,timestamp,weekday_posted,hour_posted,log_score,log_comments
0,300,1588,21720,China Killing Prisoners To Harvest Organs For ...,2019-06-19 11:49:08,2,11,9.985989,7.370237
1,240,402,2661,Muslim family dragged out of Belgian embassy i...,2019-06-19 12:05:31,2,12,7.886457,5.996477
2,660,3320,46977,Women outperform men after Japan medical schoo...,2019-06-19 05:51:44,2,5,10.757413,8.107723
3,360,202,1474,MH17 crash: Investigators 'to charge four with...,2019-06-19 10:50:51,2,10,7.295735,5.308317
4,660,1336,2665,Iranian official calls on world to unite again...,2019-06-19 05:09:15,2,5,7.887959,7.197443


In [4]:
X = df[['text','age','weekday_posted','hour_posted']]
y = df['log_score']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)


In [6]:
X_train.head()

Unnamed: 0,text,age,weekday_posted,hour_posted
4867,Donald Trump Jr. on Sunday claimed CNN is cove...,1440,0,4
1395,"Oxfam warns of the ""worst cholera outbreak in ...",1440,5,11
2819,Exposure to weed killing products increases ri...,1440,3,11
567,Hong Kong protesters demand China be held to a...,660,0,0
1444,Austrian Government Seeks to Eliminate Interne...,180,4,10


Extracting Embeddings from GloVe

In [7]:
headers = X_train.text
data = headers.map(word_tokenize).values
total_vocabulary = set(word for headline in data for word in headline)

In [8]:
glove = {}
with open('/Users/patrickfuller/flatiron/glove/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [9]:
# LETS BORROW SOME LOVELY CODE FROM OUR CURRICULUM!
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    

    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])



In [10]:
w2v = W2vVectorizer(glove)
ohe = OneHotEncoder(drop='first',categories='auto')
ss = StandardScaler()


In [11]:
mapper = DataFrameMapper([
    ('text', w2v),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [12]:
lr = LinearRegression(n_jobs=-1)
rfr = RandomForestRegressor(n_jobs=1, n_estimators=100)
gbr = GradientBoostingRegressor(n_estimators=100)
knnr = KNeighborsRegressor(n_jobs=-1)
regressors = [lr,rfr,gbr,knnr]

In [19]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)is
	:6790.891015102855
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)is
	:6819.854030568219
The MAE of the GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                  

In [13]:
y2 = df['log_comments']

In [14]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, random_state=2019)

In [22]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y2_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y2_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
	is:568.3032034701123
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
	is:553.4577578405839
The MAE of the GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                  

In [17]:
tfidf = TfidfVectorizer(max_features=2000)

mapper = DataFrameMapper([
    ('text', w2v),
    ('text', tfidf),       # Adding a second feature extraction on text
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [18]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
	is:14254.068314102651
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
	is:6755.867641844939
The MAE of the GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                 

In [19]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y2_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y2_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
	is:1023.902251612284
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
	is:537.9441001311949
The MAE of the GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                  

In [20]:
tfidf = TfidfVectorizer(max_features=4000)

mapper = DataFrameMapper([
    ('text', w2v),
    ('text', tfidf),       # Adding a second feature extraction on text
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [21]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
	is:45211509309.16554
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
	is:6773.908494301302
The MAE of the GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                  