In [157]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn_pandas.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
df = pd.read_json('prepped_data.json').sort_index()

In [4]:
df.head()

Unnamed: 0,age,num_comments,score,text,timestamp,weekday_posted,hour_posted,log_score,log_comments
0,300,1588,21720,China Killing Prisoners To Harvest Organs For ...,2019-06-19 11:49:08,2,11,9.985989,7.370237
1,240,402,2661,Muslim family dragged out of Belgian embassy i...,2019-06-19 12:05:31,2,12,7.886457,5.996477
2,660,3320,46977,Women outperform men after Japan medical schoo...,2019-06-19 05:51:44,2,5,10.757413,8.107723
3,360,202,1474,MH17 crash: Investigators 'to charge four with...,2019-06-19 10:50:51,2,10,7.295735,5.308317
4,660,1336,2665,Iranian official calls on world to unite again...,2019-06-19 05:09:15,2,5,7.887959,7.197443


In [5]:
X = df[['text','age','weekday_posted','hour_posted']]
y = df['log_score']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)


In [7]:
X_train.head()

Unnamed: 0,text,age,weekday_posted,hour_posted
4867,Donald Trump Jr. on Sunday claimed CNN is cove...,1440,0,4
1395,"Oxfam warns of the ""worst cholera outbreak in ...",1440,5,11
2819,Exposure to weed killing products increases ri...,1440,3,11
567,Hong Kong protesters demand China be held to a...,660,0,0
1444,Austrian Government Seeks to Eliminate Interne...,180,4,10


Extracting Embeddings from GloVe

In [8]:
headers = X_train.text
data = headers.map(word_tokenize).values
total_vocabulary = set(word for headline in data for word in headline)

In [32]:
glove = {}
with open('/Users/patrickfuller/flatiron/glove/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [33]:
# LETS BORROW SOME LOVELY CODE FROM OUR CURRICULUM!
from webapp import vectorizer


In [34]:
from imp import reload
reload(vectorizer)

<module 'webapp.vectorizer' from '/Users/patrickfuller/flatiron/4_mod/mod4_proj/webapp/vectorizer.py'>

In [35]:
w2v = vectorizer.W2vVectorizer(glove)
ohe = OneHotEncoder(drop='first',categories='auto')
ss = StandardScaler()


In [36]:
mapper = DataFrameMapper([
    ('text', w2v),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [14]:
lr = LinearRegression(n_jobs=-1)
rfr = RandomForestRegressor(n_jobs=1, n_estimators=100)
gbr = GradientBoostingRegressor(n_estimators=100)
knnr = KNeighborsRegressor(n_jobs=-1)
regressors = [lr,rfr,gbr,knnr]

In [16]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
	is:6790.891015102855


KeyboardInterrupt: 

### Low model for score prediction seems to be GBT when using GloVe

In [17]:
y2 = df['log_comments']

In [18]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, random_state=2019)

In [None]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y2_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y2_test),preds)}')

#### Low model for comment prediction seems to be GBT as well when using GloVe

### Could using TFIDF in conjunction with GloVe reduce model error?

In [None]:
tfidf = TfidfVectorizer(max_features=2000)

mapper = DataFrameMapper([
    ('text', w2v),
    ('text', tfidf),       # Adding a second feature extraction on text
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

#### Evaluate GloVe + TFIDF model error for score prediction

In [None]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

#### Evaluate GloVe + TFIDF model error for comment prediction 

In [None]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y2_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y2_test),preds)}')

#### A new low of 532: GBT GloVe + TFIDF

## Could adding features to TFIDF reduce model error?

In [None]:
tfidf = TfidfVectorizer(max_features=4000)

mapper = DataFrameMapper([
    ('text', w2v),
    ('text', tfidf),       # Adding a second feature extraction on text
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [None]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

#### Doesn't look like adding features to tfidf helps

### Lets pickle our best performers!

In [284]:
# Score_predictor
log_score_mapper = DataFrameMapper([
    ('text', w2v),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
    ])
log_score_model = Pipeline(steps=[
        ('transform', log_score_mapper),
        ('regressor', gbr)
    ])

# Comment_predictor
tfidf_c = TfidfVectorizer(max_features=2000)
w2v_c = vectorizer.W2vVectorizer(glove)
ohe_c = OneHotEncoder(drop='first',categories='auto')
ss_c = StandardScaler()

gbr_c = GradientBoostingRegressor()

log_comment_mapper = DataFrameMapper([
    ('text', w2v_c),
    ('text', tfidf_c),       # Adding a second feature extraction on text
    (['age'], ss_c),
    (['weekday_posted', 'hour_posted'], ohe_c),
    ])
log_comment_model = Pipeline(steps=[
        ('transform', log_comment_mapper),
        ('regressor', gbr_c)
    ])

In [285]:
log_score_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('transform',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[('text',
                                            W2vVectorizer(w2v={'!': array([-0.58402 ,  0.39031 ,  0.65282 , -0.3403  ,  0.19493 , -0.83489 ,
        0.11929 , -0.57291 , -0.56844 ,  0.72989 , -0.56975 ,  0.53436 ,
       -0.38034 ,  0.22471 ,  0.98031 , -0.2966  ,  0.126   ,  0.55222 ,
       -0.62737 , -0.082242, -0.085359,  0.31515 ,  0.96077 ,  0.31986 ,
        0.87878 , -1.5189  , -...
                                           init=None, learning_rate=0.1,
                                           loss='ls', max_depth=3,
                                           max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                    

In [39]:
log_score_model.predict(X_test[0:1])

array([7.72596028])

In [40]:
log_comment_model.fit(X_train, y2_train)

Pipeline(memory=None,
         steps=[('transform',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[('text',
                                            W2vVectorizer(w2v={'!': array([-0.58402 ,  0.39031 ,  0.65282 , -0.3403  ,  0.19493 , -0.83489 ,
        0.11929 , -0.57291 , -0.56844 ,  0.72989 , -0.56975 ,  0.53436 ,
       -0.38034 ,  0.22471 ,  0.98031 , -0.2966  ,  0.126   ,  0.55222 ,
       -0.62737 , -0.082242, -0.085359,  0.31515 ,  0.96077 ,  0.31986 ,
        0.87878 , -1.5189  , -...
                                           init=None, learning_rate=0.1,
                                           loss='ls', max_depth=3,
                                           max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                    

In [41]:
log_comment_model.predict(X_test[0:1])

array([5.34227749])

In [42]:
X_train.head(1)

Unnamed: 0,text,age,weekday_posted,hour_posted
4867,Donald Trump Jr. on Sunday claimed CNN is cove...,1440,0,4


In [43]:
age = 12
weekday = 1
hour = 18
text = 'Donald Trump Korea'

arguments = pd.DataFrame([[text, age, weekday, hour]], columns=['text', 'age', 'weekday_posted', 'hour_posted'])
arguments

Unnamed: 0,text,age,weekday_posted,hour_posted
0,Donald Trump Korea,12,1,18


In [44]:
log_score_model.predict(arguments[0:1])

array([4.30917486])

In [45]:
log_comment_model.predict(arguments[0:1])

array([3.35623333])

In [286]:
import pickle

In [287]:
with open('log_score_model.pkl', 'wb') as f:
    pickle.dump(log_score_model, f)

In [48]:
with open('log_comment_model.pkl', 'wb') as f:
    pickle.dump(log_comment_model, f)

In [260]:
from webapp import vectorizer

In [261]:
# FunctionTransformer(lambda x: x).fit(X_train.text)     # Function Transformer can not handle text

In [262]:
text_transformer = vectorizer.TextTransformer()
text_transformer.fit(X_train.text, y_test)

TextTransformer()

In [263]:
text_transformer.transform(X_train.text)[:5]

<class 'pandas.core.series.Series'>


array([list(['Donald', 'Trump', 'Jr.', 'on', 'Sunday', 'claimed', 'CNN', 'is', 'covering', 'up', 'for', '``', 'leftist', 'hack', "''", 'Carl', 'Bernstein', ',', 'a', 'veteran', 'journalist', 'known', 'for', 'his', 'coverage', 'of', 'Watergate', ',', 'over', 'his', 'reporting', 'about', 'the', '2016', 'Trump', 'Tower', 'meeting', '.']),
       list(['Oxfam', 'warns', 'of', 'the', '``', 'worst', 'cholera', 'outbreak', 'in', 'the', 'world', "''", 'in', 'Yemen']),
       list(['Exposure', 'to', 'weed', 'killing', 'products', 'increases', 'risk', 'of', 'cancer', 'by', '41', '%', ',', 'finds', 'a', 'new', 'study', 'that', 'provides', 'evidence', 'that', '‘', 'supports', 'link', '’', 'between', 'exposures', 'to', 'glyphosate', 'herbicides', 'and', 'increased', 'risk', 'for', 'non-Hodgkin', 'lymphoma', '.']),
       list(['Hong', 'Kong', 'protesters', 'demand', 'China', 'be', 'held', 'to', 'account', 'for', '1989', 'Tiananmen', 'Massacre', '.']),
       list(['Austrian', 'Government', 'Seeks',

In [272]:
gbr = GradientBoostingRegressor(n_estimators=40)
text_transformer = vectorizer.TextTransformer()
w2v = vectorizer.W2vVectorizer(glove)
text_pipe = Pipeline(steps=[
    ('tok', text_transformer),
    ('w2v', w2v)
])

log_score_mapper = DataFrameMapper([
    ('text', text_pipe),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
    ])
log_score_model = Pipeline(steps=[
        ('transform', log_score_mapper),
        ('regressor', gbr)
    ])

In [267]:
log_score_model.fit(X_train, y_train)

<class 'numpy.ndarray'>


Pipeline(memory=None,
         steps=[('transform',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[('text',
                                            Pipeline(memory=None,
                                                     steps=[('tok',
                                                             TextTransformer()),
                                                            ('w2v',
                                                             W2vVectorizer(w2v={'!': array([-0.58402 ,  0.39031 ,  0.65282 , -0.3403  ,  0.19493 , -0.83489 ,
        0.11929 , -0.57291 , -0.56844 ,  0.72989 , -0.56975 ,  0.53436 ,
       -0.38034 ,  0.22471 ,  0.98031 , -0.2966  ,  0.126   ,  0.55222 ,
       -0.62737 ,...
                                           init=None, learning_rate=0.1,
                                           loss='ls', max_depth=3,
                                           max_features=None,
                        

In [271]:
preds = np.exp(log_score_model.predict(X_test))
print(f'The MAE of the {str(regressor)}'
        f'\n\tis:{mean_absolute_error(np.exp(y_test),preds)}')

<class 'numpy.ndarray'>
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)
	is:6678.364767414466


##  Didn't do much better hmm
### maybe on comments?

In [282]:
# Comment_predictor
tfidf_c = TfidfVectorizer(max_features=2000)
w2v_c = vectorizer.W2vVectorizer(glove)
ohe_c = OneHotEncoder(drop='first',categories='auto')
ss_c = StandardScaler()

text_transformer_c = vectorizer.TextTransformer()
text_pipe_c = Pipeline(steps=[
    ('tok', text_transformer_c),
    ('w2v', w2v_c)
])

gbr_c = GradientBoostingRegressor(n_estimators=30)

log_comment_mapper = DataFrameMapper([
    ('text', text_pipe_c),
    ('text', tfidf_c),       # Adding a second feature extraction on text
    (['age'], ss_c),
    (['weekday_posted', 'hour_posted'], ohe_c),
    ])
log_comment_model = Pipeline(steps=[
        ('transform', log_comment_mapper),
        ('regressor', gbr_c)
    ])

In [283]:
log_comment_model.fit(X_train, y2_train)

<class 'numpy.ndarray'>


Pipeline(memory=None,
         steps=[('transform',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[('text',
                                            Pipeline(memory=None,
                                                     steps=[('tok',
                                                             TextTransformer()),
                                                            ('w2v',
                                                             W2vVectorizer(w2v={'!': array([-0.58402 ,  0.39031 ,  0.65282 , -0.3403  ,  0.19493 , -0.83489 ,
        0.11929 , -0.57291 , -0.56844 ,  0.72989 , -0.56975 ,  0.53436 ,
       -0.38034 ,  0.22471 ,  0.98031 , -0.2966  ,  0.126   ,  0.55222 ,
       -0.62737 ,...
                                           init=None, learning_rate=0.1,
                                           loss='ls', max_depth=3,
                                           max_features=None,
                        

In [278]:
tfidf_c.fit_transform(X_train.text)

<3631x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 48962 stored elements in Compressed Sparse Row format>