# Loading Data and Packages

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim import corpora
from gensim import models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import numpy as np
import string
from scipy.sparse import hstack
from scipy.sparse import vstack

# fix for XGBoost errors
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [6]:
import warnings
warnings.filterwarnings("ignore")

First, let's read in the dataset and take a initial look at it.

In [7]:
df_train = pd.read_csv('training_set.csv',
                       encoding = 'latin-1',
                       parse_dates = ['Created'])

df_hold = pd.read_csv('holdout_set.csv',
                      encoding = 'latin-1',
                      parse_dates = ['Created'])

In [8]:
X = df_train.drop('Engagements', axis = 1)
X['data_type'] = "training"
df_hold['data_type'] = "hold"
X = X.append(df_hold.drop('Engagements', axis = 1))

Y = df_train['Engagements']

# Feature Engineering

Next, we will create more features from our dataset mainly to capture time effect/seasonality and also use the text/captions from the posts. 

## Time Series - Month Seasonality with Trend

In [9]:
#doing this to be able to put this into linear regression
X['month'] = X.Created.apply(lambda x: x.month) #seasonal term
X['year_month'] = X.Created.apply(lambda x: x.month + x.year * 12) #trend term

## Time Series - Hourly with day of Week and Trend

In [10]:
#doing this to be able to put this into linear regression
X['hour'] = X.Created.apply(lambda x: x.hour) #seasonal term
X['weekend'] = X.Created.apply(lambda x: int(x.dayofweek >= 5)) #seasonal term
X['weekend_hour_interaction'] = X.Created.apply(lambda x: int(x.dayofweek >= 5) * x.hour) #seasonal term
X['weekday_hour_interaction'] = X.Created.apply(lambda x: int(x.dayofweek < 5) * x.hour) #seasonal term

In [11]:
#adding features
X['day_of_week'] = X.Created.apply(lambda x: x.dayofweek)
X = pd.get_dummies(X, columns = ["day_of_week"])

In [12]:
###just one hot encoding everything (not commented out)
X = pd.get_dummies(X, columns = ['hour', 'weekend', 'weekend_hour_interaction', 'weekday_hour_interaction', 'month'])

## Getting Features from text:

In [13]:
# filling NA with empty text
X.Description.fillna("", inplace = True)

In [14]:
# Initial bag-of-words method feature engineering

X['containsLink'] = X.Description.str.contains('.http').astype(float)
X['exclamationPointCount'] =X.Description.str.count('!').astype(float)
X['questionMarkCount'] = X.Description.str.count('\?').astype(float)
X['doubleQuotationMarkCount'] = X.Description.str.count('\"').astype(float)
X['singleQuoteMarkCount'] = X.Description.str.count('\'').astype(float)
X['commaMarkCount'] = X.Description.str.count(',').astype(float)
X['collinCount'] = X.Description.str.count(':').astype(float)
X['semiCollinCount'] = X.Description.str.count(';').astype(float)
X['percentMarkCount'] = X.Description.str.count('%').astype(float)
X['dollarSignCount'] = X.Description.str.count('$').astype(float)
X['hashCount'] = X.Description.str.count('#').astype(float)
X['starCount'] = X.Description.str.count('\*').astype(float)
X['atCount'] = X.Description.str.count('@').astype(float)
X['percentCapital'] = (X.Description.str.findall(r'[A-Z]').str.len().fillna(0)/X.Description.str.len().fillna(1)).fillna(0)
X['percentlowercase'] = (X.Description.str.findall(r'[a-z]').str.len().fillna(0)/X.Description.str.len().fillna(1)).fillna(0)
X['percentnumbers'] = (X.Description.str.findall(r'[0-9]').str.len().fillna(0)/X.Description.str.len().fillna(1)).fillna(0)
X['percentother'] = (1 - X['percentCapital'] - X['percentlowercase'] - X['percentnumbers']).fillna(0)

In [15]:
# use word2vec from Google News / Twitter 
w = models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

### (Explain Here)

In [16]:

texts = [[token for token in doc.translate(str.maketrans('', '', string.punctuation)).lower().split()]
               for doc in (X['Description']).astype(str)]

texts_final = []

for i in range(len(texts)):
    doc_final = []
    for j in range(len(texts[i])):
            if texts[i][j] in w:
                doc_final.append(texts[i][j])    
    if len(doc_final) < 1:
        texts_final.append(['NA'])
    else:
        texts_final.append(doc_final)
        
embedding = np.vstack([np.mean(w[doc], axis=0) for doc in texts_final])

for i in range(len(embedding[0])):
    X['embedding_' + str(i)] = embedding[:,i]

In [17]:
#creating dummies for type column
X = pd.get_dummies(X, columns = ["Type"])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X.loc[X.data_type == "training"].drop("data_type", axis = 1),
                                                    Y,
                                                    random_state = 23)


#count vectorizor
vect = TfidfVectorizer()
X_train_sparse = vect.fit_transform(X_train.Description)
X_test_sparse = vect.transform(X_test.Description)

X_train.drop(['Description', "Created"], axis = 1, inplace = True)
X_test.drop(['Description', "Created"], axis = 1, inplace = True)


for feature in X_train.columns:
    X_train_sparse = hstack((X_train_sparse, np.array(X_train[feature]).reshape(-1,1)))
    X_test_sparse = hstack((X_test_sparse, np.array(X_test[feature]).reshape(-1,1)))

## Fitting / Testing Model

For our model selection, we will use MAPE as our scoring system and look over both linear (LR, Lasso, etc.) and non-linear models (RandomForest, XGB). 

In [48]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# custom MAPE scorer for sklearn
def MAPE(y, y_pred, **kwargs):
    return sum(abs((y - y_pred) / y))/len(y)

mape_scorer = make_scorer(MAPE, greater_is_better=True)

In [51]:
models = [LinearRegression(),
          Ridge(),
          Lasso(),
          RandomForestRegressor(),
          XGBRegressor()
         ]

for mdl in models: 
    print(type(mdl).__name__)
    score = cross_val_score(mdl, X_train_sparse, y_train, 
                          n_jobs=-1, scoring=mape_scorer, cv=5)
    
    print("MAPE Scores: ", score)
    print("Mean MAPE: ", np.mean(score))
    print("\n===========================\n")

LinearRegression
MAPE Scores:  [0.14578238 0.13417232 0.12814485 0.1390726  0.14465127]
Mean MAPE:  0.13836468461528803


Ridge
MAPE Scores:  [0.32576615 0.32560614 0.32537882 0.33895547 0.32731153]
Mean MAPE:  0.32860362232089513


Lasso
MAPE Scores:  [0.06699134 0.07185016 0.06820862 0.06919705 0.0717084 ]
Mean MAPE:  0.06959111156982353


RandomForestRegressor
MAPE Scores:  [0.05341027 0.05438591 0.0521782  0.05324786 0.05483808]
Mean MAPE:  0.05361206496980936


XGBRegressor
MAPE Scores:  [0.0513494  0.05132419 0.04435567 0.04977749 0.04999716]
Mean MAPE:  0.04936078119842423




Our best performing model is XGBRegressor, but Lasso does relatively well, which we can use if we need a more interpretable/simpler model. For our case, let's just use XGBRegressor since we care more of predictive power. Let's tune the XGBoost model for our problem. 

Next, lets evaluate our model on our test set.

In [69]:
xgb = XGBRegressor()
model = xgb.fit(X_train_sparse, y_train, verbose=True)
y_pred = xgb.predict(X_test_sparse)
print("MAPE: ", MAPE(y_test, y_pred)*100, "%")

MAPE:  4.876455355354596 %


## Predicting on Holdout Set

With the best model above, we now predict the holdout set for submission.

In [18]:
#refitting tfidf with full data set
X_hold = X.loc[X.data_type == "hold"].drop("data_type", axis = 1)

X_train_total = X.loc[X.data_type == "training"].drop("data_type", axis = 1)

vect = TfidfVectorizer()
X_train_total_sparse = vect.fit_transform(X_train_total.Description)
X_hold_sparse = vect.transform(X_hold.Description)

X_hold.drop(['Description', "Created"], axis = 1, inplace = True)

for feature in X_train.columns:
    X_train_total_sparse = hstack((X_train_total_sparse, np.array(X_train_total[feature]).reshape(-1,1)))
    X_hold_sparse = hstack((X_hold_sparse, np.array(X_hold[feature]).reshape(-1,1)))

#refitting lasso on full data set    
l = Lasso(alpha = 10).fit(X_train_total_sparse, Y)

#double checking this was done correctly
print(sum(abs((Y - l.predict(X_train_total_sparse)) / Y))/len(Y))


df_hold.drop('data_type', axis = 1, inplace = True)
df_hold.Engagements = l.predict(X_hold_sparse)

0.051544797524904894


In [19]:
df_hold.to_csv("holdout_set_Columbia.csv")