# Loading Data and Packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim import corpora
from gensim import models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import numpy as np
import string
from scipy.sparse import hstack
from scipy.sparse import vstack
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df_train = pd.read_csv('training_set.csv',
                       encoding = 'latin-1',
                       parse_dates = ['Created'])

df_hold = pd.read_csv('holdout_set.csv',
                      encoding = 'latin-1',
                      parse_dates = ['Created'])

In [4]:
X = df_train.drop('Engagements', axis = 1)
X['data_type'] = "training"
df_hold['data_type'] = "hold"
X = X.append(df_hold.drop('Engagements', axis = 1))

Y = df_train['Engagements']

# Creating Features

## Time Series - Month Seasonality with Trend

In [5]:
#doing this to be able to put this into linear regression
X['month'] = X.Created.apply(lambda x: x.month) #seasonal term
X['year_month'] = X.Created.apply(lambda x: x.month + x.year * 12) #trend term

## Time Series - Hourly with day of Week and Trend

In [6]:
#doing this to be able to put this into linear regression
X['hour'] = X.Created.apply(lambda x: x.hour) #seasonal term
X['weekend'] = X.Created.apply(lambda x: int(x.dayofweek >= 5)) #seasonal term
X['weekend_hour_interaction'] = X.Created.apply(lambda x: int(x.dayofweek >= 5) * x.hour) #seasonal term
X['weekday_hour_interaction'] = X.Created.apply(lambda x: int(x.dayofweek < 5) * x.hour) #seasonal term


In [7]:
#adding features
X['day_of_week'] = X.Created.apply(lambda x: x.dayofweek)
X = pd.get_dummies(X, columns = ["day_of_week"])

In [8]:
###just one hot encoding everything (not commented out)
X = pd.get_dummies(X, columns = ['hour', 'weekend', 'weekend_hour_interaction', 'weekday_hour_interaction', 'month'])

## Getting Features from text:

In [9]:
#filling NA with empty text
X.Description.fillna("", inplace = True)

In [10]:
#copied from AML, feel free to add your AML stuff

X['containsLink'] = X.Description.str.contains('.http').astype(float)
X['exclamationPointCount'] =X.Description.str.count('!').astype(float)
X['questionMarkCount'] = X.Description.str.count('\?').astype(float)
X['doubleQuotationMarkCount'] = X.Description.str.count('\"').astype(float)
X['singleQuoteMarkCount'] = X.Description.str.count('\'').astype(float)
X['commaMarkCount'] = X.Description.str.count(',').astype(float)
X['collinCount'] = X.Description.str.count(':').astype(float)
X['semiCollinCount'] = X.Description.str.count(';').astype(float)
X['percentMarkCount'] = X.Description.str.count('%').astype(float)
X['dollarSignCount'] = X.Description.str.count('$').astype(float)
X['hashCount'] = X.Description.str.count('#').astype(float)
X['starCount'] = X.Description.str.count('\*').astype(float)
X['atCount'] = X.Description.str.count('@').astype(float)
X['percentCapital'] = (X.Description.str.findall(r'[A-Z]').str.len().fillna(0)/X.Description.str.len().fillna(1)).fillna(0)
X['percentlowercase'] = (X.Description.str.findall(r'[a-z]').str.len().fillna(0)/X.Description.str.len().fillna(1)).fillna(0)
X['percentnumbers'] = (X.Description.str.findall(r'[0-9]').str.len().fillna(0)/X.Description.str.len().fillna(1)).fillna(0)
X['percentother'] = (1 - X['percentCapital'] - X['percentlowercase'] - X['percentnumbers']).fillna(0)

In [11]:
#loading data from Google

#perhaps use one better for twitter data
w = models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

In [12]:
texts = [[token for token in doc.translate(str.maketrans('', '', string.punctuation)).lower().split()]
               for doc in (X['Description']).astype(str)]

texts_final = []

for i in range(len(texts)):
    doc_final = []
    for j in range(len(texts[i])):
            if texts[i][j] in w:
                doc_final.append(texts[i][j])    
    if len(doc_final) < 1:
        texts_final.append(['NA'])
    else:
        texts_final.append(doc_final)
        
embedding = np.vstack([np.mean(w[doc], axis=0) for doc in texts_final])

for i in range(len(embedding[0])):
    X['embedding_' + str(i)] = embedding[:,i]

In [13]:
#creating dummies for type column
X = pd.get_dummies(X, columns = ["Type"])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X.loc[X.data_type == "training"].drop("data_type", axis = 1),
                                                    Y,
                                                    random_state = 23)


#count vectorizor
vect = TfidfVectorizer()
X_train_sparse = vect.fit_transform(X_train.Description)
X_test_sparse = vect.transform(X_test.Description)

X_train.drop(['Description', "Created"], axis = 1, inplace = True)
X_test.drop(['Description', "Created"], axis = 1, inplace = True)


for feature in X_train.columns:
    X_train_sparse = hstack((X_train_sparse, np.array(X_train[feature]).reshape(-1,1)))
    X_test_sparse = hstack((X_test_sparse, np.array(X_test[feature]).reshape(-1,1)))

## Fitting / Testing Model

In [15]:
lr = LinearRegression().fit(X_train_sparse, y_train)
l = Lasso().fit(X_train_sparse, y_train)
r = Ridge().fit(X_train_sparse, y_train)
en = ElasticNet().fit(X_train_sparse, y_train)

In [16]:
MAPE_lr = sum(abs((y_test - lr.predict(X_test_sparse)) / y_test))/len(y_test)
MAPE_l = sum(abs((y_test - l.predict(X_test_sparse)) / y_test))/len(y_test)
MAPE_r = sum(abs((y_test - r.predict(X_test_sparse)) / y_test))/len(y_test)
MAPE_en = sum(abs((y_test - en.predict(X_test_sparse)) / y_test))/len(y_test)
print(MAPE_lr)
print(MAPE_l)
print(MAPE_r)
print(MAPE_en)

#lasso gives best score

0.1401443974547446
0.06669218817417247
0.31705360523026477
0.19723597265100756


In [17]:
#cant find grid search for MAPE, doing it manually:

for vals in [0.001, 0.01, 0.1, 1, 10, 100]:
    Lasso_grid = Lasso(alpha = vals).fit(X_train_sparse, y_train)
    print(str(vals) + ":")
    print(sum(abs((y_test - Lasso_grid.predict(X_test_sparse)) / y_test))/len(y_test))
    print('\n')

0.001:
0.10662748189101248


0.01:
0.10463546937509147


0.1:
0.08964899447555166


1:
0.06669218817417247


10:
0.05746811417025286


100:
0.06245284724795054




## Predicting on Holdout Set

In [18]:
#refitting tfidf with full data set
X_hold = X.loc[X.data_type == "hold"].drop("data_type", axis = 1)

X_train_total = X.loc[X.data_type == "training"].drop("data_type", axis = 1)

vect = TfidfVectorizer()
X_train_total_sparse = vect.fit_transform(X_train_total.Description)
X_hold_sparse = vect.transform(X_hold.Description)

X_hold.drop(['Description', "Created"], axis = 1, inplace = True)

for feature in X_train.columns:
    X_train_total_sparse = hstack((X_train_total_sparse, np.array(X_train_total[feature]).reshape(-1,1)))
    X_hold_sparse = hstack((X_hold_sparse, np.array(X_hold[feature]).reshape(-1,1)))

#refitting lasso on full data set    
l = Lasso(alpha = 10).fit(X_train_total_sparse, Y)

#double checking this was done correctly
print(sum(abs((Y - l.predict(X_train_total_sparse)) / Y))/len(Y))


df_hold.drop('data_type', axis = 1, inplace = True)
df_hold.Engagements = l.predict(X_hold_sparse)

0.051544797524904894


In [19]:
df_hold.to_csv("holdout_set_Columbia.csv")