In [None]:
# Import necessary packages:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import numpy as np 
import re
import pandas as pd
import csv
import datetime
import pytz
from textblob import TextBlob
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
nltk.download('wordnet')

import gensim

import guidedlda

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import warnings
warnings.simplefilter("ignore")
warnings.warn("deprecated", DeprecationWarning)
warnings.simplefilter("ignore")

## Data Preprocessing and Cleaning

In [None]:
# Load the raw export and derive the date-based columns (time, hour, month).

# Loading the dataset as a dataframe (replace "newdf.csv" with the name of your own csv file):
df = pd.read_csv("newdf.csv")

# Renaming the export headers to the short column names used in the rest of the notebook:
df = df.rename(columns={'Subject Line': 'subject', 'Sender': 'from', 'Date': 'date'})

# Parse every date on its own so a single malformed value becomes NaT
# without influencing how the other rows are interpreted.
df['date'] = df['date'].apply(lambda x: pd.to_datetime(x, errors='coerce', utc=True))

# Drop rows whose date could not be parsed. The .copy() makes the filtered frame
# independent, so the column assignments below never write into a slice.
df = df[df['date'].notna()].copy()

# Guarantee a datetime dtype for the .dt accessors below. No `format=` is given:
# the values are already parsed at this point, so a format string has no effect.
df['date'] = pd.to_datetime(df['date'], utc=True)

# 'time' is the HH:MM:SS clock time (UTC); 'hour' is its leading two-digit group.
time_pattern = '[0-9]{2}'
df['time'] = df['date'].dt.strftime("%H:%M:%S")
df['hour'] = df['time'].str.findall(time_pattern).str[0]
df['month'] = df['date'].dt.month

# Removing Export Error:
def remove_subject(regex, data):
    """Return the rows of ``data`` whose ``subject`` does not match ``regex``.

    Matching uses ``Series.str.match``, i.e. the pattern is applied from the
    start of each subject. Rows whose subject is missing (or is not a string)
    are dropped as well, because they cannot be tested against the pattern.

    Parameters
    ----------
    regex : str
        Regular expression identifying the subjects to remove.
    data : pandas.DataFrame
        Frame with a ``subject`` column.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels and order.
    """
    matched = data.subject.str.match(regex)
    # `== False` rather than `~matched` is deliberate: a missing subject yields
    # NaN/NA from str.match, and only an explicit False may keep a row.
    keep = (matched == False) & data.subject.notna()  # noqa: E712
    # Select by position (boolean mask) instead of by index label, so frames
    # with duplicated labels do not get their removed rows pulled back in.
    return data.loc[keep.to_numpy(dtype=bool), :]
# Drop rows whose subject is the spreadsheet export error marker "#ERROR!".
# Raw string: '\#' and '\!' are not valid Python string escapes.
error_regex = r'\#ERROR\!'
without_e = remove_subject(error_regex, df)
df = without_e

# Drop rows whose subject contains "UTF" or "utf" anywhere.
UTF_pattern = '.*(UTF).*'
utf_pattern = '.*(utf).*'
df = remove_subject(UTF_pattern, df)
df = remove_subject(utf_pattern, df)

# Eliminating duplicate subject lines:
df = df.drop_duplicates(subset=['subject'])

# Accumulating links and subjects: store any leading link in 'links', then strip it from the subject.
# NOTE(review): the pattern is anchored with ^, so only a URL at the very start of a subject is
# captured, and the rest of that line is removed together with it - confirm this is intended.
link_regex = r'^https?:\/\/.*[\r\n]*'
df['links'] = [re.findall(link_regex, i) for i in df['subject']]
df['subject'] = df['subject'].replace(to_replace=link_regex, value='', regex=True)

# Removing user tags (@name).
# The pattern used to be written as '/(@\S*)/'; Python has no /.../ regex delimiters,
# so the slashes were matched literally and no tag was ever removed.
df['subject'] = df['subject'].replace(to_replace=r'@\S*', value='', regex=True)

# Removing hashtags (#topic); same delimiter fix as for user tags.
df['subject'] = df['subject'].replace(to_replace=r'#\S*', value='', regex=True)

# Removing numbers (currently disabled):
# df['subject'] = df['subject'].replace(to_replace=r'\d+', value='', regex = True)

# Determining polarity and subjectivity of every cleaned subject line with TextBlob:
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity
df['polarity'] = df['subject'].apply(pol)
df['subjectivity'] = df['subject'].apply(sub)

# Display purposes:
df.head(5)

In [None]:
# English stop words to exclude from the vocabulary (very common words that carry
# no topical or emotional signal).
# The stopwords corpus has to be present locally before it can be read; only
# 'wordnet' is downloaded in the import cell, so fetch 'stopwords' here.
nltk.download('stopwords', quiet=True)
stop_words = list(stopwords.words("english"))

In [None]:
# Integer-encode the subject lines with the Keras Tokenizer (no GloVe vectors are
# involved here): fit a word index on the subjects, keeping at most 5000 words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df.subject.values)
words_to_index = tokenizer.word_index

# Padding sequences: every subject becomes a fixed-length row of 20 word ids,
# padded at the end when shorter.
sequences = tokenizer.texts_to_sequences(df.subject.values)
# Stored under its own name: `X` is the count matrix built in the next cell and fed
# to GuidedLDA, and reusing that name here made out-of-order runs fit the wrong matrix.
X_padded = pad_sequences(sequences, padding='post', maxlen=20)

# Display purposes:
print(X_padded.shape)

In [None]:
# Bag-of-words features: count every 1- to 4-word n-gram in each subject line,
# ignoring English stop words.
# The Keras tokenizer is intentionally not passed in: CountVectorizer's first
# parameter is `input`, not a tokenizer, and recent scikit-learn releases accept
# keyword arguments only, so the former positional argument raised a TypeError.
token_vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1, 4))
X = token_vectorizer.fit_transform(df.subject)

In [None]:
# Data validation purposes: inspect the feature row of the first subject line
# (row 0 of the matrix returned by fit_transform on df.subject).
X[0,:]

In [None]:
# Vocabulary learned by the vectorizer, listed in the column order of X.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever the installed version provides.
if hasattr(token_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(token_vectorizer.get_feature_names_out())
else:
    tf_feature_names = token_vectorizer.get_feature_names()

# Lookup from each term (word or n-gram) to its column index in X:
word2id = {term: idx for idx, term in enumerate(tf_feature_names)}

In [None]:
# Preview the first ten (term, column index) pairs of the vocabulary lookup:
[entry for entry, _ in zip(word2id.items(), range(10))]

## GuidedLDA Modeling

In [None]:
# Seed words for GuidedLDA: one hand-picked list of distinguishing words/phrases per
# targeted marketing emotion.
# - Multi-word seeds (e.g. 'save big') can only be found in the vocabulary as n-gram
#   features; the vectorizer is configured with ngram_range=(1, 4).
# - Some seeds appear under more than one emotion (e.g. 'sale' under enthusiasm and
#   surprise, 'chance' under urgency and curiosity). The seeding loop stores a single
#   topic id per word, so such a seed ends up assigned to the last list that contains it.

# Seeds for the 'enthusiasm' topic:
enthusiasm = ['great', 'happy', 'big', 'cheer', 'cool', 'celebrate', 'bright', 'award', 'awesome', 'beautiful', 
              'beauty', 'best', 'birthday', 'boost', 'amaze', 'anniversary', 'celebration', 'delicious', 'enjoy',
             'excite', 'favorite', 'fun', 'game', 'games', 'happy', 'huge', 'joy', 'love', 'massive', 'mega', 'open', 
              'opening','sale', 'save big', 'say hello', 'sale start', 'shop', 'shopping', 'super', 'super sale', 
              'spring','summer', 'summer sale', 'gift', 'awarded', 'grand', 'smile', 'vacation', 'awesome', 
              'gifts', 'spring', 'swag', 'rewards', 'magic', 'shipped', 'breakthrough', 'motivate', 'goals',
             'inspiration', 'travel', 'confidence', 'cute', 'adore']

# Seeds for the 'urgency' topic:
urgency = ['miss', 'time', 'action', 'apply', 'attention', 'chance', 'act fast', 'almost', 'clock', 'close',
          'countdown', 'date', 'day', 'deadline', 'due', 'end', 'ends', 'expire', 'expires', 'final', 'finish', 'go',
          'hour', 'hr', 'hurry', 'last', 'late', 'left', 'must', 'one day', 'require', 'sale end','sale last', 
           'save','soon', 'still', 'still time', 'soon', 'speed', 'time', 'today','tomorrow', 'get', 'ahead', 
          'verify', 'gone', 'openings', 'waiting', 'now']

# Seeds for the 'surprise' topic:
surprise = ['surprise', 'alert', 'early', 'fast', 'faster', 'finally', 'flash', 'forget', 'gift',
           'heard', 'invitation', 'invite', 'launch', 'look', 'brand new', 'sale', 'surprise', 'unlimited', 
            'unlimited access', 'upcoming', 'update', 'upgrade', 'drop', 'announcement', 'venture', 'invited',
           'bang']

# Seeds for the 'trust' topic:
trust = ['confirm', 'advice', 'ask', 'comfort', 'comfy', 'contact', 'cozy', 'control', 'daily', 'annual',
        'everyone', 'everyday', 'expert', 'fact', 'family', 'home', 'information', 'info', 'instructor',
        'law', 'local', 'match', 'message','secures','security','scholarship', 'school', 'science', 
         'scientist', 'stats', 'subscription', 'summary', 'support', 'technology', 'thank', 'community', 'reasons', 
        'true', 'recommend', 'data', 'control', 'understanding', 'science', 'popular', 'assistant', 'guide',
        'well', 'mental', 'therapy', 'with']

# Seeds for the 'curiosity' topic:
curiosity = ['new', 'try', 'already', 'awaits', 'brand new', 'challenge', 'click', 'complete', 'activate',
            'different', 'easy', 'enter', 'explore', 'find', 'help', 'idea', 'important', 'inside', 'learn',
            'listen', 'mystery', 'opportunity', 'sale item', 'see', 'see new', 'see new post', 'skill', 'start',
            'start new', 'start today', 'qualify', 'surprising', 'chance', 'added', 'release', 'releases', 'update',
            'updates', 'arrived', 'celeb', 'celebrity', 'affair', 'affordable', 'introduce', 'introduces', 
            'anomaly', 'hot', 'look']

# Seeds for the 'greed' topic:
greed = ['free', 'save', 'almost', 'benefit', 'budget', 'claim', 'could win', 'beyond', 'certificate', 'demand',
        'double', 'extra', 'future', 'limited', 'loss', 'lose', 'max', 'money', 'one get','one get one', 'saving',
        'stock', 'win', 'winner', 'bonus', 'points', 'free', 'million', 'billion', 'skills', 'skill', 'tools', 
        'strategy', 'strategies']

# Seeds for the 'exclusivity' topic:
exclusivity = ['special', 'available', 'bonus', 'come back', 'choose', 'choice', 'deserve', 'exclusive', 'honor',
              'individual', 'meet', 'member', 'mood', 'offer', 'perfect','personal', 'premiere', 'premium', 
               'recommend', 'recommendation', 'recommends', 'secret', 'special offer', 'specialization',  
               'select', 'self', 'together', 'vip', 'welcome', 'join', 'reservation', 'waiver', 'appreciate', 
              'appreciation', 'first']

# Keep only the seeds from the original lists that occur in the vectorizer vocabulary;
# a seed without a word2id entry cannot be mapped to a column of X.
# Membership is tested on the dict itself (hash lookup) rather than on a list rebuilt
# from its keys for every single seed.
enthusiasm = [x for x in enthusiasm if x in word2id]
urgency = [x for x in urgency if x in word2id]
surprise = [x for x in surprise if x in word2id]
trust = [x for x in trust if x in word2id]
curiosity = [x for x in curiosity if x in word2id]
greed = [x for x in greed if x in word2id]
exclusivity = [x for x in exclusivity if x in word2id]

# Topics (targeted marketing emotions) and their seed lists.
# The position of a list in seed_topic_list is the topic id its seeds are assigned to,
# so the order must match `topics` exactly. `greed` was previously missing from this
# list, which shifted the exclusivity seeds onto the topic labelled 'greed'.
# 'other' is the final, unseeded topic.
seed_topic_list = [enthusiasm, urgency, surprise, trust, curiosity, greed, exclusivity]
topics = ['enthusiasm', 'urgency', 'surprise', 'trust', 'curiosity', 'greed', 'exclusivity', 'other']
assert len(seed_topic_list) == len(topics) - 1, "every topic except 'other' needs a seed list"

In [None]:
# Build and fit the GuidedLDA model.

# Seed mapping handed to the model: {vocabulary column index: topic id}. The topic id
# is the position of the seed list inside seed_topic_list; when a word occurs in
# several lists, the last one processed determines its topic.
seed_topics = {
    word2id[word]: t_id
    for t_id, st in enumerate(seed_topic_list)
    for word in st
}

model = guidedlda.GuidedLDA(n_topics=8, n_iter=100, random_state=7, refresh=10)
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

In [None]:
# Table of the n highest-weighted vocabulary terms for every emotion category:
# one column per topic label, terms ordered from highest to lowest weight.
n_top_words = 15
topic_word = model.topic_word_
vocabulary = np.array(tf_feature_names)
df_top_words = pd.DataFrame({
    # argsort is ascending, so reverse it and keep the first n_top_words indices
    topics[i]: vocabulary[np.argsort(topic_dist)[::-1][:n_top_words]]
    for i, topic_dist in enumerate(topic_word)
})

In [None]:
# Display the top-words table: one column per topic label, one row per rank
# (highest-weighted term first).
df_top_words

In [None]:
# Create the numeric matrix of topic weights returned by the fitted model for X.
# The notebook treats each row as the emotional intensities of one subject line
# (rows are later paired with df['subject'], columns with the topic labels).
int_values = model.transform(X)
int_values

In [None]:
# Wrap the intensity matrix in a dataframe whose columns carry the topic labels
# (label order = topic id order used when fitting the model).
topics = [
    'enthusiasm',
    'urgency',
    'surprise',
    'trust',
    'curiosity',
    'greed',
    'exclusivity',
    'other',
]
df_topic_int = pd.DataFrame(data=int_values, columns=topics)
df_topic_int

In [None]:
# Label every subject line with its highest-scoring topic, and add a 'sum' column to
# check that the topic weights of each row add up to 1.
# The topic columns are selected explicitly so the cell can be re-run: once
# 'dominant topic' and 'sum' exist, df_topic_int.values would include them as well.
topic_scores = df_topic_int[topics]
dominant_topic_num = np.argmax(topic_scores.values, axis=1)
dominant_topic_lab = [topics[i] for i in dominant_topic_num]
df_topic_int['dominant topic'] = dominant_topic_lab
df_topic_int['sum'] = topic_scores.sum(axis=1)

In [None]:
# Rows that did not add up to 1 were observed to add up to 0.08, so list any such rows.
# The sums are floating-point values: compare with a tolerance, because an exact
# `== 0.08` silently misses results such as 0.07999999999999999.
df_topic_int.loc[np.isclose(df_topic_int['sum'], 0.08)]

In [None]:
# Attach the cleaned subject text to each row of topic scores.
# The values are assigned as a plain array, i.e. by position, which relies on
# df and df_topic_int having their rows in the same order.
subject_lines = df['subject'].to_numpy()
df_topic_int['subject lines'] = subject_lines

In [None]:
# Show full cell contents (subject lines) instead of truncating them.
# None means "no limit" (pandas >= 1.0); the former -1 sentinel is deprecated
# and is rejected by newer pandas releases.
pd.set_option("display.max_colwidth", None)

In [None]:
# Displaying the results: the first ten rows of the per-subject topic scores.
df_topic_int.head(10)

In [None]:
# Topic-keyword matrix built from model.components_, labelled with the topic names
# (rows) and the vocabulary terms (columns).
# The terms come from tf_feature_names, the same list used for the top-words table,
# instead of calling token_vectorizer.get_feature_names() again (that method was
# removed in scikit-learn 1.2).
df_topic_keywords = pd.DataFrame(
    model.components_,
    index=topics,
    columns=tf_feature_names,
)

# View with one row per term and one column per topic:
df_topic_keywords = df_topic_keywords.transpose()
df_topic_keywords