# Topic Modelling

## Libraries

In [27]:
import pandas as pd
import numpy as np
import nltk
import re
import gensim
#import pyLDAvis.gensim
import warnings
import csv
from datetime import datetime as dt
from sklearn.externals import joblib

from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')
# nltk.download()

## Options

In [28]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))       # Changing the cell widths

pd.options.display.max_rows = 30                                            # Setting the max number of rows
pd.options.display.max_columns = 50                                         # Setting the max number of columns

#pyLDAvis.enable_notebook()

# set random seed
random_seed = 135
state = np.random.RandomState(random_seed)

## Variables

In [29]:
path = '../data/'    # Data Directory 
out = 'Outputs/'  # Output Directory
msg = 'messages.csv'                       # Input Dataset

sample_size = 5000
topics = 10

## Import

In [30]:
df_msg_in = pd.read_csv(path + msg)
df_msg_en = df_msg_in[(df_msg_in['language'] == 'EN')] 

## Functions

In [31]:
class Message(object):
    def __init__(self, thread_id, date_time, message_id, user_id, language, msg_type, msg_body):
        self.thread_id = thread_id
        self.date_time = date_time
        self.message_id = message_id
        self.user_id = user_id
        self.language = language
        self.msg_type = msg_type
        self.msg_body = msg_body
        
    
class User(object):
    def __init__(self):
        pass

def loadData(f):
    users = {}
    messages = []
    with open(f, "r") as data:
        # "thread_id","date_time","message_id","user_id","language","type","body"
        # datetime format: 2015-02-09 14:27:05
        reader = csv.DictReader(data)
        for row in reader:
            user_id = row["user_id"]
            message = Message(row["thread_id"],
                              dt.strptime(row["date_time"], "%Y-%m-%d %H:%M:%S"),
                              row["message_id"],
                              row["user_id"],
                              row["language"],
                              row["type"],
                              row["body"])
            if user_id not in users:
                users[user_id] = []
            users[user_id].append(message)
            messages.append(message)
    return users, messages

def getMessageGroups(messages, grouper):
    groupedMessages = {}
    for message in messages:
        messageGroup = getattr(message, grouper)
        if messageGroup not in groupedMessages:
            groupedMessages[messageGroup] = []
        groupedMessages[messageGroup].append(message)
    return groupedMessages
        

def cleaner(row):
    '''Function to clean the text data and prep for further analysis'''
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    text = row['body'].lower()                   # Converts to lower case
    text = re.sub("[^a-zA-Z]"," ",text)          # Removes punctuation
    text = re.sub("cyclist","cycl",text)         # Manual intervention for 'cyclist'
    text = text.split()                          # Splits the data into individual words 
    text = [w for w in text if not w in stops]   # Removes stopwords
    text = [p_stemmer.stem(i) for i in text]     # Stemming (reducing words to their root)
    return text


def messages_vectorizer(messages):
    '''Function to take a message object and convert it to a list of terms'''
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    text = ''
    for m in messages:
        text = text + ' ' + m.msg_body.lower()          # Converts to lower case
    text = re.sub("[^a-zA-Z]"," ",text)
    text = text.split()                          # Splits the data into individual words 
    text = [w for w in text if not w in stops]   # Removes stopwords
    text = [p_stemmer.stem(i) for i in text]     # Stemming (reducing words to their root)
    return text

def model(data, state):
    data_dict = corpora.Dictionary(data)                       # Creates an id <-> term dictionary
    data_corpus = [data_dict.doc2bow(text) for text in data]     # convert tokenized documents into a document-term matrix
    data_model = gensim.models.ldamodel.LdaModel(data_corpus, 
                                                   num_topics=topics, 
                                                   id2word = data_dict,
                                                   passes=20,
                                                   random_state=state)        #  generate LDA model

    #data_vis = pyLDAvis.gensim.prepare(data_model, data_corpus, data_dict)        # Visualise LDA Model
    #pyLDAvis.save_html(data=data_vis,
    #                    fileobj=out + 'Data_vis.html')
    #data_vis
    return data_model, data_corpus, data_dict


In [32]:
users, messages = loadData(path + msg)
grouped_messages =  getMessageGroups(messages, 'user_id')

In [33]:
grouped_messages.keys()[1]

'58146'

In [34]:
message_count_dict={}

for k,v in grouped_messages.items():
   message_count_dict[k] = len(v) 



import operator
sorted_message_count_dict = sorted(message_count_dict.items(), key=operator.itemgetter(1))
sorted_message_count_dict.reverse()

sorted_message_count_dict

[('33453', 3212),
 ('7759', 536),
 ('28506', 352),
 ('70903', 340),
 ('30628', 313),
 ('4518', 312),
 ('4610', 301),
 ('2027', 297),
 ('2355', 286),
 ('3594', 286),
 ('28502', 259),
 ('7391', 252),
 ('45483', 235),
 ('28455', 234),
 ('4409', 230),
 ('1955', 213),
 ('552', 213),
 ('41656', 202),
 ('33252', 190),
 ('16216', 186),
 ('28706', 181),
 ('4659', 179),
 ('6349', 172),
 ('27335', 170),
 ('6410', 164),
 ('6629', 164),
 ('3230', 163),
 ('3771', 161),
 ('492', 159),
 ('9248', 158),
 ('39397', 143),
 ('3732', 142),
 ('4093', 141),
 ('2871', 138),
 ('30234', 138),
 ('884', 135),
 ('4679', 134),
 ('5679', 131),
 ('4301', 131),
 ('33176', 131),
 ('43353', 128),
 ('5785', 125),
 ('4185', 124),
 ('21115', 123),
 ('42639', 122),
 ('2302', 119),
 ('5451', 118),
 ('28794', 117),
 ('14043', 116),
 ('16813', 116),
 ('3787', 115),
 ('1529', 113),
 ('3448', 112),
 ('2156', 112),
 ('18035', 112),
 ('1291', 112),
 ('8369', 111),
 ('1254', 110),
 ('2120', 109),
 ('4171', 107),
 ('51924', 107),
 ('

In [35]:
grouped_user_data =[]

min_messages = 3
max_messages = 300

grouped_messages_by_user = grouped_messages
for k in grouped_messages.keys():
    v = grouped_messages[k]
    if (len(v)<max_messages and len(v)>min_messages):
        grouped_user_data.append(messages_vectorizer(v))

        


In [36]:
grouped_user_ids=[]
for k in grouped_messages.keys():
    v = grouped_messages[k]
    if (len(v)<max_messages and len(v)>min_messages):
        grouped_user_ids.append(k)

In [37]:
len(grouped_user_ids)

15716

In [39]:
users_topic_model = model(grouped_user_data, state)

In [40]:
the_model = users_topic_model[0]
the_corpus = users_topic_model[1]
word_dict = users_topic_model[2]

the_model.get_document_topics(the_corpus[0])

joblib.dump(users_topic_model, '../Outputs/users_topic_model.pkl' ) 



['../Outputs/users_topic_model.pkl',
 '../Outputs/users_topic_model.pkl_01.npy',
 '../Outputs/users_topic_model.pkl_02.npy',
 '../Outputs/users_topic_model.pkl_03.npy',
 '../Outputs/users_topic_model.pkl_04.npy',
 '../Outputs/users_topic_model.pkl_05.npy',
 '../Outputs/users_topic_model.pkl_06.npy']

In [41]:

users_topic_model = joblib.load('../Outputs/users_topic_model.pkl')
the_model = users_topic_model[0]
the_corpus = users_topic_model[1]
word_dict = users_topic_model[2]


In [105]:
the_model.get_topic_terms(0,topn=20)

[(57, 0.025676843918688675),
 (107, 0.015399708681109186),
 (90, 0.014746814499382579),
 (443, 0.010094611292244492),
 (528, 0.008931766023864798),
 (29, 0.0089088351381121419),
 (485, 0.0083548929822330285),
 (768, 0.0083103290436338881),
 (96, 0.0080724065998098241),
 (45, 0.0077221438541055188),
 (8, 0.0073565709265647128),
 (203, 0.0069324333543020087),
 (109, 0.0063034868075286729),
 (13, 0.0060325985617136653),
 (201, 0.0059108217737852166),
 (625, 0.0054885687897984264),
 (121, 0.005394125290598518),
 (468, 0.0052616319549721978),
 (629, 0.0051970202486548841),
 (53, 0.0049493874865910829)]

In [110]:
with open('../Outputs/topics.csv', 'wb') as csvfile:
    
    mywriter = csv.writer(csvfile, delimiter=',',quotechar='"')
    
    for t in range(10):
        topic_words= ['T'+str(t)]
    
        for k,v in the_model.get_topic_terms(t,topn=25):
            word = word_dict[k]
            topic_words.append(word)
        
        mywriter.writerow(topic_words)
    


In [124]:
with open('../Outputs/users.csv', 'wb') as csvfile:    
    mywriter = csv.writer(csvfile, delimiter=',',quotechar='"')
    
    for i in range(len(grouped_user_ids)):
        user_id= grouped_user_ids[i]
        scores= the_model.get_document_topics(the_corpus[i])
        scores_dict={}
        for score in scores:
            scores_dict[score[0]]=score[1]
        scores_arr=[]
        for j in range(10):
            if j in scores_dict.keys():
                scores_arr.append(scores_dict[j])
            else:
                scores_arr.append(0.0)
        output_arr=[]
        output_arr.append(user_id)
        for s in scores_arr:
            output_arr.append(str(s))
        mywriter.writerow(output_arr)


In [118]:
len(the_corpus)

15716

In [122]:
the_model.get_document_topics(the_corpus[0])

[(0, 0.18597967244251989),
 (3, 0.44721879252540586),
 (5, 0.18151345139411518),
 (7, 0.096410220768994012),
 (9, 0.078102739582906852)]

In [46]:
df_msg_en['Clean'] = df_msg_en.apply(cleaner,axis=1)

df_msg_en_q = df_msg_en[(df_msg_en['type'] == 'Q')]
df_msg_en_a = df_msg_en[(df_msg_en['type'] == 'A')]

li_msg_en_q = df_msg_en_q['Clean'].tolist()
li_msg_en_a = df_msg_en_a['Clean'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [None]:
sample = li_msg_en_q[:sample_size]
data_model, data_corpus = model(sample)

In [None]:
data_model.get_document_topics(data_corpus[0])

In [None]:
# processing a message to make a prediction
users_topic_model = joblib.load('../Outputs/users_topic_model.pkl' ) 

the_model = users_topic_model[0]
the_corpus = users_topic_model[1]
word_dict = users_topic_model[2]

print(the_corpus[0])
test_message = messages[0]
print(test_message.msg_body.lower())
the_model.get_document_topics(the_corpus[0])
test_message = messages_vectorizer([test_message])
print(test_message)
test_vectorized = word_dict.doc2bow(test_message)

the_model.get_document_topics(test_vectorized)

for k, v in the_model.get_topic_terms(4, topn=25):
    word = word_dict[k]
    print(word)

#for message in messages:
#    the_model.get_document_topics("chicken farming feed")
    


In [None]:
with open('../Outputs/message_predictions.csv', 'wb') as csvfile:    
    mywriter = csv.writer(csvfile, delimiter=',',quotechar='"')
    
    for message in messages:
        #self.thread_id = thread_id
        #self.date_time = date_time
        #self.message_id = message_id
        #self.user_id = user_id
        #self.language = language
        #self.msg_type = msg_type
        #self.msg_body = msg_body
        
        message_to_predict = messages_vectorizer([message])
        message_to_predict = word_dict.doc2bow(message_to_predict)

        scores = the_model.get_document_topics(message_to_predict)
        
        scores_dict={}
        for score in scores:
            scores_dict[score[0]]=score[1]
        scores_arr=[]
        for j in range(10):
            if j in scores_dict.keys():
                scores_arr.append(scores_dict[j])
            else:
                scores_arr.append(0.0)
        output_arr=[message.thread_id, dt.strftime(message.date_time, "%Y-%m-%d %H:%M:%S"), message.message_id, message.user_id, message.language, message.msg_type, message.msg_body]
        for s in scores_arr:
            output_arr.append(str(s))
        mywriter.writerow(output_arr)