# Topic Modelling

## Libraries

In [59]:
import pandas as pd
import numpy as np
import nltk
import re
import gensim
import pyLDAvis.gensim
import warnings
import csv
from datetime import datetime as dt

from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')
# nltk.download()

## Options

In [33]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))       # Changing the cell widths

pd.options.display.max_rows = 30                                            # Setting the max number of rows
pd.options.display.max_columns = 50                                         # Setting the max number of columns

pyLDAvis.enable_notebook()

## Variables

In [34]:
path = '/Users/rebecca/IdeaProjects/DataKindUKSeptember/WeFarm/data/'    # Data Directory 
out = '/Users/rebecca/IdeaProjects/DataKindUKSeptember/WeFarm/data/Outputs/'  # Output Directory
msg = 'messages.csv'                       # Input Dataset

sample_size = 5000
topics = 10

## Import

In [35]:
df_msg_in = pd.read_csv(path + msg)
df_msg_en = df_msg_in[(df_msg_in['language'] == 'EN')] 

## Functions

In [60]:
class Message(object):
    def __init__(self, thread_id, date_time, message_id, user_id, language, msg_type, msg_body):
        self.thread_id = thread_id
        self.date_time = date_time
        self.message_id = message_id
        self.user_id = user_id
        self.language = language
        self.msg_type = msg_type
        self.msg_body = msg_body
        
    
class User(object):
    def __init__(self):
        pass

def loadData(f):
    users = {}
    messages = []
    with open(f, "r") as data:
        # "thread_id","date_time","message_id","user_id","language","type","body"
        # datetime format: 2015-02-09 14:27:05
        reader = csv.DictReader(data)
        for row in reader:
            user_id = row["user_id"]
            message = Message(row["thread_id"],
                              dt.strptime(row["date_time"], "%Y-%m-%d %H:%M:%S"),
                              row["message_id"],
                              row["user_id"],
                              row["language"],
                              row["type"],
                              row["body"])
            if user_id not in users:
                users[user_id] = []
            users[user_id].append(message)
            messages.append(message)
    return users, messages

def getMessageGroups(messages, grouper):
    groupedMessages = {}
    for message in messages:
        messageGroup = getattr(message, grouper)
        if messageGroup not in groupedMessages:
            groupedMessages[messageGroup] = []
        groupedMessages[messageGroup].append(message)
    return groupedMessages
        

def cleaner(row):
    '''Function to clean the text data and prep for further analysis'''
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    text = row['body'].lower()                   # Converts to lower case
    text = re.sub("[^a-zA-Z]"," ",text)          # Removes punctuation
    text = re.sub("cyclist","cycl",text)         # Manual intervention for 'cyclist'
    text = text.split()                          # Splits the data into individual words 
    text = [w for w in text if not w in stops]   # Removes stopwords
    text = [p_stemmer.stem(i) for i in text]     # Stemming (reducing words to their root)
    return text

def model(data):
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    data_dict = corpora.Dictionary(data)                       # Creates an id <-> term dictionary
    data_corpus = [data_dict.doc2bow(text) for text in data]     # convert tokenized documents into a document-term matrix
    data_model = gensim.models.ldamodel.LdaModel(data_corpus, 
                                                   num_topics=topics, 
                                                   id2word = data_dict,
                                                   passes=20)        #  generate LDA model

    data_vis = pyLDAvis.gensim.prepare(data_model, data_corpus, data_dict)        # Visualise LDA Model
    pyLDAvis.save_html(data=data_vis,
                       fileobj=out + 'Data_vis.html')
    data_vis
    return data_model, data_corpus


In [61]:
users, messages = loadData(path + msg)
grouped_messages =  getMessageGroups(messages, 'user_id')
for k, v in grouped_messages.items():
    print(k, len(v))

15169 6
10067 1
34228 20
31519 9
39206 6
55563 2
59953 2
17255 2
37892 1
37083 2
48630 1
31778 2
34313 5
50018 2
21884 9
22537 1
55437 1
49832 6
21005 4
23676 4
27096 12
7448 2
17699 19
2238 3
11692 5
19673 2
23437 1
31829 5
12773 8
23256 8
32073 14
47056 2
15645 7
30733 2
21894 5
20272 3
17457 22
52583 1
29946 1
42163 11
34440 2
41362 19
30758 3
21989 1
20142 6
29970 1
20556 5
1662 41
33953 4
8436 1
27436 1
13208 7
39011 4
43131 5
17347 4
26349 2
18959 1
1875 7
21262 7
11144 4
28462 1
49593 1
25231 1
40561 6
23417 3
11190 5
14638 12
33652 2
56385 2
14986 17
14312 1
42532 1
40177 17
27153 4
31915 5
26263 21
49566 23
8590 1
47218 2
2146 15
57210 10
4546 5
49151 2
14170 1
6635 12
57990 3
33961 2
11878 7
28007 8
54746 1
46006 10
36526 1
38514 5
1840 2
37440 5
3588 24
8559 5
51376 13
41595 1
28817 6
6457 2
39974 16
55924 2
8573 16
47064 4
15257 1
61948 1
1587 4
13351 2
30847 2
4797 2
2960 3
29914 2
43440 1
23714 12
19129 1
6440 3
10095 1
54367 10
20540 4
58722 1
35781 41
58712 1
7797 7
355

In [22]:
df_msg_en['Clean'] = df_msg_en.apply(cleaner,axis=1)

df_msg_en_q = df_msg_en[(df_msg_en['type'] == 'Q')]
df_msg_en_a = df_msg_en[(df_msg_en['type'] == 'A')]

li_msg_en_q = df_msg_en_q['Clean'].tolist()
li_msg_en_a = df_msg_en_a['Clean'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [26]:
sample = li_msg_en_q[:sample_size]
data_model, data_corpus = model(sample)

In [28]:
data_model.get_document_topics(data_corpus[0])

[(0, 0.016670788909897472),
 (1, 0.016666666711536098),
 (2, 0.016666666743358458),
 (3, 0.63423532956555939),
 (4, 0.23242515216228357),
 (5, 0.0166666667307739),
 (6, 0.016666666708361956),
 (7, 0.016666805630248211),
 (8, 0.016666666739049128),
 (9, 0.016668590098931943)]