# Topic Modelling

## Libraries

In [149]:
import pandas as pd
import numpy as np
import nltk
import re
import gensim
#import pyLDAvis.gensim
import warnings
import csv
from datetime import datetime as dt
from sklearn.externals import joblib

from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')
# nltk.download()

## Options

In [150]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))       # Changing the cell widths

pd.options.display.max_rows = 30                                            # Setting the max number of rows
pd.options.display.max_columns = 50                                         # Setting the max number of columns

#pyLDAvis.enable_notebook()

# set random seed
random_seed = 135
state = np.random.RandomState(random_seed)

## Variables

In [151]:
path = '../data/'    # Data Directory 
out = 'Outputs/'  # Output Directory
msg = 'messages.csv'                       # Input Dataset

sample_size = 5000
topics = 10

## Import

In [152]:
df_msg_in = pd.read_csv(path + msg)
df_msg_en = df_msg_in[(df_msg_in['language'] == 'EN')] 

## Functions

In [153]:
class Message(object):
    def __init__(self, thread_id, date_time, message_id, user_id, language, msg_type, msg_body):
        self.thread_id = thread_id
        self.date_time = date_time
        self.message_id = message_id
        self.user_id = user_id
        self.language = language
        self.msg_type = msg_type
        self.msg_body = msg_body
        
    
class User(object):
    def __init__(self):
        pass

def loadData(f):
    users = {}
    messages = []
    with open(f, "r") as data:
        # "thread_id","date_time","message_id","user_id","language","type","body"
        # datetime format: 2015-02-09 14:27:05
        reader = csv.DictReader(data)
        for row in reader:
            user_id = row["user_id"]
            message = Message(row["thread_id"],
                              dt.strptime(row["date_time"], "%Y-%m-%d %H:%M:%S"),
                              row["message_id"],
                              row["user_id"],
                              row["language"],
                              row["type"],
                              row["body"])
            if user_id not in users:
                users[user_id] = []
            users[user_id].append(message)
            messages.append(message)
    return users, messages

def getMessageGroups(messages, grouper):
    groupedMessages = {}
    for message in messages:
        messageGroup = getattr(message, grouper)
        if messageGroup not in groupedMessages:
            groupedMessages[messageGroup] = []
        groupedMessages[messageGroup].append(message)
    return groupedMessages
        

def cleaner(row):
    '''Function to clean the text data and prep for further analysis'''
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    text = row['body'].lower()                   # Converts to lower case
    text = re.sub("[^a-zA-Z]"," ",text)          # Removes punctuation
    text = re.sub("cyclist","cycl",text)         # Manual intervention for 'cyclist'
    text = text.split()                          # Splits the data into individual words 
    text = [w for w in text if not w in stops]   # Removes stopwords
    text = [p_stemmer.stem(i) for i in text]     # Stemming (reducing words to their root)
    return text


def messages_vectorizer(messages):
    '''Function to take a message object and convert it to a list of terms'''
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    text = ''
    for m in messages:
        text = text + ' ' + m.msg_body.lower()          # Converts to lower case
    text = re.sub("[^a-zA-Z]"," ",text)
    text = text.split()                          # Splits the data into individual words 
    text = [w for w in text if not w in stops]   # Removes stopwords
    text = [p_stemmer.stem(i) for i in text]     # Stemming (reducing words to their root)
    return text

def model(data, state):
    data_dict = corpora.Dictionary(data)                       # Creates an id <-> term dictionary
    data_corpus = [data_dict.doc2bow(text) for text in data]     # convert tokenized documents into a document-term matrix
    data_model = gensim.models.ldamodel.LdaModel(data_corpus, 
                                                   num_topics=topics, 
                                                   id2word = data_dict,
                                                   passes=20,
                                                   random_state=state)        #  generate LDA model

    #data_vis = pyLDAvis.gensim.prepare(data_model, data_corpus, data_dict)        # Visualise LDA Model
    #pyLDAvis.save_html(data=data_vis,
    #                    fileobj=out + 'Data_vis.html')
    #data_vis
    return data_model, data_corpus, data_dict


In [154]:
users, messages = loadData(path + msg)
grouped_messages =  getMessageGroups(messages, 'user_id')

[[<__main__.Message at 0x1535f7950>, <__main__.Message at 0x1535f7b10>],
 [<__main__.Message at 0x1540e0150>,
  <__main__.Message at 0x152333990>,
  <__main__.Message at 0x1523339d0>,
  <__main__.Message at 0x16524e110>,
  <__main__.Message at 0x1654add50>,
  <__main__.Message at 0x1654c1fd0>,
  <__main__.Message at 0x1654fc890>,
  <__main__.Message at 0x165510650>,
  <__main__.Message at 0x165510910>,
  <__main__.Message at 0x165510b10>,
  <__main__.Message at 0x167b8bbd0>,
  <__main__.Message at 0x187a107d0>,
  <__main__.Message at 0x187a19610>,
  <__main__.Message at 0x187a19690>,
  <__main__.Message at 0x187a19710>,
  <__main__.Message at 0x187a19750>,
  <__main__.Message at 0x1890aeb10>,
  <__main__.Message at 0x167d63b10>,
  <__main__.Message at 0x167d63c10>,
  <__main__.Message at 0x167d63d50>,
  <__main__.Message at 0x167d75590>,
  <__main__.Message at 0x167d75650>,
  <__main__.Message at 0x167d75f10>,
  <__main__.Message at 0x167dac790>,
  <__main__.Message at 0x167dac810>,
  

In [156]:
message_count_dict={}

for k,v in grouped_messages.items():
   message_count_dict[k] = len(v) 



import operator
sorted_message_count_dict = sorted(message_count_dict.items(), key=operator.itemgetter(1))
sorted_message_count_dict.reverse()

sorted_message_count_dict

[('33453', 3212),
 ('7759', 536),
 ('28506', 352),
 ('70903', 340),
 ('30628', 313),
 ('4518', 312),
 ('4610', 301),
 ('2027', 297),
 ('2355', 286),
 ('3594', 286),
 ('28502', 259),
 ('7391', 252),
 ('45483', 235),
 ('28455', 234),
 ('4409', 230),
 ('1955', 213),
 ('552', 213),
 ('41656', 202),
 ('33252', 190),
 ('16216', 186),
 ('28706', 181),
 ('4659', 179),
 ('6349', 172),
 ('27335', 170),
 ('6410', 164),
 ('6629', 164),
 ('3230', 163),
 ('3771', 161),
 ('492', 159),
 ('9248', 158),
 ('39397', 143),
 ('3732', 142),
 ('4093', 141),
 ('2871', 138),
 ('30234', 138),
 ('884', 135),
 ('4679', 134),
 ('5679', 131),
 ('4301', 131),
 ('33176', 131),
 ('43353', 128),
 ('5785', 125),
 ('4185', 124),
 ('21115', 123),
 ('42639', 122),
 ('2302', 119),
 ('5451', 118),
 ('28794', 117),
 ('14043', 116),
 ('16813', 116),
 ('3787', 115),
 ('1529', 113),
 ('3448', 112),
 ('2156', 112),
 ('18035', 112),
 ('1291', 112),
 ('8369', 111),
 ('1254', 110),
 ('2120', 109),
 ('4171', 107),
 ('51924', 107),
 ('

In [157]:
grouped_user_data =[]

min_messages = 3
max_messages = 300

grouped_messages_by_user = grouped_messages
for k in grouped_messages.keys():
    v = grouped_messages[k]
    if (len(v)<max_messages and len(v)>min_messages):
        grouped_user_data.append(messages_vectorizer(v))

        


In [158]:
grouped_user_ids=[]
for k in grouped_messages.keys():
    v = grouped_messages[k]
    if (len(v)<max_messages and len(v)>min_messages):
        grouped_user_ids.append(k)

In [159]:
len(grouped_user_ids)

15716

In [18]:
users_topic_model = model(grouped_user_data, state)

In [21]:
the_model = users_topic_model[0]
the_corpus = users_topic_model[1]
word_dict = users_topic_model[2]

the_model.get_document_topics(the_corpus[0])

joblib.dump(users_topic_model, '../Outputs/users_topic_model.pkl' ) 



['../Outputs/users_topic_model.pkl',
 '../Outputs/users_topic_model.pkl_01.npy',
 '../Outputs/users_topic_model.pkl_02.npy',
 '../Outputs/users_topic_model.pkl_03.npy',
 '../Outputs/users_topic_model.pkl_04.npy',
 '../Outputs/users_topic_model.pkl_05.npy',
 '../Outputs/users_topic_model.pkl_06.npy']

In [160]:

users_topic_model = joblib.load('../Outputs/users_topic_model.pkl')
the_model = users_topic_model[0]
the_corpus = users_topic_model[1]
word_dict = users_topic_model[2]


In [161]:
the_model.get_topic_terms(0,topn=20)

[(109, 0.020517455092605674),
 (187, 0.01981773283974796),
 (61, 0.019489766055935095),
 (568, 0.01606127018240433),
 (92, 0.015368264130438403),
 (10, 0.014297955965357665),
 (489, 0.01364294644706146),
 (564, 0.013042937833213124),
 (1, 0.011099333724387749),
 (226, 0.010820532632083234),
 (94, 0.010497357771508313),
 (1131, 0.010483115217618624),
 (441, 0.010224872472235779),
 (23, 0.0094180934385024086),
 (1977, 0.00865783117210496),
 (4, 0.0084032962603300693),
 (562, 0.0083955589890519419),
 (47, 0.0082115215449398739),
 (981, 0.0075506410878323979),
 (63, 0.0074509346807565989)]

In [162]:
with open('../Outputs/topics.csv', 'wb') as csvfile:
    
    mywriter = csv.writer(csvfile, delimiter=',',quotechar='"')
    
    for t in range(10):
        topic_words= ['T'+str(t)]
    
        for k,v in the_model.get_topic_terms(t,topn=25):
            word = word_dict[k]
            topic_words.append(word)
        
        mywriter.writerow(topic_words)
    


In [163]:
topics_to_users={}
for j in range(10):
    topics_to_users[j]={}

In [164]:
with open('../Outputs/users.csv', 'wb') as csvfile:    
    mywriter = csv.writer(csvfile, delimiter=',',quotechar='"')
    
    for i in range(len(grouped_user_ids)):
        user_id= grouped_user_ids[i]
        scores= the_model.get_document_topics(the_corpus[i])
        scores_dict={}
        for score in scores:
            scores_dict[score[0]]=score[1]
        scores_arr=[]
        for j in range(10):
            if j in scores_dict.keys():
                scores_arr.append(scores_dict[j])
                topics_to_users[j][user_id]=scores_dict[j]                
            else:
                scores_arr.append(0.0)
                topics_to_users[j][user_id]=0.0
        output_arr=[]
        output_arr.append(user_id)
        for s in scores_arr:
            output_arr.append(str(s))
            
        mywriter.writerow(output_arr)


In [165]:
def top_users_for_topic(topics_to_users, t, max_val=50):
    scores = topics_to_users[t]
    sorted_scores= sorted(scores.items(), key=operator.itemgetter(1))
    sorted_scores.reverse()
    return sorted_scores[:max_val]
    
def topic_scores_for_message(message):
    message_to_predict = messages_vectorizer([message])
    message_to_predict = word_dict.doc2bow(message_to_predict)
    scores = the_model.get_document_topics(message_to_predict)
    return scores

def top_users_for_message(message, user_scores):
    message_scores = topic_scores_for_message(message)
    topic = max(message_scores, key=lambda item:item[1])[0]
    top_users = [x for (x, y) in top_users_for_topic(user_scores, topic)]
    return topic, top_users

In [190]:
def questions_to_recommended_users():
    for message in messages[:100]:
        if(message.msg_type=='Q'):
            topic, users = top_users_for_message(message, topics_to_users)
            print 'Question: '
            print message.msg_body, topic
            
            print 'recommended users: '
            shuffled = random.shuffle(users)
            for u in users[:10]:             
                users_answers = filter (lambda x: x.msg_type=='A',grouped_messages[u])
                if(len(users_answers)==0): 
                    continue
                print 'User: ', u
                for a in users_answers: 
                    print a.msg_body
            print

In [191]:
questions_to_recommended_users()

Question: 
when is the right time to prune my tea bushes? 1
recommended users: 
User:  36995
ITS FINE
yeah they grow faster
Maxiwa
Mriko
User:  47952
DUMA 43
User:  28200
use pesticide like;_Actara,karate,dynamec,and polytrin
User:  39984
PREPARE LAND FIRST
IS POSSIBLE WHAT MATTERS IS FERTILITY IN THE SOIL
User:  55332
its a problem which is within the soil.next time when planting use ash
User:  57918
by spraying
i hope there are people online from migorg
maize beans and grean vegetables
User:  53389
Using a forked jembe, you weed your coffee Ones a year during drought onset
User:  38965
you can as well use rain water which you have stored to irrigate your crops
YES
User:  2581
Mark Wafula.
The best soil of planting cassava, it needs sand soil, soil that has particles.
LOAM SOIL
The best soil of planting cassava, it needs sand soil, soil that has particles.
User:  58257
a At transplanting apply 20g of DSP per hole, b) Top dress with CAN at 100kg/ha when the plants are 25-30cm high or 2

In [29]:
top_users_for_topic (topics_to_users,0)

[('54878', 0.98732308130218016),
 ('46945', 0.97186803670461841),
 ('55183', 0.96896291320632355),
 ('44611', 0.96785486654547526),
 ('52942', 0.96086712042780442),
 ('19940', 0.95262877775337362),
 ('4076', 0.95262581101780142),
 ('56509', 0.95262238011929623),
 ('48146', 0.94999656920755071),
 ('45052', 0.94999372163195339),
 ('30605', 0.94999320433534173),
 ('35897', 0.94705752702384582),
 ('58415', 0.94705562131469734),
 ('2694', 0.94705413861783494),
 ('40936', 0.9470509903551978),
 ('7713', 0.93999761243477942),
 ('44835', 0.93647152915410992),
 ('42057', 0.93571056947729936),
 ('34417', 0.93570531283471758),
 ('52742', 0.93076617629553604),
 ('29635', 0.93076146482478639),
 ('38058', 0.92692255379407951),
 ('21644', 0.92692165845291541),
 ('60289', 0.92499730293941729),
 ('36381', 0.91805698216320242),
 ('32299', 0.9156465828816378),
 ('38687', 0.90999405595313565),
 ('17766', 0.90998843851350852),
 ('23191', 0.90938627260955551),
 ('25392', 0.904666516987509),
 ('14698', 0.9026

In [30]:
the_model.get_document_topics(the_corpus[0])

[(0, 0.16748129357249819),
 (1, 0.13454891766960206),
 (2, 0.031228345683265341),
 (3, 0.42654866982799711),
 (7, 0.094419222857231019),
 (8, 0.13592423933534267)]

In [46]:
df_msg_en['Clean'] = df_msg_en.apply(cleaner,axis=1)

df_msg_en_q = df_msg_en[(df_msg_en['type'] == 'Q')]
df_msg_en_a = df_msg_en[(df_msg_en['type'] == 'A')]

li_msg_en_q = df_msg_en_q['Clean'].tolist()
li_msg_en_a = df_msg_en_a['Clean'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [None]:
sample = li_msg_en_q[:sample_size]
data_model, data_corpus = model(sample)

In [None]:
data_model.get_document_topics(data_corpus[0])

In [None]:
# processing a message to make a prediction
users_topic_model = joblib.load('../Outputs/users_topic_model.pkl' ) 

the_model = users_topic_model[0]
the_corpus = users_topic_model[1]
word_dict = users_topic_model[2]

print(the_corpus[0])
test_message = messages[0]
print(test_message.msg_body.lower())
the_model.get_document_topics(the_corpus[0])
test_message = messages_vectorizer([test_message])
print(test_message)
test_vectorized = word_dict.doc2bow(test_message)

the_model.get_document_topics(test_vectorized)

for k, v in the_model.get_topic_terms(4, topn=25):
    word = word_dict[k]
    print(word)

#for message in messages:
#    the_model.get_document_topics("chicken farming feed")
    


In [44]:
with open('../Outputs/message_predictions.csv', 'wb') as csvfile:    
    mywriter = csv.writer(csvfile, delimiter=',',quotechar='"')
    
    for message in messages:
        scores = topic_scores_for_message(message)
        
        scores_dict={}
        for score in scores:
            scores_dict[score[0]]=score[1]
        scores_arr=[]
        for j in range(10):
            if j in scores_dict.keys():
                scores_arr.append(scores_dict[j])
            else:
                scores_arr.append(0.0)
        output_arr=[message.thread_id, dt.strftime(message.date_time, "%Y-%m-%d %H:%M:%S"), message.message_id, message.user_id, message.language, message.msg_type, message.msg_body]
        for s in scores_arr:
            output_arr.append(str(s))
        mywriter.writerow(output_arr)