In [None]:
from __future__ import division, unicode_literals
import pandas as pd
import re
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk import pos_tag
import gensim
from gensim import corpora
import math
from textblob import TextBlob as tb
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline

###

def clean_init_data(df, speech):
    """Coerce the identifier and text columns of a freshly loaded frame to ``str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Speech/ad-hoc frames must have ``mtn`` and ``script``
        columns; any other frame must have the two
        ``april_chat_campaigndetails.*`` columns used below.
    speech : str or bool
        Dataset selector. ``'Speech'``, ``'Adhoc'`` or ``True`` select the
        speech layout; every other value selects the chat layout.

    Returns
    -------
    pandas.DataFrame
        For the speech layout, the same ``df`` object (modified in place).
        For the chat layout, a new two-column frame de-duplicated on the
        engagement id.
    """
    # The previous `if speech:` test was true for every non-empty string, so
    # chat selectors were routed to the speech branch and failed on 'mtn'.
    if speech is True or speech in ('Speech', 'Adhoc'):
        for column in ('mtn', 'script'):
            df[column] = df[column].apply(str)
        return df

    id_col = 'april_chat_campaigndetails.engagement_id'
    text_col = 'april_chat_campaigndetails.customer_chat_text'

    df = df.drop_duplicates(subset=id_col)
    # Copy so the assignments below write to an independent frame rather than
    # to a slice of the de-duplicated one.
    df = df[[id_col, text_col]].copy()
    df[text_col] = df[text_col].apply(str)
    df[id_col] = df[id_col].apply(str)
    return df

def min_date(val):
    """Return the smallest element of ``val`` using its own ``min()`` method."""
    earliest = val.min()
    return earliest

def concat_text(val):
    """Join the strings in ``val`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(val)

def get_data(speech):
    """Load one of the three tab-separated datasets and prepare it for tokenizing.

    Parameters
    ----------
    speech : str
        ``'Speech'`` loads the raw transcripts, ``'Adhoc'`` loads the ad-hoc
        chat file, and any other value loads the campaign-details chat file.

    Returns
    -------
    tuple
        ``(df_orig, df, SCRIPT)``: the loaded frame, the frame returned by
        ``clean_init_data`` (grouped per ``mtn`` for ``'Speech'``), and the
        name of the column that holds the text.
    """
    if speech == 'Speech':
        raw = pd.read_table('/apps/opt/applicaitons/datasets/Datasets/raw_transcript_5000.txt', sep='\t')
        headers = raw.columns
        # Columns are addressed by position: 0 -> mtn, 1 -> start_tm, 4 -> script.
        raw.rename(columns={headers[0]: 'mtn',
                            headers[1]: 'start_tm',
                            headers[4]: 'script'},
                   inplace=True)
        cleaned = clean_init_data(raw, speech)
        # One row per mtn: earliest start time, all scripts joined together.
        per_mtn = cleaned.groupby(['mtn']).agg({'start_tm': min_date, 'script': concat_text})
        return raw, per_mtn.reset_index(), 'script'

    if speech == 'Adhoc':
        raw = pd.read_table('/apps/opt/applicaitons/datasets/Datasets/chat_adhoc.txt', sep='\t')
        headers = raw.columns
        raw.rename(columns={headers[0]: 'mtn', headers[1]: 'script'}, inplace=True)
        cleaned = clean_init_data(raw, speech)
        return raw, cleaned, 'script'

    loaded = pd.read_table('/apps/opt/applicaitons/datasets/Datasets/campaigndetails.tsv', sep='\t')
    # The returned "original" is de-duplicated, but cleaning starts from the
    # frame exactly as it was loaded.
    deduplicated = loaded.drop_duplicates(subset='april_chat_campaigndetails.engagement_id')
    cleaned = clean_init_data(loaded, speech)
    return deduplicated, cleaned, 'april_chat_campaigndetails.customer_chat_text'

# Load the speech dataset: the loaded frame, the cleaned frame processed by the
# rest of this script, and the name of the column that holds the text.
df_orig, final_df, SCRIPT = get_data(speech = 'Speech')

### Data Cleaning and Tokenization

# Two separate instances of the same WordNetLemmatizer class; fn_tokenizer
# applies both, one after the other.
wordnet_lemmatizer = WordNetLemmatizer()
lemmatizer = WordNetLemmatizer()

# Hand-maintained stop list that fn_tokenizer removes from the extracted nouns.
# NOTE(review): many entries are repeated; harmless for the membership test,
# but the list could be de-duplicated.
# NOTE(review): fn_tokenizer lower-cases the text and drops words of three
# characters or fewer before this list is consulted, so the capitalised entries
# ('None', 'NULL', 'Null') and most of the short entries look like they can
# never match - confirm before pruning.
bogus_words = ['hi', 'ok', 'okay', 'yes', 'customer', 'hello', 'hmm', 'hmmm', 'hmmmm', 'thnx', 'ty', 'ohk', 'hey', 
               'tq', 'hmmk', 'blah', 'bla', 'tku', 'thank', 'thanks', 'nt', 'u', 'e', 'en', 'xx', 't', 'yeah', 'youre', 
               'yeah', 'youre', 'thats', 'thing', 'dont', 'something', 'maam', 'everything', 'name', 'dont', 'anything',
              'didnt', 'mean', 'look', 'guy', 'kind', 'moment', 'wife', 'yesterday', 'sorry', 'not', 'can', 'right',
              'gosh', 'today', 'amir', 'cent', 'forty', 'please', 'system', 'person', 'people', 'can', 'not',
              'doe', 'number', 'hi', 'ok', 'okay', 'yes', 'customer', 'hello', 'hmm', 'hmmm', 'hmmmm', 'thnx', 'ty', 'ohk',
             'hey', 'tq', 'hmmk', 'blah', 'bla', 'tku', 'none', 'None', 'haha', 'gt', 'etc', 'hehe', 'se',
            'yo', 'ya', 'omg', 'technically', 'gracias', 'ah', 'un', 'ci', 'certainly', 'ki', 'dot', 'ti',
            'etc', 'regularly', 'outright', 'bc', 'truly', 'nearly', 'txt', 'significantly', 'per', 'yep',
            'yup', 'nd', 'immediately', 'ma', 'tho', 'umm', 'np', 'gotcha', 'zy', 'lately', 'extremely',
            'ho', 'enjoy', 'youre', 'youve', 'yea', 'yeah', 'heck', 'thru', 'ha', 'sec', 'yup', 'na',
            'thanks', 'thank', 'nt', 'en', 'xx', 'dont', 'thats', 'something', 'maam', 'everything',
            'didnt', 'mean', 'look', 'guy', 'kind', 'moment', 'yesterday', 'sorry', 'not', 'can', 'right',
            'gosh', 'today', 'amir', 'cent', 'please', 'person', 'people', 'can', 'not', 'doe', 'oh',
            'got', 'go', 'like', 'guess', 'much', 'far', 'im', 'still', 'actually', 'really', 'awesome',
            'said', 'hope', 'sure', 'sorry', 'because', 'let', 'morning', 'curious', 'bit', 'cool', 'ti',
            'well', 'nope', 'also', 'basically', 'th', 'null', 'NULL', 'Null', 'its', 'nothing', 'pa', 'mike', 'died',
              'parent', 'isnt', 'tomorrow', 'husband', 'yet', 'want', 'town', 'family', 'years', 'them', 'meet', 'ampamp',
              'stephen', 'till', 'daughter', 'done', 'get', 'you']

def fn_tokenizer(s):
    """Reduce a raw transcript to a space-separated string of its nouns.

    Steps, in order: strip masking (runs of two or more ``x``, and ``*``)
    and digits, lower-case, drop words of three characters or fewer,
    lemmatize with both module-level lemmatizers, keep only words tagged as
    nouns, strip ``, . ! ? : ; " '`` and finally drop ``bogus_words``.

    Parameters
    ----------
    s : str
        Raw text of one document.

    Returns
    -------
    str
        The surviving words joined by single spaces; ``''`` when none survive.
    """
    s = re.sub(r'x{2,}', '', s)  # remove masking
    s = re.sub(r'[*]', '', s)  # remove masking
    s = re.sub(r'\d', '', s)  # remove digits
    s = s.lower()

    # split() with no argument splits on any whitespace run, so tabs and
    # newlines separate words just as spaces do and no empty words appear.
    words = [word for word in s.split() if len(word) > 3]
    words = [lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(word)) for word in words]

    noun_tags = ('NN', 'NNP', 'NNS', 'NNPS')
    nouns = [word for word, tag in pos_tag(words) if tag in noun_tags]

    # Punctuation is stripped only after tagging, as before.
    stripped = (re.sub(r'''[,.!?:;"']''', '', word) for word in nouns)

    # Build the stop set once per call instead of scanning the list per word,
    # and drop words that were nothing but punctuation.
    stop_words = set(bogus_words)
    return ' '.join(word for word in stripped if word and word not in stop_words)

# Tokenize every document, keep only the rows that still contain text, and
# renumber the rows from zero.
final_df[SCRIPT] = final_df[SCRIPT].apply(fn_tokenizer)
has_text = final_df[SCRIPT].map(len) > 0
final_df = final_df[has_text].reset_index().drop('index', axis=1)

### TF-IDF and Topic Modeling

# Term Frequency : Number of times word appears in a document blob, 
# normalized by dividing by the total number of words in the blob
# We use TextBlob for breaking up the text into words and getting the word counts.
def tf(word, blob):
    """Return the share of ``blob.words`` that equals ``word``.

    Parameters
    ----------
    word : str
        Term to count.
    blob : object
        Any object with a ``words`` sequence that supports ``count`` and
        ``len`` (in this script, a TextBlob).

    Returns
    -------
    float
        ``count / total``; ``0.0`` when the blob has no words, instead of
        raising ``ZeroDivisionError``.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        return 0.0
    return words.count(word) / total

# Returns the number of documents containing word.
def n_containing(word, bloblist):
    """Count the blobs in ``bloblist`` whose ``words`` contain ``word``."""
    matching = [blob for blob in bloblist if word in blob.words]
    return len(matching)

# Computes inverse document frequency, which measures how common a word is among all documents in the bloblist.
# The more common the word is, lower the idf. We take the ratio of the total number of documents to the number of 
# documents containing word, then take a log of that. Add 1 to the divisor to prevent division by 0.
def idf(word, bloblist):
    """Return ``log(N / (1 + n))`` for ``word``.

    ``N`` is the number of blobs in ``bloblist`` and ``n`` the number of them
    that contain ``word``; the ``1 +`` keeps the divisor non-zero.
    """
    total_docs = len(bloblist)
    smoothed_hits = 1 + n_containing(word, bloblist)
    return math.log(total_docs / smoothed_hits)

# It computes the TF-IDF score
def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word``: ``tf(word, blob) * idf(word, bloblist)``."""
    term_frequency = tf(word, blob)
    inverse_doc_frequency = idf(word, bloblist)
    return term_frequency * inverse_doc_frequency

# fn_topWords ranks words by raw term frequency (TF) only; use it only when TF-based selection is wanted:

def fn_topWords(df):
    """Select the high-frequency words of every document in ``df[SCRIPT]``.

    For each document the term frequency of every word is computed with
    ``tf``; words whose frequency is strictly above the document's 25th
    percentile are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``SCRIPT`` column holds one space-separated string per row.

    Returns
    -------
    dict
        Maps the row position (0, 1, 2, ...) to the kept words of that
        document, alphabetically sorted with repeats preserved.
    """
    pre_topic_dict = {}
    # Iterate the column directly: Series.iteritems() no longer exists in
    # pandas 2.x and the labels were never used.
    for position, text in enumerate(df[SCRIPT]):
        blob = tb(text)
        words = blob.words
        scores = {word: tf(word, blob) for word in words}
        if not scores:
            # Nothing to rank; also avoids taking a percentile of an empty list.
            pre_topic_dict[position] = []
            continue

        threshold = np.nanpercentile(list(scores.values()), 25)  # taking above 25 percentile
        # A set gives constant-time membership instead of re-sorting a list
        # for every word.
        top_words = {word for word, score in scores.items() if score > threshold}
        pre_topic_dict[position] = [word for word in sorted(words) if word in top_words]
    return pre_topic_dict

# Per-document word lists (row position -> selected words) fed to topic_modeling below.
pre_topic_dict = fn_topWords(final_df)

def pickWords(x, dct):
    """Keep the words of ``x`` whose weight reaches the document's 25th percentile.

    Parameters
    ----------
    x : str
        One document as a space-separated string.
    dct : dict
        Maps a term to its weight (``computeTF_IDF`` passes IDF values).

    Returns
    -------
    list
        The words of ``x`` in their original order, repeats included, whose
        weight is at or above the 25th percentile of the weights of the
        distinct words of ``x`` found in ``dct``. Words missing from ``dct``
        are dropped.
    """
    tokens = x.split(' ')

    # Look up whole tokens. The previous `term in x` test on the raw string
    # matched substrings (e.g. 'art' inside 'start'), so unrelated vocabulary
    # terms shifted the threshold.
    weights = {}
    for token in tokens:
        if token in dct:
            weights[token] = dct[token]
    if not weights:
        return []

    thresh = np.nanpercentile(list(weights.values()), 25)
    high_info_words = {token for token, weight in weights.items() if weight >= thresh}
    return [token for token in tokens if token in high_info_words]

def computeTF_IDF(df):
    """Filter every document in ``df[SCRIPT]`` down to its high-IDF words.

    A ``TfidfVectorizer`` is fitted on the column to obtain one IDF value per
    vocabulary term; ``pickWords`` then selects the words of each document.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``SCRIPT`` column holds one space-separated string per
        row. The column is overwritten in place with lists of words.

    Returns
    -------
    tuple
        ``(pre_topic_dict, df)``: a dict mapping row position to the word
        list, and the frame without rows whose list came out empty,
        re-indexed from zero.
    """
    vectorizer = TfidfVectorizer(min_df=1)
    vectorizer.fit(df[SCRIPT])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for releases that predate it.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    # Named idf_by_term so the module-level idf() function is not shadowed.
    idf_by_term = dict(zip(feature_names(), vectorizer.idf_))

    df[SCRIPT] = df[SCRIPT].apply(lambda text: pickWords(text, idf_by_term))
    df = df[df[SCRIPT].apply(lambda words: len(words) > 0)]  # drop rows left with no words
    df = df.reset_index(drop=True)

    pre_topic_dict = dict(enumerate(df[SCRIPT]))
    return pre_topic_dict, df

#pre_topic_dict, final_df = computeTF_IDF(final_df)

def optimize_topic(lsamodel, doc_term_matrix, max_k=10, random_state=None):
    """Estimate a topic count from the elbow of a KMeans inertia curve.

    Every document is projected through ``lsamodel``. Documents whose
    projection has exactly two components become 2-D points, which are
    clustered with KMeans for K = 1..``max_k``. The chosen K is where the
    third difference of the inertia curve is smallest. The curve is plotted
    with the chosen K highlighted.

    Parameters
    ----------
    lsamodel : object
        Indexing it with ``doc_term_matrix`` must yield, per document, a
        sequence of pairs whose second item is the coordinate (presumably a
        2-topic gensim LsiModel - confirm against the caller).
    doc_term_matrix : list
        Corpus to project.
    max_k : int, optional
        Largest K to try (default 10). Capped at the number of usable points.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so runs can be made repeatable.

    Returns
    -------
    int
        The selected number of topics, between 4 and ``max_k``.

    Raises
    ------
    ValueError
        If fewer than four usable points (or ``max_k`` below 4) are available,
        because the third difference needs at least four K values.
    """
    # Keep only documents that received both coordinates.
    points = [(vector[0][1], vector[1][1])
              for vector in lsamodel[doc_term_matrix]
              if len(vector) == 2]

    # KMeans cannot fit more clusters than there are samples.
    max_k = min(max_k, len(points))
    if max_k < 4:
        raise ValueError(
            "optimize_topic needs at least 4 two-component points and max_k >= 4; "
            "got %d usable K values" % max_k)

    ks = list(range(1, max_k + 1))
    # Inertia: sum of squared distances of each point to its cluster centroid.
    inertias = np.array([
        KMeans(n_clusters=k, random_state=random_state).fit(points).inertia_
        for k in ks
    ])

    # np.diff(n=3)[j] is the third difference ending at inertias[j + 3].
    elbow = int(np.argmin(np.diff(inertias, n=3))) + 3
    num_of_topics = ks[elbow]

    plt.plot(ks, inertias, "b*-")
    plt.plot(ks[elbow], inertias[elbow], marker='o', markersize=12,
             markeredgewidth=2, markeredgecolor='r', markerfacecolor=None)
    plt.ylabel("Inertia")
    plt.xlabel("K")
    plt.show()

    print("The Number of Optimal Topics: ", num_of_topics)

    return num_of_topics

def topic_modeling(pre_topic_dict):
    """Train an LDA model on the tokenized documents.

    Parameters
    ----------
    pre_topic_dict : dict
        Maps a document key to its list of tokens; documents are taken in
        the dict's iteration order.

    Returns
    -------
    tuple
        ``(ldamodel, doc_term_matrix, num_of_topics)``: the trained
        ``LdaMulticore`` model, the bag-of-words corpus it was trained on,
        and the topic count chosen by ``optimize_topic``.
    """
    documents = list(pre_topic_dict.values())

    # Term dictionary: every unique term of the corpus is assigned an index.
    vocabulary = corpora.Dictionary(documents)

    # Bag-of-words representation of each document.
    bag_of_words = [vocabulary.doc2bow(tokens) for tokens in documents]

    # The topic count is not known up front: project the corpus onto two LSI
    # dimensions and let optimize_topic pick it from the clustering of that
    # projection.
    projection = gensim.models.LsiModel(bag_of_words, id2word=vocabulary, num_topics=2)
    topic_count = optimize_topic(projection, bag_of_words)

    # Train LDA on the document-term matrix with the chosen topic count.
    trained = gensim.models.ldamulticore.LdaMulticore(
        bag_of_words,
        num_topics=topic_count,
        id2word=vocabulary,
        passes=1,
        chunksize=500,
    )

    return trained, bag_of_words, topic_count

# Train the topic model; keep the model, its document-term matrix and the chosen topic count.
lda_model, dtm, num_of_topics = topic_modeling(pre_topic_dict)

def find_topic(aList):
    """Pick the dominant topic ids from a document's topic distribution.

    Parameters
    ----------
    aList : list
        ``(topic_id, probability)`` pairs for one document.

    Returns
    -------
    list
        * one pair: that topic id, unconditionally;
        * two pairs: ids whose probability is at least ``mean - 0.01 + 0.05``;
        * otherwise: ids whose probability is at least ``(mean - 0.01) * 1.2``.

        The list may be empty when no topic clears the cutoff.
    """
    probabilities = [pair[1] for pair in aList]
    # Mean probability, lowered by 0.01 as a tolerance for borderline topics.
    baseline = np.mean(probabilities) - .01

    pair_count = len(aList)
    if pair_count == 1:
        only_topic, _ = aList[0][0], aList[0][1]
        return [only_topic]

    if pair_count == 2:
        cutoff = baseline + .05
    else:
        cutoff = baseline*1.2

    chosen = []
    for pair in aList:
        if pair[1] >= cutoff:
            chosen.append(pair[0])
    return chosen
    

def topic_doc_map(lda_model, dtm, num_of_topics):
    """Map documents to their dominant topics and topics to their top words.

    Parameters
    ----------
    lda_model : object
        Trained topic model. It must support ``lda_model[dtm]``,
        ``get_topic_terms(topic)`` and ``show_topic(topic, topn=...)``
        (presumably a gensim LDA model - confirm against the caller).
    dtm : list
        Document-term matrix the model is applied to.
    num_of_topics : int
        Topics ``0 .. num_of_topics - 1`` are described.

    Returns
    -------
    tuple
        ``(topic_doc_dict, topic_word_dict)``: document position -> list of
        topic ids chosen by ``find_topic``, and topic id -> list of that
        topic's highest-weighted words.
    """
    topic_doc_dict = {doc: find_topic(topics)
                      for doc, topics in enumerate(lda_model[dtm])}

    topic_word_dict = {}
    for topic in range(num_of_topics):
        # (word_id, weight) pairs for the topic; only the weights are needed.
        weights = [weight for _, weight in lda_model.get_topic_terms(topic)]
        wt_thresh = np.percentile(weights, 25)  # setting a weight threshold
        total_tokens = sum(1 for weight in weights if weight > wt_thresh)

        # Ask the model for the words directly. Parsing print_topic() output
        # with "[a-zA-Z]+" split or dropped any token containing a character
        # outside A-Z (for example a hyphen).
        topic_word_dict[topic] = [
            word for word, _ in lda_model.show_topic(topic, topn=total_tokens)
        ]

    return topic_doc_dict, topic_word_dict

# Document -> chosen topic ids, and topic id -> representative words.
topic_doc_dict, topic_word_dict = topic_doc_map(lda_model, dtm, num_of_topics)

def topic_extract(r):
    """Translate a document's topic ids into words via the global ``topic_word_dict``.

    Returns ``'OTHER_TOPIC'`` when ``r`` is empty, the word list of the
    single topic when ``r`` has one id, and an ``{id: word list}`` dict when
    it has several.
    """
    topic_count = len(r)
    if topic_count == 0:
        return 'OTHER_TOPIC'
    if topic_count == 1:
        sole_topic = r[0]
        return topic_word_dict[sole_topic]
    return {topic_id: topic_word_dict[topic_id] for topic_id in r}
    

def present_data(final_df, topic_doc_dict, topic_word_dict):
    """Attach topic ids and topic words to every document and save the table.

    Parameters
    ----------
    final_df : pandas.DataFrame
        One row per document; its index must match the keys of
        ``topic_doc_dict``.
    topic_doc_dict : dict
        Maps a document index to its list of topic ids.
    topic_word_dict : dict
        Maps a topic id to its list of words.

    Returns
    -------
    pandas.DataFrame
        ``final_df`` plus a ``topic`` column (topic id lists) and a
        ``topic_tokens`` column: ``'OTHER_TOPIC'`` for no topic, the word
        list for one topic, or an ``{id: words}`` dict for several. The
        frame is also written to ``final_output_text_topic.csv``.
    """
    def describe(topic_ids):
        # Resolve against the mapping passed to present_data; previously the
        # parameter was ignored and a module-level global was read instead.
        count = len(topic_ids)
        if count == 0:
            return 'OTHER_TOPIC'
        if count == 1:
            return topic_word_dict[topic_ids[0]]
        return dict((topic_id, topic_word_dict[topic_id]) for topic_id in topic_ids)

    topic_df = pd.concat((final_df, pd.Series(topic_doc_dict, name='topic')), axis=1)
    topic_df['topic_tokens'] = topic_df['topic'].apply(describe)
    topic_df.to_csv('final_output_text_topic.csv')
    return topic_df

# Bare expression: shows the topic -> words mapping when run as a notebook cell.
topic_word_dict

# Build the final table (present_data also writes it to CSV) and preview the first five rows.
present_data(final_df, topic_doc_dict, topic_word_dict).head(5)