# Data Pull

In [48]:
import pandas as pd

from pymongo import MongoClient

client = MongoClient('localhost:27017')
db = client.teamspeed
collection = db.forum_teamspeed

dataset = []
for element in collection.find():
    dataset.append(element)
    
df = pd.DataFrame(dataset)

## Merge subject and post

In [50]:
df['full_verbatim'] = df.apply(lambda x: x['subject'] + " " + x['post'],axis=1)

In [None]:
len(df)

# Data Cleaning

In [57]:
import re, itertools
import nltk
from nltk.corpus import stopwords 

def data_cleaning(verbatim):
    verbatim = verbatim.strip() #remove whitespaces
    verbatim = re.sub(r'<[^<]+?>', ' ', verbatim) #remove html tags
    verbatim = re.sub(r'https?:\/\/.*[\r\n]*', ' ', verbatim, flags=re.MULTILINE) #remove urls
    verbatim = re.sub(r'[^\w\s]',' ',verbatim) #remove ponctuation
    verbatim = ''.join(''.join(s)[:2] for _, s in itertools.groupby(verbatim)) #Standardize words
    verbatim = ' '.join(re.findall('[A-Z][^A-Z]*', verbatim)) #Split attached words
    verbatim = verbatim.lower() #Lowercase
    verbatim = ' '.join([word for word in verbatim.split() if word not in (stopwords.words('english'))]) #Stopwords
    tokens = nltk.word_tokenize(verbatim) #Tokenize
    return(tokens)

In [59]:
df['cleaned'] = df.apply(lambda x: data_cleaning(x['full_verbatim']),axis=1)

# POS Extraction

In [61]:
def pos_extraction(tokens):
    pos_tokens = nltk.pos_tag(tokens)
    return(pos_tokens)


def select_pos(pos_tokens,lst_pos):
    subset = [pos_token[0] for pos_token in pos_tokens for pos in lst_pos if pos_token[1].startswith(pos)]
    return(subset)


In [62]:
df['pos'] = df.apply(lambda x: pos_extraction(x['cleaned']),axis=1)

In [63]:
df['cleaned_nn_jj'] = df.apply(lambda x: select_pos(x['pos'],['NN','JJ']),axis=1)

In [65]:
def make_string(token_lst):
    cleaned = [token for token in token_lst if len(token) > 2]
    return(" ".join(cleaned))
    

In [66]:
df['str_cleaned_nn_jj'] = df.apply(lambda x: make_string(x['cleaned_nn_jj']),axis=1)

# Topic Modelling

In [86]:
run -i lda_script.ipynb

In [None]:
loglikelihoods = []

for i in range(10,50,10):
    model_lda = topic_analysis()
    model_lda.get_results(df,'str_cleaned_nn_jj',i)
    loglikelihoods.append((i,model_lda.model.loglikelihood()))

In [None]:
element = max(enumerate(loglikelihoods), key=lambda x: x[1])
print("Optimal number of topics : {} ({})".format(element[1][0],element[1][1]))

# Topic Model Results

In [99]:
model_lda.topics

[(0,
  'watch steel case time watches black dial new call hours gold movement trade strap special rolex prices available perfect date photos panerai details hour night crystal sapphire stainless water automatic specs complete local delivery wide price chronograph mike power reserve clock sale hand bezel boxes papers size crown brand',
  '0.045781035469107556 0.019679776887871852 0.019036184210526316 0.01827817505720824 0.015489273455377574 0.013172339816933639 0.013086527459954233 0.012228403890160182 0.012056779176201373 0.011970966819221968 0.011799342105263158 0.011055635011441647 0.01029762585812357 0.010226115560640733 0.009282179633867276 0.00892462814645309 0.008853117848970252 0.008595680778032036 0.008266733409610984 0.008138014874141876 0.00812371281464531 0.0077232551487414185 0.0076660469107551485 0.007294193363844394 0.007208381006864988 0.007151172768878718 0.007093964530892448 0.006993850114416476 0.006936641876430206 0.006865131578947368 0.006607694508009153 0.006564788

In [94]:
loglikelihoods

[(10, -21057434.972703733),
 (20, -20951459.22897517),
 (30, -20999990.31793885),
 (40, -21027163.26144508)]

In [143]:
import matplotlib.pyplot as plt
import numpy as np


def print_topics(model_lda):
    topics = model_lda.topics
    dataframe = model_lda.dataframe

    for topic in topics:
        labels = topic[1].split(" ")
        dx = [float(x) for x in topic[2].split(" ")]

        fig, ax = plt.subplots()
        ind = range(0,len(labels))

        width = 0.25  

        ax.bar(ind,dx ,width, color='b')
        ax.set_xticks(np.arange(len(ind)) + width/2)
        ax.set_xticklabels(labels,rotation=90)

        plt.tight_layout()
        #plt.show() #Optional
        plt.savefig('topic' + str(topic[0]) + '.png', dpi=300)

        subset = dataframe[dataframe['Topic_id']==topic[0]]

        print("Topic " + str(topic[0]) + ": \n\n\n")
        for i,row in subset.sample(10).iterrows():
            print(row['full_verbatim'] + "\n")


In [None]:
print_topics(model_lda)