In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import numpy as np
import warnings
from gensim import corpora, models
from sklearn.metrics import confusion_matrix, accuracy_score
warnings.filterwarnings('ignore')
import re
%matplotlib inline

In [2]:
# Load the pickled posts data; later cells read its 'tokens', 'target' and 'text' columns.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
# NOTE(review): 'interum' looks like a misspelling of 'interim' - confirm the directory name on disk.
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [3]:
# split the frame into the model input (tokens) and the ground-truth label
feature, label = posts_df['tokens'], posts_df['target']

In [4]:
# keep only tokens with more than 2 characters (drops 1- and 2-character tokens)

feature = feature.apply(lambda tokens: [tok for tok in tokens if len(tok) > 2])

## feature engineering for clustering

In [6]:
# build the vocabulary (token <-> integer id mapping) from the filtered documents
dictionary = corpora.Dictionary(documents=feature)

In [7]:
# Prune the vocabulary in place:
#   no_below=1   -> keep tokens found in at least 1 document (i.e. no lower cut-off)
#   no_above=0.05 -> drop tokens found in more than 5% of the documents
#   keep_n=50000 -> after the above, keep at most the 50,000 most frequent tokens
dictionary.filter_extremes(no_below =1, no_above=0.05, keep_n=50000)

In [8]:
# bag-of-words vector for every document
bow = list(map(dictionary.doc2bow, feature))
# fit a TF-IDF model on those counts, then re-weight the bag-of-words corpus with it
tfidf = models.TfidfModel(corpus=bow)
corpus_tfidf = tfidf[bow]

In [9]:
# 5-topic LDA model trained on the bag-of-words corpus
lda_model = models.LdaMulticore(
    corpus=bow,
    id2word=dictionary,
    num_topics=5,
    passes=2,
    workers=4,
    random_state=42,
)

In [12]:
# 5-topic LDA model trained on the TF-IDF weighted corpus
lda_model_tfidf = models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=5,
    passes=2,
    workers=4,
    random_state=42,
)

In [11]:
def topic_top_word(model, num_topics=5, num_words=5):
    '''
    Tabulate the top words of each topic of an LDA model.

    input:
    model: lda_model (bow or tfidf); any object exposing gensim's
           show_topics(num_topics, num_words, formatted=False)
    num_topics: how many topics to include (default 5)
    num_words: how many top words to list per topic (default 5)
    return:
    a dataframe with top words for each topic: one column per topic,
    named 'topic_<id>', words in the order the model returns them
    '''
    # formatted=False yields (topic_id, [(word, weight), ...]) pairs, so the
    # words are taken verbatim instead of being regex-parsed out of the
    # printable '0.012*"word" + ...' string; an '[a-z]+' pattern splits or
    # drops tokens containing digits, underscores, hyphens, upper-case or
    # non-ASCII characters.
    topics = model.show_topics(
        num_topics=num_topics, num_words=num_words, formatted=False)
    # Column names come from the real topic ids rather than a fixed list of
    # five names, so models with a different topic count also work.
    return pd.DataFrame({
        'topic_{}'.format(topic_id): [word for word, _ in word_weights]
        for topic_id, word_weights in topics
    })

In [None]:
# Most probable topic id for every document under the bag-of-words LDA model.
# Each lda_model[doc] is a list of (topic_id, probability) pairs; max() picks
# the highest-probability pair directly, which covers the single-pair case
# too and avoids sorting the whole list just to read its first element.
pred_bow = [
    max(lda_model[doc], key=lambda topic_prob: topic_prob[1])[0]
    for doc in bow
]

In [None]:
# Most probable topic id for every document under the TF-IDF LDA model.
# lda_model_tfidf was trained on corpus_tfidf, so inference uses the same
# TF-IDF weighted vectors (not the raw bag-of-words counts) to keep the
# input representation consistent between training and prediction.
pred_tdif = [
    max(lda_model_tfidf[doc], key=lambda topic_prob: topic_prob[1])[0]
    for doc in corpus_tfidf
]
        

In [None]:
# One row per post: predicted topic from each model plus the true label.
# Built column-wise so each column keeps a proper dtype (a transposed list of
# rows yields object columns), and indexed like `label` so that an
# index-aligned concat with posts_df pairs every prediction with its own post
# even when posts_df does not carry a default 0..n-1 index.
result = pd.DataFrame(
    {'bow': pred_bow, 'tdif': pred_tdif, 'true_label': label.values},
    index=label.index)

In [None]:
# label the columns: BoW-model prediction, TF-IDF-model prediction, ground-truth target
result.columns = ['bow','tdif','true_label']

### Combine the model predictions with the original DataFrame and look at the text

In [None]:
# put each post's raw text next to its predictions (concat aligns rows on the index)
combined_df = pd.concat(objs=[posts_df[['text']], result], axis='columns')

### function to look at each topic separately

In [None]:
def text_topic(df, model,topic):
    '''
    Return the text of one randomly chosen row assigned to a topic.

    input:
    df: dataframe with a 'text' column and a column of predicted topic
        ids for each model
    model: name of the prediction column to filter on (e.g. 'bow')
    topic: topic num to sample from
    returns:
    random text for that topic
    raises:
    ValueError if no row of df has that topic in the given column
    '''
    # select once instead of building the same boolean filter twice
    texts = df.loc[df[model] == topic, 'text']
    if texts.empty:
        raise ValueError(
            'no rows with {}=={!r} to sample from'.format(model, topic))
    # Draw a position rather than an index label: a label lookup returns a
    # whole Series (not a single text) when the index has duplicate labels.
    return texts.iloc[np.random.randint(len(texts))]
    

## Topic 0

In [None]:
# random post assigned to BoW topic 0, the topic this section inspects
text_topic(combined_df, 'bow', 0)

In [None]:
# another random post assigned to BoW topic 0, the topic this section inspects
text_topic(combined_df, 'bow', 0)

In [None]:
# text at position 2000 among the posts assigned to BoW topic 0
combined_df.loc[combined_df['bow'] == 0, 'text'].iloc[2000]

In [None]:
# text at position 500 among the posts assigned to BoW topic 0
combined_df.loc[combined_df['bow'] == 0, 'text'].iloc[500]

## Topic 1:

In [None]:
# text at position 10 among the posts assigned to BoW topic 1
combined_df.loc[combined_df['bow'] == 1, 'text'].iloc[10]

In [None]:
# text at position 2001 among the posts assigned to BoW topic 1
combined_df.loc[combined_df['bow'] == 1, 'text'].iloc[2001]

In [None]:
# text at position 456 among the posts assigned to BoW topic 1
combined_df.loc[combined_df['bow'] == 1, 'text'].iloc[456]

## Topic 2:

In [None]:
# text at position 2001 among the posts assigned to BoW topic 2
combined_df.loc[combined_df['bow'] == 2, 'text'].iloc[2001]

In [None]:
# text at position 201 among the posts assigned to BoW topic 2
combined_df.loc[combined_df['bow'] == 2, 'text'].iloc[201]

In [None]:
# text at position 45 among the posts assigned to BoW topic 2
combined_df.loc[combined_df['bow'] == 2, 'text'].iloc[45]