In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import numpy as np
import warnings
from gensim import corpora, models
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Location of the pickled posts frame, relative to this notebook.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
TEXT_TARGET_PATH = '../data/interum/text_target.pkl'
posts_df = pd.read_pickle(TEXT_TARGET_PATH)

In [3]:
# Separate the model input (the per-post tokens) from the target column.
feature, label = posts_df['tokens'], posts_df['target']

In [38]:
# Extra stop words to drop; tokens of 2 characters or fewer are discarded as well
# (i.e. only tokens LONGER than 2 characters are kept).
words = ['class', 'file', 'use', 'code','string','function']
_stop_words = frozenset(words)  # constant-time membership inside the per-token filter
feature = feature.apply(
    lambda tokens: [w for w in tokens if len(w) > 2 and w not in _stop_words])

In [4]:
def coef_features(modelname, lg = True):
    """Collect the non-zero per-class feature weights of a fitted search object.

    Parameters
    ----------
    modelname : object
        Fitted search object exposing ``classes_`` and a ``best_estimator_``
        that can be indexed with the step names ``'vect'`` and ``'clf'``.
    lg : bool, default True
        When True the weights are read from the classifier's ``coef_``;
        when False they are read from ``feature_log_prob_``.

    Returns
    -------
    dict
        Maps each label to a list of ``(feature_name, weight)`` tuples in the
        vectorizer's feature order; features whose weight is zero are omitted.
    """
    pipeline = modelname.best_estimator_
    clf = pipeline['clf']
    vect = pipeline['vect']
    labels = modelname.classes_
    coefs = clf.coef_ if lg else clf.feature_log_prob_

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older vectorizers.
    if hasattr(vect, 'get_feature_names_out'):
        featurenames = vect.get_feature_names_out()
    else:
        featurenames = vect.get_feature_names()

    # A binary linear model stores a single coefficient row, which belongs to
    # the second class; the first class gets the negated weights. Without this
    # the per-label lookup below would raise IndexError for the second label.
    if lg and len(coefs) == 1 and len(labels) == 2:
        rows = [[-c for c in coefs[0]], coefs[0]]
    else:
        rows = coefs

    coef_dict = {}
    for i, l in enumerate(labels):
        coef_dict[l] = [(f, c) for c, f in zip(rows[i], featurenames) if c]
    return coef_dict

In [None]:
# NOTE(review): t_lg and t_nb are not defined in the visible part of this notebook;
# they must be fitted search objects left over from another session — confirm
# they are created before this cell so Restart & Run All works.
lg_coef_f = coef_features(t_lg)
# t_nb is presumably the naive Bayes search: its weights live in
# feature_log_prob_, which coef_features only reads when lg is False.
nb_coef_f = coef_features(t_nb, lg=False)

In [None]:
def top_10_feature(coef_dict):
    """Return the ten highest-weighted feature names per label as a DataFrame.

    Parameters
    ----------
    coef_dict : dict
        Maps each label to a list of ``(feature_name, weight)`` tuples.

    Returns
    -------
    pandas.DataFrame
        One column per label; rows are feature names ordered from the largest
        weight downwards. Labels with fewer than ten features are padded with
        NaN instead of making the constructor fail on ragged columns.
    """
    top_10 = {}
    for l, pairs in coef_dict.items():
        ranked = sorted(pairs, key=lambda x: x[1], reverse=True)[:10]
        # A Series (rather than a list) lets pandas align columns of unequal length.
        top_10[l] = pd.Series([name for name, _ in ranked], dtype=object)
    return pd.DataFrame(top_10)

In [None]:
# Side-by-side top features of both models. keys= adds an outer column level so the
# two halves can be told apart; without it both share the same class-name columns.
pd.concat(
    [top_10_feature(lg_coef_f), top_10_feature(nb_coef_f)],
    axis=1,
    keys=['lg', 'nb'],
)

    

## feature engineering for clustering

In [39]:
# Build the token <-> integer-id vocabulary from the filtered posts.
dictionary = corpora.Dictionary(documents=feature)

In [40]:
# Prune the vocabulary in place using document-frequency bounds and a size cap.
prune_options = {'no_below': 15, 'no_above': 0.5, 'keep_n': 100000}
dictionary.filter_extremes(**prune_options)

In [41]:
# Encode every post as a bag-of-words vector over the pruned vocabulary.
bow = list(map(dictionary.doc2bow, feature))


In [42]:
# Fit a tf-idf weighting on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(corpus=bow)


In [43]:
# Apply the fitted tf-idf model to the whole bag-of-words corpus.
corpus_tfidf = tfidf[bow]

In [44]:
# Fit an LDA model on the raw bag-of-words counts.
# 5 topics: one per distinct target label compared against further down.
# random_state pins the initialisation so topic ids stay comparable between runs
# (multi-worker scheduling may still cause small differences — TODO confirm).
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=2, random_state=42)

In [None]:
lda_model.print_topics() # notice many filler words

[(0,
  '0.013*"get" + 0.012*"name" + 0.010*"user" + 0.009*"like" + 0.009*"new" + 0.009*"system" + 0.008*"object" + 0.007*"data" + 0.007*"try" + 0.007*"way"'),
 (1,
  '0.023*"int" + 0.018*"public" + 0.016*"return" + 0.015*"type" + 0.013*"value" + 0.012*"std" + 0.010*"void" + 0.010*"new" + 0.009*"foo" + 0.009*"object"'),
 (2,
  '0.013*"var" + 0.012*"list" + 0.010*"text" + 0.009*"get" + 0.009*"like" + 0.009*"new" + 0.008*"http" + 0.008*"div" + 0.008*"return" + 0.008*"value"'),
 (3,
  '0.010*"test" + 0.008*"like" + 0.008*"time" + 0.008*"new" + 0.008*"would" + 0.008*"method" + 0.008*"one" + 0.007*"thread" + 0.007*"object" + 0.007*"get"'),
 (4,
  '0.019*"org" + 0.013*"version" + 0.012*"project" + 0.011*"run" + 0.010*"lib" + 0.010*"error" + 0.009*"build" + 0.008*"com" + 0.007*"apache" + 0.007*"include"')]

## tfidf LDA model

## filter out these words:
* make sure words are longer than 2 letters 

* class, file, use, code,string,function

In [None]:
# Fit a second LDA model, this time on the tf-idf weighted corpus.
# random_state pins the initialisation so topic ids stay comparable between runs
# (multi-worker scheduling may still cause small differences — TODO confirm).
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4, random_state=42)

In [None]:
# Show the top words of every topic learned from the tf-idf corpus.
lda_model_tfidf.print_topics()

In [None]:
# Hard-assign each post to its most probable topic under the bag-of-words model.
# max() replaces sort-then-take-first and also covers the single-topic case,
# so no separate branch is needed; ties still resolve to the first topic listed.
pred_bow = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in bow]

In [None]:
# Hard-assign each post to its most probable topic under the tf-idf model.
# The model was trained on corpus_tfidf, so it is queried with the same tf-idf
# vectors rather than the raw counts in bow, keeping training and inference
# on one representation. corpus_tfidf yields one vector per entry of bow, so
# the predictions stay aligned with pred_bow and label.
pred_tdif = [max(lda_model_tfidf[doc], key=lambda x: x[1])[0] for doc in corpus_tfidf]
        

In [None]:
# Build the comparison table column-wise so the topic ids keep an integer dtype
# (stacking three rows and transposing turns every column into object dtype).
# list(label) drops the original index so the rows line up by position.
result = pd.DataFrame({
    'bow': pred_bow,
    'tdif': pred_tdif,
    'true_label': list(label),
})

In [None]:
# Name the three columns: bag-of-words topic, tf-idf topic, and the true label.
result.columns = ['bow','tdif','true_label']

In [None]:
# List the distinct target labels (the call parentheses were missing, so the
# cell displayed the bound method instead of the values).
result['true_label'].unique()

In [None]:
# One boolean row mask per target label.
true_label = result['true_label']
mask1 = true_label.eq('c++')
mask2 = true_label.eq('javascript')
mask3 = true_label.eq('c#')
mask4 = true_label.eq('python')
mask5 = true_label.eq('java')

In [None]:
# Share of bag-of-words topic assignments among the rows selected by mask1.
result.loc[mask1, 'bow'].value_counts(normalize=True)

In [None]:
# Share of bag-of-words topic assignments among the rows selected by mask2.
result.loc[mask2, 'bow'].value_counts(normalize=True)

In [None]:
# Share of bag-of-words topic assignments among the rows selected by mask3.
result.loc[mask3, 'bow'].value_counts(normalize=True)

In [None]:
# Share of bag-of-words topic assignments among the rows selected by mask4.
result.loc[mask4, 'bow'].value_counts(normalize=True)

In [None]:
# Share of bag-of-words topic assignments among the rows selected by mask5.
result.loc[mask5, 'bow'].value_counts(normalize=True)

## tfidf

In [None]:
# Share of tf-idf topic assignments among the rows selected by mask1.
result.loc[mask1, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of tf-idf topic assignments among the rows selected by mask2.
result.loc[mask2, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of tf-idf topic assignments among the rows selected by mask3.
result.loc[mask3, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of tf-idf topic assignments among the rows selected by mask4.
result.loc[mask4, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of tf-idf topic assignments among the rows selected by mask5.
result.loc[mask5, 'tdif'].value_counts(normalize=True)

In [None]:
# Preview the first five rows of the comparison table.
result.head(n=5)