In [93]:
import pickle
import warnings

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim import corpora, models
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read the pickled posts table; the relative path is resolved from the
# notebook's working directory.
posts_df = pd.read_pickle(
    '../data/interum/text_target.pkl',
)

In [3]:
# Split the posts table into the model input ('tokens' column) and the
# value to predict ('target' column).
feature, label = posts_df['tokens'], posts_df['target']

In [95]:
# Extra corpus-specific stop words to drop on top of earlier preprocessing.
words = ['class', 'file', 'use', 'code',
         'string', 'function', 'new', 'like', 'way', 'would']

# A frozen set gives O(1) membership checks inside the per-token filter;
# `words` itself is left as a list in case other cells read it.
extra_stopwords = frozenset(words)

# Keep only tokens that are longer than 2 characters AND are not extra stop
# words (i.e. short tokens are removed, long ones are kept).
feature = feature.apply(
    lambda tokens: [tok for tok in tokens
                    if len(tok) > 2 and tok not in extra_stopwords])

In [96]:
def coef_features(modelname, lg = True):
    """Collect the non-zero per-class feature weights of a fitted search object.

    Parameters
    ----------
    modelname : object
        Fitted search object exposing ``classes_`` and a ``best_estimator_``
        that can be indexed with the step names ``'vect'`` and ``'clf'``.
    lg : bool, default True
        When true the weights are read from the classifier's ``coef_``
        attribute; otherwise from ``feature_log_prob_``.

    Returns
    -------
    dict
        Maps each class label to a list of ``(feature_name, weight)`` tuples
        in vocabulary order. Weights that are falsy (exactly zero) are left
        out.
    """
    pipeline = modelname.best_estimator_
    clf = pipeline['clf']
    weights = clf.coef_ if lg else clf.feature_log_prob_

    vect = pipeline['vect']
    # scikit-learn 1.2 removed ``get_feature_names``; prefer the replacement
    # and fall back to the old method for vectorizers that only have that.
    if hasattr(vect, 'get_feature_names_out'):
        featurenames = list(vect.get_feature_names_out())
    else:
        featurenames = vect.get_feature_names()

    coef_dict = {}
    for i, l in enumerate(modelname.classes_):
        # Row i of the weight matrix is paired with class label i.
        coef_dict[l] = [(f, c) for c, f in zip(weights[i], featurenames) if c]
    return coef_dict

In [None]:
# Load the two fitted search objects that the cells below pass to
# coef_features (file names suggest naive Bayes and logistic regression).
# Requires `import pickle` in the notebook's import cell.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file; only load pickles produced by a trusted source.
with open('gs_nb','rb') as f:
    t_nb = pickle.load(f)
with open('gs_lg','rb') as f:
    t_lg = pickle.load(f)

In [None]:
# Logistic regression exposes its weights through `coef_` (lg=True default).
lg_coef_f = coef_features(t_lg)
# The naive Bayes weights live in `feature_log_prob_`, which coef_features
# only reads when lg=False; the default would look for `coef_` instead.
nb_coef_f = coef_features(t_nb, lg=False)

In [None]:
def top_10_feature(coef_dict, n=10):
    """Tabulate the highest-weighted feature names for each label.

    Parameters
    ----------
    coef_dict : dict
        Maps a label to a list of ``(feature_name, weight)`` pairs.
    n : int, default 10
        Maximum number of feature names kept per label.

    Returns
    -------
    pandas.DataFrame
        One column per label; rows hold feature names ordered by descending
        weight. Labels with fewer entries than the longest column are padded
        with ``None`` so that the frame can always be built.
    """
    top_names = {}
    for label, pairs in coef_dict.items():
        ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]
        top_names[label] = [pair[0] for pair in ranked]

    # pd.DataFrame raises on columns of unequal length, which happens when a
    # label has fewer than `n` entries; pad the short columns instead.
    depth = max(map(len, top_names.values()), default=0)
    for names in top_names.values():
        names.extend([None] * (depth - len(names)))
    return pd.DataFrame(top_names)

In [None]:
# Both tables are keyed by class label, so a plain side-by-side concat would
# yield duplicate column names; tag each half with the model it came from.
pd.concat(
    [top_10_feature(lg_coef_f), top_10_feature(nb_coef_f)],
    axis=1,
    keys=['logistic regression', 'naive bayes'],
)

    

## feature engineering for clustering

In [97]:
# Build the token -> id vocabulary from the filtered token lists.
dictionary = corpora.Dictionary(feature)

In [98]:
# Prune the vocabulary in place by document frequency.
dictionary.filter_extremes(
    no_below=15,     # minimum-document-count threshold
    no_above=0.5,    # maximum-document-fraction threshold
    keep_n=100000,   # cap on vocabulary size after the two filters
)

In [99]:
# Encode every document as a bag-of-words vector using the pruned vocabulary.
bow = list(map(dictionary.doc2bow, feature))


In [100]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(corpus=bow)


In [101]:
# Apply the fitted TF-IDF model to the whole bag-of-words corpus; this is the
# corpus the TF-IDF LDA model below is trained on.
corpus_tfidf = tfidf[bow]

In [102]:
# Fit an LDA topic model on the raw bag-of-words counts.
# We know the data has 5 classes, so ask for 5 topics.
# random_state pins the initialisation so topic ids stay comparable between
# runs (multi-worker training may still introduce small variations).
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=2,
    random_state=42)

In [103]:
lda_model.print_topics() # notice many filler words

[(0,
  '0.013*"value" + 0.012*"int" + 0.011*"array" + 0.010*"return" + 0.009*"time" + 0.008*"list" + 0.008*"android" + 0.007*"image" + 0.007*"want" + 0.007*"date"'),
 (1,
  '0.021*"public" + 0.016*"int" + 0.016*"method" + 0.013*"return" + 0.012*"void" + 0.010*"static" + 0.010*"type" + 0.009*"object" + 0.008*"system" + 0.008*"call"'),
 (2,
  '0.011*"http" + 0.011*"script" + 0.010*"error" + 0.009*"system" + 0.009*"lib" + 0.008*"html" + 0.008*"page" + 0.008*"org" + 0.008*"try" + 0.007*"work"'),
 (3,
  '0.007*"std" + 0.007*"work" + 0.007*"run" + 0.006*"find" + 0.006*"need" + 0.006*"one" + 0.006*"know" + 0.006*"version" + 0.006*"library" + 0.006*"project"'),
 (4,
  '0.022*"name" + 0.017*"object" + 0.011*"list" + 0.011*"return" + 0.011*"var" + 0.011*"user" + 0.010*"value" + 0.010*"public" + 0.009*"property" + 0.009*"test"')]

## tfidf LDA model

## filter out these words:
* make sure words are longer than 2 letters 

* class, file, use, code, string, function, new, like, way, would

In [104]:
# Fit a second 5-topic LDA model, this time on the TF-IDF-weighted corpus.
# random_state pins the initialisation so topic ids stay comparable between
# runs (multi-worker training may still introduce small variations).
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4,
    random_state=42)

In [None]:
# Show the top-weighted words of each topic of the TF-IDF LDA model.
lda_model_tfidf.print_topics()

[(0,
  '0.005*"text" + 0.004*"foo" + 0.004*"form" + 0.004*"public" + 0.004*"event" + 0.004*"button" + 0.004*"window" + 0.004*"click" + 0.004*"type" + 0.004*"int"'),
 (1,
  '0.006*"list" + 0.005*"test" + 0.005*"project" + 0.004*"android" + 0.004*"application" + 0.004*"run" + 0.004*"eclipse" + 0.004*"public" + 0.003*"want" + 0.003*"method"'),
 (2,
  '0.005*"div" + 0.005*"array" + 0.005*"std" + 0.004*"element" + 0.004*"object" + 0.004*"data" + 0.004*"var" + 0.004*"json" + 0.004*"jquery" + 0.003*"list"'),
 (3,
  '0.005*"value" + 0.005*"number" + 0.004*"name" + 0.004*"array" + 0.004*"line" + 0.004*"want" + 0.004*"character" + 0.003*"int" + 0.003*"model" + 0.003*"type"'),
 (4,
  '0.006*"date" + 0.006*"int" + 0.004*"object" + 0.004*"method" + 0.004*"thread" + 0.004*"public" + 0.004*"time" + 0.004*"datetime" + 0.004*"return" + 0.003*"system"')]

In [None]:
# For every document, keep the id of the topic the bag-of-words LDA model
# scores highest; each inference result is a list of (topic_id, score) pairs.
pred_bow = [
    max(lda_model[doc], key=lambda topic_score: topic_score[1])[0]
    for doc in bow
]

In [None]:
# For every document, keep the id of the topic the TF-IDF LDA model scores
# highest. The model was trained on corpus_tfidf, so it is queried with the
# TF-IDF-weighted documents rather than the raw counts in `bow`;
# corpus_tfidf is derived from `bow`, so document order still lines up with
# `label`.
pred_tdif = [
    max(lda_model_tfidf[doc], key=lambda topic_score: topic_score[1])[0]
    for doc in corpus_tfidf
]
        

In [None]:
# Assemble predictions and true labels column-wise so each column keeps its
# own dtype (building row-wise and transposing turns everything into object).
# `label.values` is used so rows are matched by position, not by index.
result = pd.DataFrame({
    'bow': pred_bow,
    'tdif': pred_tdif,
    'true_label': label.values,
})

In [None]:
# Name the three columns: topic predicted by the bag-of-words model, topic
# predicted by the TF-IDF model, and the true label.
result.columns = ['bow','tdif','true_label']

In [None]:
# List the distinct true labels; `unique` must be called, otherwise the cell
# only shows the bound method.
result['true_label'].unique()

In [None]:
# One boolean row mask per language, in the order c++, javascript, c#,
# python, java.
mask1, mask2, mask3, mask4, mask5 = (
    result['true_label'].eq(language)
    for language in ('c++', 'javascript', 'c#', 'python', 'java')
)

## bow:

* c++: topic 
* javascript: topic 2 

In [None]:
# Share of each bag-of-words topic among the c++ posts.
result.loc[mask1, 'bow'].value_counts(normalize=True)

In [None]:
# Share of each bag-of-words topic among the javascript posts.
result.loc[mask2, 'bow'].value_counts(normalize=True)

In [None]:
# Share of each bag-of-words topic among the c# posts.
result.loc[mask3, 'bow'].value_counts(normalize=True)

In [None]:
# Share of each bag-of-words topic among the python posts.
result.loc[mask4, 'bow'].value_counts(normalize=True)

In [None]:
# Share of each bag-of-words topic among the java posts.
result.loc[mask5, 'bow'].value_counts(normalize=True)

## tfidf

In [None]:
# Share of each TF-IDF-model topic among the c++ posts.
result.loc[mask1, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of each TF-IDF-model topic among the javascript posts.
result.loc[mask2, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of each TF-IDF-model topic among the c# posts.
result.loc[mask3, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of each TF-IDF-model topic among the python posts.
result.loc[mask4, 'tdif'].value_counts(normalize=True)

In [None]:
# Share of each TF-IDF-model topic among the java posts.
result.loc[mask5, 'tdif'].value_counts(normalize=True)

In [None]:
# Preview the first five rows of the prediction table.
result.head(n=5)