In [36]:
import pandas as pd
import gensim
import numpy as np
import warnings
from gensim import corpora, models
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [4]:
# convert into features and target 
feature = posts_df['tokens']
label = posts_df['target']  # label 

In [6]:
def coef_features(modelname, lg = True):
    labels = modelname.classes_  # label 
    if lg:
        coefs = modelname.best_estimator_['clf'].coef_
    else:
        coefs = modelname.best_estimator_['clf'].feature_log_prob_
    featurenames = modelname.best_estimator_['vect'].get_feature_names()
    coef_dict={}
    for i, l in enumerate(labels):
        coef_dict[l]=[]
        for c, f in zip(coefs[i],featurenames):
            if c:
                coef_dict[l].append((f,c))
    return coef_dict
    
    
    

In [7]:
lg_coef_f = coef_features(t_lg)
nb_coef_f = coef_features(t_nb)

In [8]:
def top_10_feature(coef_dict):
    top_10 = {}
    for l in coef_dict.keys():
        top_10[l] = sorted(coef_dict[l], key = lambda x: x[1], reverse = True)[:10]
        top_10[l] = [x[0] for x in top_10[l]]
    return pd.DataFrame(top_10)

In [9]:
pd.concat([top_10_feature(lg_coef_f),top_10_feature(nb_coef_f)], axis = 1)

    

Unnamed: 0,c#,c++,java,javascript,python,c#.1,c++.1,java.1,javascript.1,python.1
0,writeline,cout,jvm,jquery,django,string,std,string,function,list
1,net,std,jdk,backbone,py,use,int,class,jquery,file
2,msdn,boost,println,console log,numpy,public,use,use,var,use
3,ienumerable,qt,spring,prototype,def,new,const,method,use,print
4,script jquery,stl,jsp,angularjs,pythonic,class,function,public,div,py
5,streamreader,cpp,system println,jsfiddle,matplotlib,get,class,file,page,like
6,linq,int main,hibernate,angular,pep,net,code,new,html,way
7,window form,header file,jar,alert,urllib,method,vector,get,like,self
8,winforms,gcc,util,browser,beautifulsoup,code,include,code,script,import
9,entity framework,pointer,jdbc,ecmascript,typeerror,list,compiler,like,element,get


## feature engineering for clustering

In [7]:
# dictionary for train 
dictionary = gensim.corpora.Dictionary(feature)

In [8]:
# remove extreme high or low counts 
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [9]:
# create bag of words 
bow = [dictionary.doc2bow(doc) for doc in feature]


In [10]:
# tfidf for bow 
tfidf = models.TfidfModel(bow)


In [11]:
corpus_tfidf = tfidf[bow]

In [12]:
# generate a model for bag of words 
# we know we have 5 tops 
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [13]:
lda_model.print_topics() # notice many filer words 

[(0,
  '0.032*"string" + 0.029*"public" + 0.022*"class" + 0.017*"new" + 0.015*"object" + 0.014*"method" + 0.014*"return" + 0.013*"system" + 0.012*"get" + 0.012*"use"'),
 (1,
  '0.016*"id" + 0.014*"name" + 0.012*"new" + 0.011*"file" + 0.010*"data" + 0.010*"value" + 0.009*"function" + 0.009*"use" + 0.009*"text" + 0.008*"var"'),
 (2,
  '0.022*"use" + 0.012*"function" + 0.010*"like" + 0.009*"code" + 0.009*"would" + 0.008*"way" + 0.007*"one" + 0.006*"work" + 0.006*"want" + 0.006*"class"'),
 (3,
  '0.016*"file" + 0.013*"use" + 0.011*"org" + 0.011*"http" + 0.009*"get" + 0.008*"com" + 0.008*"run" + 0.008*"error" + 0.008*"application" + 0.008*"version"'),
 (4,
  '0.025*"x" + 0.023*"c" + 0.013*"b" + 0.012*"std" + 0.011*"int" + 0.010*"use" + 0.008*"f" + 0.008*"foo" + 0.007*"list" + 0.007*"value"')]

## tfidf LDA model

In [14]:
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

In [15]:
lda_model_tfidf.print_topics()

[(0,
  '0.006*"class" + 0.006*"x" + 0.005*"c" + 0.005*"int" + 0.004*"method" + 0.004*"value" + 0.004*"object" + 0.004*"function" + 0.003*"use" + 0.003*"type"'),
 (1,
  '0.005*"string" + 0.004*"function" + 0.004*"file" + 0.004*"value" + 0.004*"script" + 0.004*"int" + 0.003*"page" + 0.003*"list" + 0.003*"array" + 0.003*"html"'),
 (2,
  '0.005*"function" + 0.004*"c" + 0.004*"string" + 0.003*"use" + 0.003*"event" + 0.003*"b" + 0.003*"like" + 0.003*"object" + 0.003*"list" + 0.003*"div"'),
 (3,
  '0.005*"file" + 0.004*"project" + 0.004*"application" + 0.004*"use" + 0.004*"test" + 0.003*"c" + 0.003*"run" + 0.003*"class" + 0.003*"get" + 0.003*"error"'),
 (4,
  '0.009*"string" + 0.007*"list" + 0.007*"date" + 0.006*"foo" + 0.005*"class" + 0.004*"object" + 0.004*"value" + 0.004*"array" + 0.004*"public" + 0.004*"bar"')]

In [17]:
pred_bow = []
for i in bow:
    result = lda_model[i]
    if len(result) > 1:
        pred_bow.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_bow.append(result[0][0])    

In [18]:
pred_tdif = []
for i in bow:
    result = lda_model_tfidf[i]
    if len(result) > 1:
        pred_tdif.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_tdif.append(result[0][0])
        

In [22]:
result = pd.DataFrame([pred_bow,pred_tdif,label]).T

In [24]:
result.columns = ['bow','tdif','true_label']

In [66]:
result

Unnamed: 0,bow,tdif,true_label
0,2,2,javascript
1,4,0,c++
2,2,1,javascript
3,4,1,python
4,0,4,javascript
5,2,0,javascript
6,2,1,javascript
7,2,3,javascript
8,0,4,java
9,2,3,c++


In [59]:
result.true_label.unique

<bound method Series.unique of 0        javascript
1               c++
2        javascript
3            python
4        javascript
            ...    
30297            c#
30298          java
30299        python
30300          java
30301    javascript
Name: true_label, Length: 30302, dtype: object>

In [60]:
mask1 = result.true_label =='c++'
mask2 = result.true_label == 'javascript'
mask3 = result.true_label =='c#'
mask4= result.true_label == 'python'
mask5 = result.true_label =='java'

In [61]:
result[mask1]['bow'].value_counts(normalize = True)

2    0.453443
4    0.429972
0    0.053908
3    0.052876
1    0.009801
Name: bow, dtype: float64

In [62]:
result[mask2]['bow'].value_counts(normalize = True)

2    0.536578
1    0.264833
3    0.096198
0    0.061348
4    0.041043
Name: bow, dtype: float64

In [63]:
result[mask3]['bow'].value_counts(normalize = True)

0    0.355425
2    0.312482
3    0.161752
1    0.116662
4    0.053679
Name: bow, dtype: float64

In [64]:
result[mask4]['bow'].value_counts(normalize = True)

2    0.307065
4    0.284975
3    0.238607
1    0.087960
0    0.081393
Name: bow, dtype: float64

In [65]:
result[mask5]['bow'].value_counts(normalize = True)

2    0.332262
0    0.280321
3    0.260107
1    0.082597
4    0.044712
Name: bow, dtype: float64