In [130]:
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import numpy as np
import warnings
from gensim import corpora, models
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
%matplotlib inline

In [131]:
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [132]:
# convert into features and target 
feature = posts_df['tokens']
label = posts_df['target']  # label 

In [158]:
# more words to remove and remove words longer than length 2

words = ['class', 'file', 'use', 'code',
         'string', 'function', 'new', 'like', 
         'way', 'would', 'name', 'get', 'list',
         'want', 'value','return','data']
feature = feature.apply(
    lambda x: [w for w in x if len(w) > 2 and w not in words])

In [159]:
def coef_features(modelname, lg = True):
    labels = modelname.classes_  # label 
    if lg:
        coefs = modelname.best_estimator_['clf'].coef_
    else:
        coefs = modelname.best_estimator_['clf'].feature_log_prob_
    featurenames = modelname.best_estimator_['vect'].get_feature_names()
    coef_dict={}
    for i, l in enumerate(labels):
        coef_dict[l]=[]
        for c, f in zip(coefs[i],featurenames):
            if c:
                coef_dict[l].append((f,c))
    return coef_dict   

In [160]:
import pickle
with open('gs_nb','rb') as f:
    t_nb = pickle.load(f)
with open('gs_lg','rb') as f:
    t_lg = pickle.load(f)

In [125]:
lg_coef_f = coef_features(t_lg)
nb_coef_f = coef_features(t_nb)

In [126]:
def top_10_feature(coef_dict):
    top_10 = {}
    for l in coef_dict.keys():
        top_10[l] = sorted(coef_dict[l], key = lambda x: x[1], reverse = True)[:10]
        top_10[l] = [x[0] for x in top_10[l]]
    return pd.DataFrame(top_10)

In [127]:
pd.concat([top_10_feature(lg_coef_f),top_10_feature(nb_coef_f)], axis = 1)

    

Unnamed: 0,c#,c++,java,javascript,python,c#.1,c++.1,java.1,javascript.1,python.1
0,writeline,cout,jvm,jquery,django,string,std,string,function,list
1,net,std,jdk,backbone,py,use,int,class,jquery,file
2,msdn,boost,println,console log,numpy,public,use,use,var,use
3,ienumerable,qt,spring,prototype,def,new,const,method,use,print
4,script jquery,stl,jsp,angularjs,pythonic,class,function,public,div,py
5,streamreader,cpp,system println,jsfiddle,matplotlib,get,class,file,page,like
6,linq,int main,hibernate,angular,pep,net,code,new,html,way
7,window form,header file,jar,alert,urllib,method,vector,get,like,self
8,winforms,gcc,util,browser,beautifulsoup,code,include,code,script,import
9,entity framework,pointer,jdbc,ecmascript,typeerror,list,compiler,like,element,get


## feature engineering for clustering

In [161]:
# dictionary for train 
dictionary = gensim.corpora.Dictionary(feature)

In [162]:
# remove extreme high or low counts 
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [163]:
# create bag of words 
bow = [dictionary.doc2bow(doc) for doc in feature]


In [164]:
# tfidf for bow 
tfidf = models.TfidfModel(bow)


In [165]:
corpus_tfidf = tfidf[bow]

In [166]:
# generate a model for bag of words 
# we know we have 5 tops 
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [167]:
lda_model.print_topics() # notice many filer words 

[(0,
  '0.013*"system" + 0.011*"application" + 0.010*"project" + 0.010*"run" + 0.008*"work" + 0.008*"version" + 0.007*"try" + 0.007*"library" + 0.007*"find" + 0.006*"need"'),
 (1,
  '0.014*"int" + 0.010*"std" + 0.008*"one" + 0.008*"type" + 0.007*"question" + 0.006*"example" + 0.006*"time" + 0.006*"know" + 0.006*"make" + 0.005*"android"'),
 (2,
  '0.016*"http" + 0.015*"org" + 0.013*"com" + 0.011*"div" + 0.010*"text" + 0.009*"html" + 0.009*"lib" + 0.009*"script" + 0.008*"test" + 0.008*"user"'),
 (3,
  '0.036*"public" + 0.018*"object" + 0.015*"void" + 0.014*"int" + 0.014*"method" + 0.012*"set" + 0.011*"property" + 0.010*"private" + 0.010*"static" + 0.009*"call"'),
 (4,
  '0.011*"try" + 0.010*"var" + 0.009*"array" + 0.009*"object" + 0.008*"work" + 0.008*"line" + 0.008*"key" + 0.008*"method" + 0.007*"date" + 0.007*"error"')]

## tfidf LDA model

## filter out these words:
* make sure words are longer than 2 letters 

* class, file, use, code,string,function

In [168]:
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

In [169]:
lda_model_tfidf.print_topics()

[(0,
  '0.007*"div" + 0.005*"text" + 0.005*"button" + 0.005*"jquery" + 0.005*"click" + 0.005*"html" + 0.004*"form" + 0.004*"var" + 0.004*"page" + 0.004*"element"'),
 (1,
  '0.009*"int" + 0.006*"std" + 0.006*"thread" + 0.004*"method" + 0.004*"key" + 0.004*"type" + 0.004*"object" + 0.004*"public" + 0.004*"char" + 0.003*"true"'),
 (2,
  '0.007*"object" + 0.007*"array" + 0.006*"public" + 0.006*"date" + 0.005*"int" + 0.005*"method" + 0.004*"type" + 0.004*"foo" + 0.003*"one" + 0.003*"static"'),
 (3,
  '0.004*"event" + 0.004*"character" + 0.004*"time" + 0.004*"line" + 0.003*"map" + 0.003*"var" + 0.003*"json" + 0.003*"replace" + 0.003*"need" + 0.003*"number"'),
 (4,
  '0.004*"project" + 0.004*"application" + 0.004*"run" + 0.004*"http" + 0.004*"test" + 0.003*"error" + 0.003*"system" + 0.003*"library" + 0.003*"web" + 0.003*"server"')]

In [170]:
pred_bow = []
for i in bow:
    result = lda_model[i]
    if len(result) > 1:
        pred_bow.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_bow.append(result[0][0])    

In [171]:
pred_tdif = []
for i in bow:
    result = lda_model_tfidf[i]
    if len(result) > 1:
        pred_tdif.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_tdif.append(result[0][0])
        

In [194]:
result = pd.DataFrame([pred_bow,pred_tdif,label]).T

In [195]:
result.columns = ['bow','tdif','true_label']

In [196]:
result.true_label.unique

<bound method Series.unique of 0        javascript
1               c++
2        javascript
3            python
4        javascript
            ...    
30297            c#
30298          java
30299        python
30300          java
30301    javascript
Name: true_label, Length: 30302, dtype: object>

In [174]:
mask1 = result.true_label =='c++'
mask2 = result.true_label == 'javascript'
mask3 = result.true_label =='c#'
mask4= result.true_label == 'python'
mask5 = result.true_label =='java'

## bow:

* c++: topic 1
* javascript: topic 2 
* c#: topic 3
* python:topic 4 
* java: topic 0 

In [175]:
result[mask1]['bow'].value_counts(normalize = True)

1    0.741553
0    0.140831
3    0.068868
4    0.040753
2    0.007996
Name: bow, dtype: float64

In [176]:
result[mask2]['bow'].value_counts(normalize = True)

2    0.331509
4    0.259793
0    0.191820
1    0.156394
3    0.060484
Name: bow, dtype: float64

In [177]:
result[mask3]['bow'].value_counts(normalize = True)

3    0.287861
0    0.258088
1    0.201403
4    0.200973
2    0.051675
Name: bow, dtype: float64

In [178]:
result[mask4]['bow'].value_counts(normalize = True)

4    0.406965
1    0.232040
0    0.201990
2    0.101493
3    0.057512
Name: bow, dtype: float64

In [179]:
result[mask5]['bow'].value_counts(normalize = True)

0    0.277510
1    0.263989
4    0.200134
3    0.176841
2    0.081526
Name: bow, dtype: float64

## tdif

## tdif:

* c++: topic 1
* javascript: topic 0 
* c#: topic 2
* python:topic 3
* java: topic 4

In [180]:
result[mask1]['tdif'].value_counts(normalize = True)

1    0.590405
2    0.204798
4    0.155275
3    0.028372
0    0.021150
Name: tdif, dtype: float64

In [181]:
result[mask2]['tdif'].value_counts(normalize = True)

0    0.506768
4    0.168491
2    0.144729
3    0.105415
1    0.074597
Name: tdif, dtype: float64

In [182]:
result[mask3]['tdif'].value_counts(normalize = True)

2    0.310335
4    0.254223
1    0.236187
0    0.140281
3    0.058975
Name: tdif, dtype: float64

In [183]:
result[mask4]['tdif'].value_counts(normalize = True)

4    0.300100
2    0.234826
1    0.229851
3    0.124975
0    0.110249
Name: tdif, dtype: float64

In [184]:
result[mask5]['tdif'].value_counts(normalize = True)

4    0.376975
2    0.281258
1    0.196787
0    0.076975
3    0.068005
Name: tdif, dtype: float64

In [197]:
bow_map = {0:'java',1:'c++',2:'javascript',3:'c#',4:'python'}
tdif_map = {0:'javascript',1:'c++',2:'c#',3:'python',4:'java'}

result.bow = result.bow.map(bow_map)
result.tdif = result.tdif.map(tdif_map)

In [198]:
result

Unnamed: 0,bow,tdif,true_label
0,c++,c#,javascript
1,c++,c++,c++
2,javascript,javascript,javascript
3,python,c#,python
4,python,c#,javascript
5,python,javascript,javascript
6,javascript,javascript,javascript
7,java,java,javascript
8,python,c#,java
9,c++,c#,c++


# confusion matrix bow

In [199]:
confusion_matrix(result.bow,result.true_label)

array([[2011,  267, 1321,  420,  289],
       [1407, 2875, 1972, 1086, 1166],
       [1803,  546, 2073, 1332, 1015],
       [ 361,   31,  609, 2302,  510],
       [1404,  158, 1495, 1804, 2045]])

In [200]:
confusion_matrix(result.tdif,result.true_label)

array([[2168,  794, 2101, 1005, 1180],
       [1650, 2289, 1470,  518, 1155],
       [1776,  602, 2816, 1170, 1508],
       [ 980,   82,  575, 3519,  554],
       [ 412,  110,  508,  732,  628]])