In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import numpy as np
import warnings
from gensim import corpora, models
from sklearn.metrics import confusion_matrix, accuracy_score
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [3]:
# convert into features and target 
feature = posts_df['tokens']
label = posts_df['target']  # label 

In [4]:
# more words to remove and remove words longer than length 2
words = ['class', 'file', 'use', 'code',
         'string', 'function', 'new', 'like', 
         'way', 'would', 'name', 'get', 'list',
         'want', 'value','return','data']
feature = feature.apply(
    lambda x: [w for w in x if len(w) > 2 and w not in words])

## feature engineering for clustering

In [5]:
# dictionary for train 
dictionary = gensim.corpora.Dictionary(feature)

In [6]:
# create bag of words 
bow = [dictionary.doc2bow(doc) for doc in feature]


In [7]:
# tfidf for bow 
tfidf = models.TfidfModel(bow)


In [8]:
corpus_tfidf = tfidf[bow]

In [15]:
# generate a model for bag of words 
# we know we have 5 tops 
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=4,random_state=42)

In [16]:
lda_model.print_topics() # notice many filer words 

[(0,
  '0.008*"android" + 0.007*"div" + 0.006*"try" + 0.006*"error" + 0.006*"com" + 0.005*"run" + 0.005*"work" + 0.005*"line" + 0.005*"find" + 0.004*"text"'),
 (1,
  '0.010*"http" + 0.010*"org" + 0.009*"type" + 0.007*"user" + 0.007*"com" + 0.006*"script" + 0.006*"object" + 0.006*"page" + 0.006*"method" + 0.005*"web"'),
 (2,
  '0.014*"int" + 0.012*"test" + 0.010*"std" + 0.007*"error" + 0.006*"include" + 0.006*"method" + 0.006*"item" + 0.006*"work" + 0.005*"try" + 0.005*"public"'),
 (3,
  '0.009*"int" + 0.007*"one" + 0.007*"time" + 0.007*"type" + 0.006*"method" + 0.006*"object" + 0.006*"array" + 0.006*"know" + 0.005*"work" + 0.005*"null"'),
 (4,
  '0.013*"public" + 0.009*"object" + 0.007*"system" + 0.006*"foo" + 0.006*"need" + 0.006*"work" + 0.006*"void" + 0.005*"method" + 0.005*"example" + 0.005*"call"')]

## tfidf LDA model

In [17]:
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4,random_state=42)

In [18]:
lda_model_tfidf.print_topics()

[(0,
  '0.002*"android" + 0.002*"div" + 0.002*"http" + 0.002*"error" + 0.002*"com" + 0.002*"run" + 0.002*"try" + 0.002*"application" + 0.002*"path" + 0.002*"import"'),
 (1,
  '0.002*"jquery" + 0.002*"form" + 0.002*"page" + 0.002*"script" + 0.002*"button" + 0.002*"input" + 0.002*"html" + 0.002*"element" + 0.002*"click" + 0.002*"text"'),
 (2,
  '0.002*"int" + 0.002*"item" + 0.002*"std" + 0.002*"vector" + 0.002*"test" + 0.001*"console" + 0.001*"var" + 0.001*"method" + 0.001*"round" + 0.001*"error"'),
 (3,
  '0.003*"array" + 0.003*"date" + 0.003*"int" + 0.003*"object" + 0.002*"number" + 0.002*"element" + 0.002*"type" + 0.002*"method" + 0.002*"model" + 0.002*"event"'),
 (4,
  '0.003*"foo" + 0.003*"int" + 0.003*"object" + 0.003*"method" + 0.003*"public" + 0.002*"type" + 0.002*"test" + 0.002*"one" + 0.002*"know" + 0.002*"static"')]

In [13]:
pred_bow = []
for i in bow:
    result = lda_model[i]
    if len(result) > 1:
        pred_bow.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_bow.append(result[0][0])    

In [14]:
pred_tdif = []
for i in bow:
    result = lda_model_tfidf[i]
    if len(result) > 1:
        pred_tdif.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_tdif.append(result[0][0])
        

In [15]:
result = pd.DataFrame([pred_bow,pred_tdif,label]).T

In [16]:
result.columns = ['bow','tdif','true_label']

In [17]:
result.true_label.unique

<bound method Series.unique of 0        javascript
1               c++
2        javascript
3            python
4        javascript
            ...    
30297            c#
30298          java
30299        python
30300          java
30301    javascript
Name: true_label, Length: 30302, dtype: object>

In [18]:
mask1 = result.true_label =='c++'
mask2 = result.true_label == 'javascript'
mask3 = result.true_label =='c#'
mask4= result.true_label == 'python'
mask5 = result.true_label =='java'

## bow:

* c++: topic 3
* javascript: topic 1
* c#: topic 4
* python:topic 2
* java: topic 0

In [19]:
result[mask1]['bow'].value_counts(normalize = True)

3    0.585762
4    0.162755
2    0.146247
0    0.091824
1    0.013412
Name: bow, dtype: float64

In [20]:
result[mask2]['bow'].value_counts(normalize = True)

1    0.387529
4    0.340870
3    0.172667
0    0.054868
2    0.044067
Name: bow, dtype: float64

In [21]:
result[mask3]['bow'].value_counts(normalize = True)

3    0.340968
4    0.259376
1    0.172774
2    0.139708
0    0.087174
Name: bow, dtype: float64

In [22]:
result[mask4]['bow'].value_counts(normalize = True)

3    0.343682
0    0.231642
1    0.166965
4    0.149055
2    0.108657
Name: bow, dtype: float64

In [23]:
result[mask5]['bow'].value_counts(normalize = True)

3    0.402410
0    0.211780
4    0.177510
1    0.144043
2    0.064257
Name: bow, dtype: float64

## tdif:

* c++: topic 4
* javascript: topic 0 
* c#: topic 2
* python:topic 1
* java: topic 3

In [24]:
result[mask1]['tdif'].value_counts(normalize = True)

4    0.702347
0    0.137477
2    0.116069
1    0.039721
3    0.004385
Name: tdif, dtype: float64

In [25]:
result[mask2]['tdif'].value_counts(normalize = True)

0    0.605703
4    0.265985
1    0.105415
2    0.019729
3    0.003168
Name: tdif, dtype: float64

In [26]:
result[mask3]['tdif'].value_counts(normalize = True)

4    0.555397
0    0.314200
2    0.070713
1    0.053822
3    0.005869
Name: tdif, dtype: float64

In [27]:
result[mask4]['tdif'].value_counts(normalize = True)

4    0.540896
0    0.319403
2    0.096318
1    0.040000
3    0.003383
Name: tdif, dtype: float64

In [28]:
result[mask5]['tdif'].value_counts(normalize = True)

4    0.532262
0    0.277108
2    0.137483
1    0.050067
3    0.003079
Name: tdif, dtype: float64

c++: topic 4
* javascript: topic 0 
* c#: topic 2
* python:topic 1
* java: topic 3

In [30]:
bow_map = {0:'java',3:'c++',1:'javascript',4:'c#',2:'python'}
tdif_map = {0:'javascript',4:'c++',2:'c#',1:'python',3:'java'}

result.bow = result.bow.map(bow_map)
result.tdif = result.tdif.map(tdif_map)

In [31]:
result

Unnamed: 0,bow,tdif,true_label
0,c++,c++,javascript
1,c++,c#,c++
2,javascript,javascript,javascript
3,c++,c++,python
4,c++,c++,javascript
5,c++,c++,javascript
6,c#,javascript,javascript
7,c++,javascript,javascript
8,c++,c++,java
9,c++,c++,c++


# confusion matrix bow

In [32]:
confusion_matrix(result.true_label,result.bow)

array([[1812, 2382,  609, 1207,  976],
       [ 631, 2271,  356,   52,  567],
       [1326, 3006, 1582, 1076,  480],
       [2367, 1199,  381, 2691,  306],
       [ 749, 1727, 1164,  839,  546]])

In [33]:
accuracy_score(result.true_label,result.bow)

0.293775988383605

In [34]:
confusion_matrix(result.true_label, result.tdif)

array([[ 494, 3880,   41, 2195,  376],
       [ 450, 2723,   17,  533,  154],
       [1027, 3976,   23, 2070,  374],
       [ 137, 1847,   22, 4206,  732],
       [ 484, 2718,   17, 1605,  201]])

In [35]:
accuracy_score(result.true_label,result.tdif)

0.2523595802257277