In [1]:
import pandas as pd
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,adjusted_rand_score,calinski_harabaz_score
import numpy as np
import warnings
from sklearn.cluster import KMeans, AgglomerativeClustering
import pickle
warnings.filterwarnings('ignore')

from gensim import corpora, models

In [2]:
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [3]:
# convert into features and target 
feature = posts_df['tokens']
label = posts_df['target']

In [4]:
# load in best tuned models

with open('gs_lg','rb') as f:
    t_lg = pickle.load(f)

In [5]:
# load in best tuned models

with open('gs_nb','rb') as f:
    t_nb = pickle.load(f)

In [6]:
def coef_features(modelname, lg = True):
    labels = modelname.classes_  # label 
    if lg:
        coefs = modelname.best_estimator_['clf'].coef_
    else:
        coefs = modelname.best_estimator_['clf'].feature_log_prob_
    featurenames = modelname.best_estimator_['vect'].get_feature_names()
    coef_dict={}
    for i, l in enumerate(labels):
        coef_dict[l]=[]
        for c, f in zip(coefs[i],featurenames):
            if c:
                coef_dict[l].append((f,c))
    return coef_dict
    
    
    

In [7]:
lg_coef_f = coef_features(t_lg)
nb_coef_f = coef_features(t_nb)

In [8]:
def top_10_feature(coef_dict):
    top_10 = {}
    for l in coef_dict.keys():
        top_10[l] = sorted(coef_dict[l], key = lambda x: x[1], reverse = True)[:10]
        top_10[l] = [x[0] for x in top_10[l]]
    return pd.DataFrame(top_10)

In [9]:
pd.concat([top_10_feature(lg_coef_f),top_10_feature(nb_coef_f)], axis = 1)

    

Unnamed: 0,c#,c++,java,javascript,python,c#.1,c++.1,java.1,javascript.1,python.1
0,writeline,cout,jvm,jquery,django,string,std,string,function,list
1,net,std,jdk,backbone,py,use,int,class,jquery,file
2,msdn,boost,println,console log,numpy,public,use,use,var,use
3,ienumerable,qt,spring,prototype,def,new,const,method,use,print
4,script jquery,stl,jsp,angularjs,pythonic,class,function,public,div,py
5,streamreader,cpp,system println,jsfiddle,matplotlib,get,class,file,page,like
6,linq,int main,hibernate,angular,pep,net,code,new,html,way
7,window form,header file,jar,alert,urllib,method,vector,get,like,self
8,winforms,gcc,util,browser,beautifulsoup,code,include,code,script,import
9,entity framework,pointer,jdbc,ecmascript,typeerror,list,compiler,like,element,get


## feature engineering for clustering

In [29]:
# dictionary for train 
dictionary = gensim.corpora.Dictionary(train_X)

In [36]:
# remove extreme high or low counts 
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [37]:
# create bag of words 
bow = [dictionary.doc2bow(doc) for doc in feature]


In [38]:
# tfidf for bow 
tfidf = models.TfidfModel(bow)


In [39]:
corpus_tfidf = tfidf[bow]

In [40]:
# generate a model for bag of words 
# we know we have 5 tops 
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [41]:
lda_model.print_topics() # notice many filer words 

[(0,
  '0.014*"list" + 0.013*"file" + 0.013*"function" + 0.011*"new" + 0.011*"id" + 0.010*"var" + 0.010*"data" + 0.010*"div" + 0.009*"type" + 0.009*"class"'),
 (1,
  '0.020*"class" + 0.017*"use" + 0.016*"public" + 0.012*"method" + 0.012*"code" + 0.011*"new" + 0.010*"get" + 0.009*"like" + 0.008*"string" + 0.007*"would"'),
 (2,
  '0.015*"use" + 0.012*"value" + 0.011*"org" + 0.011*"like" + 0.010*"string" + 0.008*"get" + 0.008*"want" + 0.007*"way" + 0.007*"would" + 0.007*"work"'),
 (3,
  '0.016*"use" + 0.014*"file" + 0.012*"name" + 0.010*"get" + 0.008*"error" + 0.008*"c" + 0.008*"user" + 0.007*"version" + 0.007*"lib" + 0.007*"py"'),
 (4,
  '0.019*"int" + 0.017*"c" + 0.017*"x" + 0.017*"use" + 0.012*"return" + 0.011*"std" + 0.011*"string" + 0.011*"function" + 0.008*"code" + 0.008*"value"')]

## tfidf LDA model

In [42]:
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

In [43]:
lda_model_tfidf.print_topics()

[(0,
  '0.005*"c" + 0.005*"file" + 0.005*"use" + 0.004*"class" + 0.004*"project" + 0.004*"library" + 0.003*"application" + 0.003*"object" + 0.003*"code" + 0.003*"like"'),
 (1,
  '0.006*"file" + 0.006*"string" + 0.005*"x" + 0.005*"list" + 0.004*"image" + 0.004*"int" + 0.004*"function" + 0.004*"use" + 0.003*"test" + 0.003*"jar"'),
 (2,
  '0.009*"int" + 0.007*"c" + 0.007*"string" + 0.007*"array" + 0.007*"class" + 0.006*"b" + 0.006*"public" + 0.006*"foo" + 0.005*"list" + 0.005*"object"'),
 (3,
  '0.006*"div" + 0.005*"page" + 0.005*"function" + 0.005*"text" + 0.005*"id" + 0.005*"event" + 0.005*"form" + 0.005*"html" + 0.005*"string" + 0.004*"value"'),
 (4,
  '0.005*"string" + 0.005*"x" + 0.004*"date" + 0.004*"file" + 0.004*"c" + 0.004*"class" + 0.004*"use" + 0.004*"number" + 0.004*"thread" + 0.003*"get"')]

In [49]:
pred = []
for i in bow[:5]:
    result = lda_model[i]
    if len(result) > 1:
        pred.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred.append(result[0][0])
        
    

In [52]:
pred_tdif = []
for i in bow[:5]:
    result = lda_model_tfidf[i]
    if len(result) > 1:
        pred_tdif.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_tdif.append(result[0][0])
        

In [61]:
pred, train_y[:5]

([4, 4, 0, 4, 1], 1681           java
 15021    javascript
 1126             c#
 3969     javascript
 23286          java
 Name: target, dtype: object)

In [60]:
lda_model[bow[4]]

[(0, 0.011356025),
 (1, 0.49925596),
 (2, 0.011427081),
 (3, 0.011286466),
 (4, 0.46667442)]