In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import numpy as np
import warnings
from gensim import corpora, models
from sklearn.metrics import confusion_matrix, accuracy_score
warnings.filterwarnings('ignore')
import re
%matplotlib inline

In [2]:
posts_df = pd.read_pickle('../data/interum/text_target.pkl')

In [3]:
# convert into features and target 
feature = posts_df['tokens']
label = posts_df['target']  # label 

In [4]:
# more words to remove and remove words longer than length 2
# words = ['class', 'file', 'use', 'code',
#          'string', 'function', 'new', 'like', 
#          'way', 'would', 'name', 'get', 'list',
#          'want', 'value','return','data']
feature = feature.apply(
    lambda x: [w for w in x if len(w) > 2])

## feature engineering for clustering

In [42]:
# dictionary for train 
dictionary = gensim.corpora.Dictionary(feature)
len(dictionary)

66674

In [43]:
dictionary.filter_extremes(no_below =1, no_above=0.05, keep_n=50000)
len(dictionary)

50000

In [44]:
# create bag of words 
bow = [dictionary.doc2bow(doc) for doc in feature]


In [45]:
# tfidf for bow 
tfidf = models.TfidfModel(bow)


In [46]:
corpus_tfidf = tfidf[bow]

In [47]:
# generate a model for bag of words 
# we know we have 5 tops 
lda_model = gensim.models.LdaMulticore(
    bow, num_topics=5, id2word=dictionary, passes=2, workers=4,random_state=42)

In [62]:
lda_model.print_topics() # notice many filer words 

[(0,
  '0.005*"byte" + 0.005*"long" + 0.004*"thread" + 0.004*"char" + 0.004*"size" + 0.004*"path" + 0.003*"event" + 0.003*"script" + 0.003*"click" + 0.003*"length"'),
 (1,
  '0.009*"android" + 0.009*"script" + 0.008*"item" + 0.007*"model" + 0.007*"date" + 0.006*"div" + 0.005*"field" + 0.004*"url" + 0.004*"app" + 0.004*"content"'),
 (2,
  '0.013*"std" + 0.006*"foo" + 0.005*"event" + 0.004*"option" + 0.004*"template" + 0.004*"vector" + 0.003*"message" + 0.003*"select" + 0.003*"scope" + 0.003*"const"'),
 (3,
  '0.010*"org" + 0.007*"key" + 0.007*"foo" + 0.007*"import" + 0.007*"self" + 0.007*"lib" + 0.006*"print" + 0.006*"log" + 0.005*"xml" + 0.004*"apache"'),
 (4,
  '0.005*"log" + 0.005*"source" + 0.004*"module" + 0.004*"framework" + 0.004*"org" + 0.004*"json" + 0.004*"app" + 0.004*"eclipse" + 0.004*"private" + 0.004*"load"')]

In [63]:
topics_lda=lda_model.print_topics()

In [64]:
topics_lda_dict = {}
for topic in topics_lda:
    topics_lda_dict[topic[0]] = re.findall('[a-z]+',topic[1])

In [65]:
lda_df = pd.DataFrame(topics_lda_dict)
lda_df.columns = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4']

In [66]:
lda_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
0,byte,android,std,org,log
1,long,script,foo,key,source
2,thread,item,event,foo,module
3,char,model,option,import,framework
4,size,date,template,self,org
5,path,div,vector,lib,json
6,event,field,message,print,app
7,script,url,select,log,eclipse
8,click,app,scope,xml,private
9,length,content,const,apache,load


## tfidf LDA model

In [67]:
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4,random_state=42)

In [53]:
lda_model_tfidf.print_topics()

[(0,
  '0.002*"byte" + 0.002*"script" + 0.002*"event" + 0.002*"long" + 0.002*"thread" + 0.002*"image" + 0.002*"div" + 0.001*"language" + 0.001*"maven" + 0.001*"convert"'),
 (1,
  '0.004*"date" + 0.003*"item" + 0.002*"script" + 0.002*"form" + 0.002*"model" + 0.002*"button" + 0.002*"click" + 0.002*"div" + 0.002*"field" + 0.002*"url"'),
 (2,
  '0.005*"std" + 0.003*"foo" + 0.003*"vector" + 0.002*"template" + 0.002*"date" + 0.002*"convert" + 0.002*"datetime" + 0.002*"character" + 0.002*"event" + 0.002*"cout"'),
 (3,
  '0.004*"foo" + 0.003*"print" + 0.003*"key" + 0.003*"bar" + 0.003*"log" + 0.002*"import" + 0.002*"hello" + 0.002*"self" + 0.002*"xml" + 0.002*"dictionary"'),
 (4,
  '0.002*"thread" + 0.002*"json" + 0.002*"private" + 0.002*"log" + 0.001*"constructor" + 0.001*"console" + 0.001*"arraylist" + 0.001*"database" + 0.001*"app" + 0.001*"load"')]

In [68]:
topics_tfidf=lda_model_tfidf.print_topics()

In [69]:
topics_tfidf_dict = {}
for topic in topics_tfidf:
    topics_tfidf_dict[topic[0]] = re.findall('[a-z]+',topic[1])

In [70]:
tfidf_df = pd.DataFrame(topics_tfidf_dict)
tfidf_df.columns = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4']

In [71]:
tfidf_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
0,byte,date,std,foo,thread
1,script,item,foo,key,json
2,event,android,vector,print,arraylist
3,image,script,date,log,log
4,long,click,template,bar,div
5,thread,button,convert,import,private
6,maven,form,datetime,hello,constructor
7,bit,model,character,self,database
8,language,div,const,console,load
9,div,field,cout,xml,console


In [72]:
pred_bow = []
for i in bow:
    result = lda_model[i]
    if len(result) > 1:
        pred_bow.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_bow.append(result[0][0])    

In [73]:
pred_tdif = []
for i in bow:
    result = lda_model_tfidf[i]
    if len(result) > 1:
        pred_tdif.append(sorted(result, key=lambda x: x[1], reverse = True )[0][0])
    else:
        pred_tdif.append(result[0][0])
        

In [74]:
result = pd.DataFrame([pred_bow,pred_tdif,label]).T

In [75]:
result.columns = ['bow','tdif','true_label']

In [90]:
result.true_label.unique

<bound method Series.unique of 0        javascript
1               c++
2        javascript
3            python
4        javascript
            ...    
30297            c#
30298          java
30299        python
30300          java
30301    javascript
Name: true_label, Length: 30302, dtype: object>

In [91]:
mask1 = result.true_label =='c++'
mask2 = result.true_label == 'javascript'
mask3 = result.true_label =='c#'
mask4= result.true_label == 'python'
mask5 = result.true_label =='java'

## bow:

### best guesses with labels

* c++: topic 2
* javascript: topic 1
* c#: topic 4
* python:topic 0
* java: topic 3

In [92]:
result[mask1]['bow'].value_counts(normalize = True)

2    0.379159
4    0.342017
3    0.209182
0    0.061388
1    0.008254
Name: bow, dtype: float64

In [93]:
result[mask2]['bow'].value_counts(normalize = True)

1    0.372552
3    0.214574
0    0.187212
4    0.129896
2    0.095766
Name: bow, dtype: float64

In [94]:
result[mask3]['bow'].value_counts(normalize = True)

4    0.371887
3    0.237332
2    0.152591
1    0.121386
0    0.116805
Name: bow, dtype: float64

In [95]:
result[mask4]['bow'].value_counts(normalize = True)

4    0.306468
0    0.244179
3    0.240995
2    0.128756
1    0.079602
Name: bow, dtype: float64

In [96]:
result[mask5]['bow'].value_counts(normalize = True)

4    0.336412
3    0.223159
0    0.185944
1    0.147256
2    0.107229
Name: bow, dtype: float64

## tdif:

### best guesses with labels

* c++: topic 4
* javascript: topic 3
* c#: topic 1
* python:topic 2
* java: topic 0

In [97]:
result[mask1]['tdif'].value_counts(normalize = True)

4    0.862265
3    0.072995
0    0.046170
2    0.015476
1    0.003095
Name: tdif, dtype: float64

In [98]:
result[mask2]['tdif'].value_counts(normalize = True)

3    0.404234
4    0.365207
0    0.129176
1    0.093750
2    0.007632
Name: tdif, dtype: float64

In [99]:
result[mask3]['tdif'].value_counts(normalize = True)

4    0.572574
3    0.275265
0    0.109218
1    0.027627
2    0.015316
Name: tdif, dtype: float64

In [100]:
result[mask4]['tdif'].value_counts(normalize = True)

4    0.566368
3    0.208358
0    0.198209
1    0.016318
2    0.010746
Name: tdif, dtype: float64

In [101]:
result[mask5]['tdif'].value_counts(normalize = True)

4    0.574163
0    0.216734
3    0.173896
1    0.023025
2    0.012182
Name: tdif, dtype: float64

In [102]:
bow_map = {3:'java',2:'c++',1:'javascript',4:'c#',0:'python'}
tdif_map = {3:'javascript',4:'c++',1:'c#',2:'python',0:'java'}

result.bow = result.bow.map(bow_map)
result.tdif = result.tdif.map(tdif_map)



# confusion matrix bow

In [106]:
confusion_matrix(result.true_label,result.bow)

array([[2598, 1066, 1658,  848,  816],
       [1326, 1470,  811,   32,  238],
       [2513,  801, 1667, 1100, 1389],
       [ 902,  665, 1490, 2587, 1300],
       [1540,  647, 1211,  400, 1227]])

In [103]:
accuracy_score(result.true_label,result.bow)

0.31512771434228765

In [105]:
confusion_matrix(result.true_label, result.tdif)

array([[ 193, 4000,  763, 1923,  107],
       [  12, 3343,  179,  283,   60],
       [ 172, 4289, 1619, 1299,   91],
       [ 651, 2536,  897, 2807,   53],
       [  82, 2846,  996, 1047,   54]])

In [104]:
accuracy_score(result.true_label,result.tdif)

0.26453699425780475

### combine model to original df and look at text

In [76]:
combined_df = pd.concat([posts_df[['text']],result], axis = 1)

### function to look at each topic separately

In [94]:
def text_topic(df, model,topic):
    '''
    input:
    df: raw text, lda model, and topic num
    returns:
    random text for that topic 
    '''
    texts = df[df[model]==topic].text
    inds = df[df[model]==topic].text.index
    ind = np.random.choice(inds)
    return texts[ind]
    

## Topic 0

In [95]:
text_topic(combined_df,'bow',4)

'\n\nMy requirement is I have server J2EE web application and client J2EE web\napplication. Sometimes client can go offline. When client comes online he\nshould be able to synchronize changes to and fro. Also I should be able to\ncontrol which rows/tables need to be synchronized based on some filters/rules.\nIs there any existing Java frameworks for doing it? If I need to implement on\nmy own what are the different strategies that you can suggest?\n\nOne solution in my mind is maintaining sql logs and executing same statements\nat other side during synchronization. Do you see any problems with this\nstrategy?\n\n'

In [91]:
text_topic(combined_df,'bow',4)

"\n\nI am using Java 7 SDK and IntelliJ IDEA IDE.\n\n    \n    \n    java version 1.7.0_11 Java(TM) SE Runtime Environment (build 1.7.0_11-b21) Java HotSpot(TM) 64-Bit Server VM (build 23.6-b04 mixed mode) \n\nI am still not able to use Java 7 features. After a bit of googling I could\nuse all the features after setting project language level to `7(Diamond ARM\nmulticatch etc)`. What exactly is this? If this has some relationship to\nsyntax based on JDK in use what is `level 8(Lambda annotations etc)`? Java 8\nisn't released yet. Java 8 is expected in March 2014 according to Wiki.\nSomeone please explain this language level concept.\n\n"

In [78]:
combined_df[combined_df.bow==0].text.iloc[2000]



In [79]:
combined_df[combined_df.bow==0].text.iloc[500]

'\n\nCurrently I am working on a python project that contains sub modules and uses\nnumpy/scipy. Ipython is used as interactive console. Unfortunately I am not\nvery happy with workflow that I am using right now I would appreciate some\nadvice.\n\nIn IPython the framework is loaded by a simple `import` command. However it is\noften necessary to change code in one of the submodules of the framework. At\nthis point a model is already loaded and I use IPython to interact with it.\n\nNow the framework contains many modules that depend on each other i.e. when\nthe framework is initially loaded the main module is importing and configuring\nthe submodules. The changes to the code are only executed if the module is\nreloaded using `reload(main_mod.sub_mod)`. This is cumbersome as I need to\nreload all changed modules individually using the full path. It would be very\nconvenient if `reload(main_module)` would also reload all sub modules but\nwithout reloading numpy/scipy..\n\n'

## Topic 1:

In [80]:
combined_df[combined_df.bow==1].text.iloc[10]

'\n\nI have two radio buttons and want to post the value of the selected one. How\ncan I get the value with jQuery?\n\nI can get all of them like this:\n\n    \n    \n    $( form :radio ) \n\nHow do I know which one is selected?\n\n'

In [81]:
combined_df[combined_df.bow==1].text.iloc[2001]

"\n\nI have used this in my html page...\n\n    \n    \n    <script> window.fbAsyncInit = function() { // init the FB JS SDK FB.init({ appId : 'xxxxxxxxxxxxxx' // App ID from the App Dashboard status : true // check the login status upon init? cookie : true // set sessions cookies to allow your server to access the session? xfbml : true // parse XFBML tags on this page? }); // Additional initialization code such as adding Event Listeners goes here }; // Load the SDK's source Asynchronously // Note that the debug version is being actively developed and might // contain some type checks that are overly strict. // Please report such bugs using the bugs tool. (function(d debug){ var js id = 'facebook-jssdk' ref = d.getElementsByTagName('script')[0]; if (d.getElementById(id)) {return;} js = d.createElement('script'); js.id = id; js.async = true; js.src = //connect.facebook.net/en_US/all + (debug ? /debug : ) + .js ; ref.parentNode.insertBefore(js ref); }(document /*debug*/ false)); function

In [82]:
combined_df[combined_df.bow==1].text.iloc[456]

"\n\nI'm trying to set the `Content-Type` header of an `HttpClient` object as\nrequired by an API I am calling.\n\nI tried setting the `Content-Type` like below:\n\n    \n    \n    using (var httpClient = new HttpClient()) { httpClient.BaseAddress = new Uri( http://example.com/ ); httpClient.DefaultRequestHeaders.Add( Accept  application/json ); httpClient.DefaultRequestHeaders.Add( Content-Type  application/json ); // ... } \n\nIt allows me to add the `Accept` header but when I try to add `Content-Type`\nit throws the following exception:\n\n> Misused header name. Make sure request headers are used with\n`HttpRequestMessage` response headers with `HttpResponseMessage` and content\nheaders with `HttpContent` objects.\n\nHow can I set the `Content-Type` header in a `HttpClient` request?\n\n"

## Topic 2:

In [83]:
combined_df[combined_df.bow==2].text.iloc[2001]

'\n\nIs it wrong to write:\n\n    \n    \n    class A { public: virtual ~A() = 0; }; \n\nfor an abstract base class?\n\nAt least that compiles in MSVC... Will it crash at run time?\n\n'

In [84]:
combined_df[combined_df.bow==2].text.iloc[201]

"\n\nI'm writing a program in C# that needs to repeatedly access 1 image file. Most\nof the time it works but if my computer's running fast it will try to access\nthe file before it's been saved back to the filesystem and throw an error:\n_File in use by another process_.\n\nI would like to find a way around this but all my Googling has only yielded\ncreating checks by using exception handling. This is against my religion so I\nwas wondering if anyone has a better way of doing it?\n\n"

In [85]:
combined_df[combined_df.bow==2].text.iloc[45]

'\n\nUsing jQuery how can I **cancel/abort an Ajax request** that I have not yet\nreceived the response from?\n\n'