In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
import re
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
# Load the question dump; lines=True reads the file as one JSON object per line.
data = pd.read_json("stackoverflow-data-idf.json", lines=True)

In [6]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,id,title,body,answer_count,comment_count,creation_date,last_activity_date,last_editor_display_name,owner_display_name,owner_user_id,post_type_id,score,tags,view_count,accepted_answer_id,favorite_count,last_edit_date,last_editor_user_id,community_owned_date
0,4821394,Serializing a private struct - Can it be done?,<p>I have a public class that contains a priva...,1,0,2011-01-27 20:19:13.563 UTC,2011-01-27 20:21:37.59 UTC,,,163534.0,1,0,c#|serialization|xml-serialization,296,,,,,
1,3367882,How do I prevent floated-right content from ov...,<p>I have the following HTML:</p>\n\n<pre><cod...,2,2,2010-07-30 00:01:50.9 UTC,2012-05-10 14:16:05.143 UTC,,,1190.0,1,2,css|overflow|css-float|crop,4121,3367943.0,0.0,2012-05-10 14:16:05.143 UTC,44390.0,
2,31682135,Gradle command line,<p>I'm trying to run a shell script with gradl...,0,2,2015-07-28 16:30:18.28 UTC,2015-07-28 16:32:15.117 UTC,,,1299158.0,1,1,bash|shell|android-studio|gradle,259,,,,,
3,20218536,Loop variable as parameter in asynchronous fun...,<p>I have an object with the following form.</...,1,1,2013-11-26 13:34:49.957 UTC,2013-11-26 15:07:50.8 UTC,,,642751.0,1,0,javascript|asynchronous|foreach|async.js,120,,1.0,2013-11-26 15:02:47.993 UTC,1333873.0,
4,19941459,Canot get the href value,<p>Hi I need to valid the href is empty or not...,5,1,2013-11-12 22:41:36.11 UTC,2013-11-12 23:48:34.67 UTC,,,819774.0,1,0,javascript,97,19941620.0,,2013-11-12 22:43:42.97 UTC,21886.0,


In [8]:
# Report each column's dtype and the frame's (rows, columns) shape.
print("Schema:\n\n",data.dtypes)
print("Number of questions,columns=",data.shape)

Schema:

 id                            int64
title                        object
body                         object
answer_count                  int64
comment_count                 int64
creation_date                object
last_activity_date           object
last_editor_display_name     object
owner_display_name           object
owner_user_id               float64
post_type_id                  int64
score                         int64
tags                         object
view_count                    int64
accepted_answer_id          float64
favorite_count              float64
last_edit_date               object
last_editor_user_id         float64
community_owned_date         object
dtype: object
Number of questions,columns= (20000, 19)


In [11]:
def pre_process(text):
    """Normalise raw question text into lowercase, space-separated tokens.

    Steps, in order: lowercase the input, replace every ``<...>`` tag-like
    span with a placeholder, then collapse each run of digits and non-word
    characters into a single space.

    Returns the cleaned string; a leading or trailing space may remain.
    """
    lowered = text.lower()

    # Swap each tag for a " <> " placeholder; the next substitution removes
    # the placeholder again, leaving a word break where the tag used to be.
    without_tags = re.sub("</?.*?>", " <> ", lowered)

    # Underscores count as word characters, so they survive this pass.
    return re.sub("(\\d|\\W)+", " ", without_tags)

In [12]:
# Combine title and body into a single text field and clean it.
# The explicit " " separator keeps the last title word from fusing with the
# first body word when the body does not open with markup; pre_process
# collapses any surplus whitespace, so tag-led bodies are unaffected.
data["text"] = (data["title"] + " " + data["body"]).apply(pre_process)

In [13]:
# Spot-check the cleaned text of the row labelled 1.
data["text"][1]

'how do i prevent floated right content from overlapping main content i have the following html lt td class a gt lt img src images some_icon png alt some icon gt lt span gt some content that s waaaaaaaaay too long to fit in the allotted space but which can get cut off lt span gt lt td gt it should display as follows some content that s wa icon i have the following css td a span overflow hidden white space nowrap z index td a img display block float right z index when i resize the browser to cut off the text it cuts off at the edge of the lt td gt rather than before the lt img gt which leaves the lt img gt overlapping the lt span gt content i ve tried various padding and margin s but nothing seemed to work is this possible nb it s very difficult to add a lt td gt that just contains the lt img gt here if it were easy i d just do that '

In [15]:
def get_stop_words(stop_file_path):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Every line is stripped of surrounding whitespace and duplicates collapse.
    Returns the entries as an immutable ``frozenset``.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        return frozenset(line.strip() for line in handle)

In [16]:
# Load the custom stop-word list (path is relative to the working directory).
stopwords=get_stop_words("stopwords.txt")

In [25]:
# Plain Python list of the cleaned documents, used below to fit the vectorizer.
docs = data["text"].tolist()

In [26]:
# Build the vocabulary and the per-document term-count matrix.
#   max_df=0.85        -> ignore terms that occur in more than 85% of documents
#   max_features=10000 -> cap the vocabulary at 10,000 terms
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter as 'english', a list, or None and reject the frozenset returned
# by get_stop_words. sorted() just makes the list order deterministic.
cv = CountVectorizer(max_df=0.85, stop_words=sorted(stopwords), max_features=10000)
word_count_vector = cv.fit_transform(docs)



In [27]:
# Dimensions of the count matrix: (documents, vocabulary terms).
word_count_vector.shape

(20000, 10000)

In [28]:
# Peek at ten vocabulary terms (iterating the mapping yields its keys).
list(cv.vocabulary_)[:10]

['serializing',
 'private',
 'struct',
 'public',
 'class',
 'contains',
 'properties',
 'string',
 'serialize',
 'attempt']

In [32]:
# TF-IDF weighting step; smooth_idf and use_idf are both set explicitly to True.
tfid_transfomer = TfidfTransformer(smooth_idf=True, use_idf=True)

In [33]:
# Learn the IDF weights from the training count matrix.
tfid_transfomer.fit(word_count_vector)

In [35]:
# Load the held-out questions and clean their combined title + body text.
df_test = pd.read_json("stackoverflow-test.json", lines=True)
# The explicit " " separator keeps the last title word from fusing with the
# first body word when the body does not open with markup; pre_process
# collapses any surplus whitespace afterwards.
df_test['text'] = (df_test['title'] + " " + df_test['body']).apply(pre_process)

# get test docs into a list
docs_test = df_test['text'].tolist()
docs_title = df_test['title'].tolist()
docs_body = df_test['body'].tolist()

In [36]:
# Vocabulary terms ordered by column index, so feature_names[i] is the term
# counted in column i of the vectorizer's output.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()



In [39]:
# Use the first test document as the worked example.
doc = docs_test[0]

In [40]:
# Count the document's terms against the fitted vocabulary, then apply the
# learned TF-IDF weights. The document is wrapped in a list because
# cv.transform takes a collection of documents.
tf_idf_vector = tfid_transfomer.transform(cv.transform([doc]))

In [41]:
def sort_coo(coo_matrix):
    """Return the matrix's ``(column, value)`` pairs, highest value first.

    Pairs are read from the ``col`` and ``data`` attributes of ``coo_matrix``.
    Ties on value are broken by column index, larger index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the best-scoring terms to their rounded scores.

    Args:
        feature_names: Sequence indexable by column index, giving the term
            for each column.
        sorted_items: Sequence of ``(column_index, score)`` pairs, already
            ordered best-first (e.g. the output of ``sort_coo``).
        topn: Maximum number of pairs to take from the front of
            ``sorted_items``.

    Returns:
        dict mapping term -> score rounded to 3 decimals, in the same order
        as ``sorted_items``. Empty when ``sorted_items`` is empty.
    """
    # A single comprehension replaces the two parallel lists and index loop;
    # dicts preserve insertion order, so the ranking is kept.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [42]:
# Convert to COO format so column indices and values are available as
# .col / .data, then rank them by score.
sorted_items = sort_coo(tf_idf_vector.tocoo())

In [43]:
# Keep the 10 highest-scoring terms as {term: score}.
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

In [45]:
# Show the original question, then its extracted keywords with their scores.
print("Title", docs_title[0], "\nBody", docs_body[0], "\nKeywords", sep="\n")
for term, score in keywords.items():
    print(term, score)

Title
Integrate War-Plugin for m2eclipse into Eclipse Project

Body
<p>I set up a small web project with JSF and Maven. Now I want to deploy on a Tomcat server. Is there a possibility to automate that like a button in Eclipse that automatically deploys the project to Tomcat?</p>

<p>I read about a the <a href="http://maven.apache.org/plugins/maven-war-plugin/" rel="nofollow noreferrer">Maven War Plugin</a> but I couldn't find a tutorial how to integrate that into my process (eclipse/m2eclipse).</p>

<p>Can you link me to help or try to explain it. Thanks.</p>

Keywords
eclipse 0.593
war 0.317
integrate 0.281
maven 0.273
tomcat 0.27
project 0.239
plugin 0.214
automate 0.157
jsf 0.152
possibility 0.146
