In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to C:\Users\Tejas
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Tejas
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the trimmed data set of 5L datapoints for preprocessing
raw_data = pd.read_csv('dataset 5L/trimmed.csv')

In [3]:
# Step1: Punctuation removal

def punct_removal(s):
    punctuation_free = s.translate(str.maketrans(string.punctuation, len(string.punctuation)*' ', ''))
    return punctuation_free

raw_data['Title_punctfree'] = raw_data.Title.apply(punct_removal)

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_punctfree', 'Body', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Before punctuation free: {}".format(i+1, raw_data.Title[random_indices[i]]))
    print("{}b. After punctuation free: {}\n".format(i+1, raw_data.Title_punctfree[random_indices[i]]))
    

Few examples:

1a. Before punctuation free: Grabbing an image from a page using jQuery
1b. After punctuation free: Grabbing an image from a page using jQuery

2a. Before punctuation free: Where can I find a good game development tutorial for iPhone?
2b. After punctuation free: Where can I find a good game development tutorial for iPhone 

3a. Before punctuation free: chrome extension to remove div on facebook
3b. After punctuation free: chrome extension to remove div on facebook

4a. Before punctuation free: S3 based dropbox like program for mac os and windows
4b. After punctuation free: S3 based dropbox like program for mac os and windows

5a. Before punctuation free: UIPicker and Text and URL
5b. After punctuation free: UIPicker and Text and URL



In [4]:
# Step2: Converting to lower case

raw_data['Title_lower'] = raw_data.Title_punctfree.apply(lambda s:s.lower())

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_punctfree', 'Title_lower', 'Body', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Before lower case: {}".format(i+1, raw_data.Title_punctfree[random_indices[i]]))
    print("{}b. After lower case: {}\n".format(i+1, raw_data.Title_lower[random_indices[i]]))


Few examples:

1a. Before lower case: How prevalent is UTF 8 really 
1b. After lower case: how prevalent is utf 8 really 

2a. Before lower case: Update bidirectional ManyToMany from both sides
2b. After lower case: update bidirectional manytomany from both sides

3a. Before lower case: Facebook gives  Unsafe JavaScript attempt to access frame with URL  error in Chrome
3b. After lower case: facebook gives  unsafe javascript attempt to access frame with url  error in chrome

4a. Before lower case:  usr bin time  l acts strange
4b. After lower case:  usr bin time  l acts strange

5a. Before lower case: Email id in URL 
5b. After lower case: email id in url 



In [5]:
# Tokenizing the words

raw_data['Title_tokens'] = raw_data.Title_lower.apply(lambda s:nltk.tokenize.word_tokenize(s))

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_punctfree', 'Title_lower', 'Title_tokens', 'Body', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Before tokenizing: {}".format(i+1, raw_data.Title_lower[random_indices[i]]))
    print("{}b. After tokenizing: {}\n".format(i+1, raw_data.Title_tokens[random_indices[i]]))

Few examples:

1a. Before tokenizing: sql server 2000   asp net  login failed for user  nt authority anonymous logon 
1b. After tokenizing: ['sql', 'server', '2000', 'asp', 'net', 'login', 'failed', 'for', 'user', 'nt', 'authority', 'anonymous', 'logon']

2a. Before tokenizing: using spring by creating multiple applicationcontexts to manage bean life time   is this okay 
2b. After tokenizing: ['using', 'spring', 'by', 'creating', 'multiple', 'applicationcontexts', 'to', 'manage', 'bean', 'life', 'time', 'is', 'this', 'okay']

3a. Before tokenizing: writing an sql query to select item from the following table
3b. After tokenizing: ['writing', 'an', 'sql', 'query', 'to', 'select', 'item', 'from', 'the', 'following', 'table']

4a. Before tokenizing: core data inheritance  is it ok to do this 
4b. After tokenizing: ['core', 'data', 'inheritance', 'is', 'it', 'ok', 'to', 'do', 'this']

5a. Before tokenizing: multiple file upload option produces error rails
5b. After tokenizing: ['multiple',

In [6]:
# Step3: Stopword removal

# Carefully choose which stopwords to use based on problem, e.g 'not' keyword would be important in some NLP modelling problems like sentiment detection
stopwords  = nltk.corpus.stopwords.words('english')

def stopword_removal(tokens):
    global stopwords
    filtered_tokens = [item for item in tokens if item not in stopwords]
    return filtered_tokens

raw_data['Title_stop_removed'] = raw_data.Title_tokens.apply(stopword_removal)

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_punctfree', 'Title_lower', 'Title_tokens', 'Title_stop_removed', 'Body', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Before stopword removal: {}".format(i+1, raw_data.Title_lower[random_indices[i]]))
    print("{}b. After stopword removal: {}".format(i+1, raw_data.Title_stop_removed[random_indices[i]]))
    print("{}c. Tags: {}\n".format(i+1, raw_data.Tags_shortlist[random_indices[i]]))
    

Few examples:

1a. Before stopword removal: my boot order changed  why 
1b. After stopword removal: ['boot', 'order', 'changed']
1c. Tags: hard-drive boot

2a. Before stopword removal: supporting ajax history button without going crazy
2b. After stopword removal: ['supporting', 'ajax', 'history', 'button', 'without', 'going', 'crazy']
2c. Tags: javascript jquery ajax jquery-ajax

3a. Before stopword removal: using r  replace all values in a matrix  0 1 with 0 
3b. After stopword removal: ['using', 'r', 'replace', 'values', 'matrix', '0', '1', '0']
3c. Tags: r

4a. Before stopword removal: whether there exists or not  a group of  2  elements  a  and  b  with   lvert b  rvert   2    lvert a rvert   15  and    
4b. After stopword removal: ['whether', 'exists', 'group', '2', 'elements', 'b', 'lvert', 'b', 'rvert', '2', 'lvert', 'rvert', '15']
4c. Tags: homework group-theory

5a. Before stopword removal: lsass exe starts maxing cpu after asp net web app deployment
5b. After stopword removal

In [7]:
# Step4: Stemming

# Try other stemming algorithms like snowball, lancaster, etc & understand what is the difference?
porter_stem = nltk.stem.PorterStemmer()

def stem_token(tokens):
    
    global porter_stem
    stemmed_tokens = [porter_stem.stem(item) for item in tokens]
    return stemmed_tokens

raw_data['Title_stemmed'] = raw_data.Title_stop_removed.apply(stem_token)

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_lower', 'Title_stemmed', 'Body', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Before stemming: {}".format(i+1, raw_data.Title_lower[random_indices[i]]))
    print("{}b. After stemming: {}\n".format(i+1, raw_data.Title_stemmed[random_indices[i]]))


Few examples:

1a. Before stemming: supporting multiple languages in a winforms application
1b. After stemming: ['support', 'multipl', 'languag', 'winform', 'applic']

2a. Before stemming: what kind of value does android valueto represent
2b. After stemming: ['kind', 'valu', 'android', 'valueto', 'repres']

3a. Before stemming: finding start and end date for 2 conditions
3b. After stemming: ['find', 'start', 'end', 'date', '2', 'condit']

4a. Before stemming: install ubuntu 10 10 from loopback mounted iso image
4b. After stemming: ['instal', 'ubuntu', '10', '10', 'loopback', 'mount', 'iso', 'imag']

5a. Before stemming: resetting sql server 2008 login  no access using management studio 
5b. After stemming: ['reset', 'sql', 'server', '2008', 'login', 'access', 'use', 'manag', 'studio']



In [8]:
# Since title are not enough take a look at body of few examples

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Title: {}".format(i+1, raw_data.Title[random_indices[i]]))
    print("{}b. Body: {}".format(i+1, raw_data.Body[random_indices[i]]))
    print("{}c. Tags: {}\n\n\n".format(i+1, raw_data.Tags_shortlist[random_indices[i]]))


Few examples:

1a. Title: Transforming XSD Import for WSDL in Spring WS
1b. Body: <p>I'm tring to extend the example in chapter 5 of the Spring WS guide.  I'm using Spring WS 1.5.9 .</p>

<p>I've added ... </p>

<pre><code>&lt;import namespace="http://myco.com/schemas/promotion/v1_2"
    schemaLocation="http://localhost:8080/ordersService/Promotion_1_2.xsd" /&gt; ...
</code></pre>

<p>But spring doesn't appear to be transforming the location of the import like it does for the port.  So on my company website it still shows "localhost:8080"</p>

<p>I have the WSDL generation defined as such</p>

<pre><code>&lt;bean id="orders" class="org.springframework.ws.wsdl.wsdl11.DefaultWsdl11Definition"&gt;
    &lt;property name="schema" ref="schema" /&gt;
    &lt;property name="portTypeName" value="Orders" /&gt;
    &lt;property name="locationUri" value="http://localhost:8080/ordersService/" /&gt;
</code></pre>

<p></p>

<p>Does/Can Spring Framework transform the imports some way ???</p>

<p>UPDAT

In [9]:
def body_filter(s):

    # Separate the code blocks from the body
    code_regex = r'<code>(.*?)</code>'
    code_blocks = re.findall(code_regex, s, flags=re.MULTILINE|re.DOTALL)
    s = re.sub(code_regex, ' ', s, flags=re.MULTILINE|re.DOTALL)

    # Remove html tags from the body
    # Note this will also remove hyper links (<a href="link">), which may contain valuable information
    html_regex = r'<(.*?)>'
    s = re.sub(html_regex, ' ', s, flags=re.MULTILINE|re.DOTALL)
    
    return [s, code_blocks]
    
    
temp = raw_data.Body.apply(body_filter)

raw_data['Body_text'] = [temp[i][0] for i in range(len(temp))]
raw_data['Body_code'] = [temp[i][1] for i in range(len(temp))]

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_stemmed', 'Body', 'Body_text', 'Body_code', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("-------- Example {} --------\n".format(i+1))
    print("{}a. Body:\n {}".format(i+1, raw_data.Body[random_indices[i]]))
    print("\n\n{}b. Text:\n {}".format(i+1, raw_data.Body_text[random_indices[i]]))
    print("\n\n{}c. Code:\n {}\n\n\n\n".format(i+1, raw_data.Body_code[random_indices[i]]))


Few examples:

-------- Example 1 --------

1a. Body:
 <p>I have an assembly that when accessed spins up a single thread to process items placed on a queue. In that assembly I attach a handler to the DomainUnload event:</p>

<p><code>AppDomain.CurrentDomain.DomainUnload += new EventHandler(CurrentDomain_DomainUnload);</code></p>

<p>That handler joins the thread to the main thread so that all items on the queue can complete processing before the application terminates.</p>

<p>The problem that I am experiencing is that the DomainUnload event is not getting fired when the console application terminates. Any ideas why this would be?</p>

<p>Using .NET 3.5 and C#</p>



1b. Text:
  I have an assembly that when accessed spins up a single thread to process items placed on a queue. In that assembly I attach a handler to the DomainUnload event: 

   

 That handler joins the thread to the main thread so that all items on the queue can complete processing before the application terminates. 

 

In [10]:
# Merging all fragmented code blocks in a adatapoint

def merge_code(code_blocks):
    
    return " " + " ".join(code_blocks)

raw_data['Code'] = raw_data.Body_code.apply(merge_code)

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_stemmed', 'Body', 'Body_text', 'Body_code', 'Code', 'Tags_shortlist']]

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for i in range(5):
    print("{}a. Before: {}".format(i+1, raw_data.Body_code[random_indices[i]]))
    print("{}b. After: {}\n".format(i+1, raw_data.Code[random_indices[i]]))


Few examples:

1a. Before: ['http://example.co.uk/map.php becomes http://example.co.uk/map/\n', 'http://example.co.uk/index.php becomes http://example.co.uk/\n', 'Options +FollowSymLinks\nRewriteEngine On\nRewriteRule ^map/(.*)/$ map.php\n']
1b. After:  http://example.co.uk/map.php becomes http://example.co.uk/map/
 http://example.co.uk/index.php becomes http://example.co.uk/
 Options +FollowSymLinks
RewriteEngine On
RewriteRule ^map/(.*)/$ map.php


2a. Before: ['IEnumerable&lt;IEnumerable&gt;', 'IEnumerable&lt;IEnumerable&gt;', 'IEnumerable', 'IEnumerable']
2b. After:  IEnumerable&lt;IEnumerable&gt; IEnumerable&lt;IEnumerable&gt; IEnumerable IEnumerable

3a. Before: ['IP            Hostname   TransactionDate\n------------- ---------- -------------------\n1.1.1.1       A          2009-01-01 01:00:00\n1.1.1.1       A          2009-01-02 01:00:00\n1.1.1.1       A          2009-01-03 01:45:00\n1.1.1.1       B          2009-01-04 01:00:00\n1.1.1.1       A          2009-01-05 01:00:00\n', 

In [11]:
# Apply all the preprocessing steps to the body as applied to title

def body_preprocessing(s):

    # Step1: Punctuation removal
    punctuation_free = s.translate(str.maketrans('', '', string.punctuation))

    # Step2: Converting to lower case
    s_lower  = punctuation_free.lower()

    # Tokenizing the sentence
    tokens = nltk.tokenize.word_tokenize(s_lower)

    # Step3: Stopword removal
    global stopwords
    filtered_tokens = [item for item in tokens if item not in stopwords]

    # Step4: Stemming
    global porter_stem
    stemmed_tokens = [porter_stem.stem(item) for item in filtered_tokens]
    
    return stemmed_tokens


raw_data['Body_stemmed'] = raw_data.Body_text.apply(body_preprocessing)

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_stemmed', 'Body', 'Body_stemmed', 'Code', 'Tags_shortlist']]


In [12]:
# Merge tokens

raw_data['Title_processed'] = [" ".join(item) for item in raw_data.Title_stemmed]
raw_data['Body_processed'] = [" ".join(item) for item in raw_data.Body_stemmed]
raw_data['Title3_Body'] = [raw_data.Title_processed[i] + " " + raw_data.Title_processed[i] + " " + raw_data.Title_processed[i] + " " + raw_data.Body_processed[i] for i in range(raw_data.shape[0])]

# Rearranging the dataset
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_processed', 'Body', 'Body_processed', 'Title3_Body', 'Code', 'Tags_shortlist']]


In [13]:
# Save data to csv file
#### Imp: Careful not to overwrite data ####

try:
    raw_data.to_csv('dataset 5L/processed.csv', mode = 'x', index = False)
except:
    print("File already saved?")

In [14]:
# Train CV test split: Split the data in training, CV and test

temp_data, test_data = train_test_split(raw_data, train_size = 0.83)
train_data, cv_data = train_test_split(temp_data, train_size = 0.8)

print(train_data.shape)
print(cv_data.shape)
print(test_data.shape)

(315915, 8)
(78979, 8)
(80883, 8)


In [15]:
try:
    train_data.to_csv('dataset 5L/processed_train.csv', mode = 'x', index = False)
except:
    print("File already saved?")

try:
    cv_data.to_csv('dataset 5L/processed_cv.csv', mode = 'x', index = False)
except:
    print("File already saved?")
    
    
try:
    test_data.to_csv('dataset 5L/processed_test.csv', mode = 'x', index = False)
except:
    print("File already saved?")
