In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import datetime

[nltk_data] Downloading package punkt to C:\Users\Tejas
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Tejas
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<p style='font-size:20px'><b> Data preprocessing </b></p>
<p>

- The raw data in the 'Title' & 'Body' columns is a collection of text, paragraphs, code, links, etc. that can't be used directly as input data for training.
- As part of data preprocessing, we derive individual words/phrases from these documents that can be used as features for training the model.
- Given the nature of the data, we assume that most of the text in the Title & Body paragraphs contains words that can be associated with the tags.  

The following NLP preprocessing steps are applied to obtain the desired features/words from the Title & Body paragraphs:

    1. Punctuation removal
    2. Lower case conversion
    3. Stopword removal
    4. Stemming

These steps are first applied to the title only, so that the before & after of each step can be inspected. They are then applied to the text paragraphs extracted from the body. 

The code segments in the Body carry important information in symbols like '#' and ';', so we do not apply any of these preprocessing steps to them; instead, char n-grams are derived directly from the complete code.   
    
</p>

In [2]:
start_time = datetime.datetime.now()

# Read the trimmed question data set that every later cell preprocesses.
# NOTE(review): this is an absolute, machine-specific path — consider moving it to a config cell.
trimmed_csv_path = 'E:/ai_projects/data_storage/Stack over flow tagging 5/dataset/trimmed.csv'
raw_data = pd.read_csv(trimmed_csv_path)

elapsed = datetime.datetime.now() - start_time
print("\nBlock execution time: ", elapsed)


Block execution time:  0:00:03.853485


In [3]:
start_time = datetime.datetime.now()

# Step1: Punctuation removal

# Translation table sending every character of string.punctuation to one space,
# so tokens joined by punctuation (e.g. "sqlite3_step") are split rather than fused.
_PUNCT_TO_SPACE = {ord(mark): ' ' for mark in string.punctuation}


def punct_removal(s):
    """Return ``s`` with each ``string.punctuation`` character replaced by a space.

    The replacement is one-for-one, so the length of the string is unchanged and
    no whitespace is collapsed or stripped.
    """
    return s.translate(_PUNCT_TO_SPACE)

# Derive the punctuation-free version of every title
raw_data['Title_punctfree'] = raw_data.Title.apply(punct_removal)

# Select the working columns in display order (columns not listed are dropped)
column_order = ['Id', 'Title', 'Title_punctfree', 'Body', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn titles before and after the step
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in enumerate(random_indices, start=1):
    print("{}a. Before punctuation free: {}".format(example_no, raw_data.Title[row_label]))
    print("{}b. After punctuation free: {}\n".format(example_no, raw_data.Title_punctfree[row_label]))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

1a. Before punctuation free: valgrind reports uninitialised value in sqlite3_step and sqlite3_prepare_v2
1b. After punctuation free: valgrind reports uninitialised value in sqlite3 step and sqlite3 prepare v2

2a. Before punctuation free: which is the best iPhone and Android Simulator for Kubuntu Linux?
2b. After punctuation free: which is the best iPhone and Android Simulator for Kubuntu Linux 

3a. Before punctuation free: Wiki Syntax for Indentation under Numbered List
3b. After punctuation free: Wiki Syntax for Indentation under Numbered List

4a. Before punctuation free: WebApp being more descriptive than WCF
4b. After punctuation free: WebApp being more descriptive than WCF

5a. Before punctuation free: Probability that at least one fails
5b. After punctuation free: Probability that at least one fails


Block execution time:  0:00:00.667668


In [4]:
start_time = datetime.datetime.now()

# Step2: Lower-case the punctuation-free titles
raw_data['Title_lower'] = raw_data.Title_punctfree.apply(str.lower)

# Select the working columns in display order (columns not listed are dropped)
column_order = ['Id', 'Title', 'Title_punctfree', 'Title_lower', 'Body', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn titles before and after the step
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in zip(range(1, 6), random_indices):
    print("{}a. Before lower case: {}".format(example_no, raw_data.Title_punctfree[row_label]))
    print("{}b. After lower case: {}\n".format(example_no, raw_data.Title_lower[row_label]))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

1a. Before lower case: mapping current location to destination location in googleMaps
1b. After lower case: mapping current location to destination location in googlemaps

2a. Before lower case: Django External App Installation
2b. After lower case: django external app installation

3a. Before lower case: HTTP 404 with Ajax ActionLink
3b. After lower case: http 404 with ajax actionlink

4a. Before lower case: First Class Functions
4b. After lower case: first class functions

5a. Before lower case: Static variable initialization 
5b. After lower case: static variable initialization 


Block execution time:  0:00:00.112803


In [5]:
start_time = datetime.datetime.now()

# Split every lower-cased title into a list of word tokens
raw_data['Title_tokens'] = raw_data.Title_lower.apply(nltk.tokenize.word_tokenize)

# Select the working columns in display order (columns not listed are dropped)
column_order = ['Id', 'Title', 'Title_punctfree', 'Title_lower', 'Title_tokens', 'Body', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn titles before and after the step
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in enumerate(random_indices, start=1):
    lowered_title = raw_data.Title_lower[row_label]
    title_tokens = raw_data.Title_tokens[row_label]
    print("{}a. Before tokenizing: {}".format(example_no, lowered_title))
    print("{}b. After tokenizing: {}\n".format(example_no, title_tokens))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

1a. Before tokenizing: why not set class members to public in the first place 
1b. After tokenizing: ['why', 'not', 'set', 'class', 'members', 'to', 'public', 'in', 'the', 'first', 'place']

2a. Before tokenizing: setting up flash cs4 to use adobe air 2 6
2b. After tokenizing: ['setting', 'up', 'flash', 'cs4', 'to', 'use', 'adobe', 'air', '2', '6']

3a. Before tokenizing: unknown events firing when i use onkeyup event handler of javascript
3b. After tokenizing: ['unknown', 'events', 'firing', 'when', 'i', 'use', 'onkeyup', 'event', 'handler', 'of', 'javascript']

4a. Before tokenizing: help me rewrite this regex to not match tags with attributes 
4b. After tokenizing: ['help', 'me', 'rewrite', 'this', 'regex', 'to', 'not', 'match', 'tags', 'with', 'attributes']

5a. Before tokenizing: how to modify the properties of a directshow filter output pin on the fly 
5b. After tokenizing: ['how', 'to', 'modify', 'the', 'properties', 'of', 'a', 'directshow', 'filter', 'output', 'p

In [6]:
start_time = datetime.datetime.now()

# Step3: Stopword removal

# Carefully choose which stopwords to remove based on problem, e.g 'not' keyword would be important in some NLP modelling problems like sentiment detection
stopwords = nltk.corpus.stopwords.words('english')

# Set copy of the stopword list: a membership test is a hash lookup instead of a
# linear scan of the whole list for every token. `stopwords` itself stays a list.
# Rebuilt every time this cell runs, so edits to `stopwords` above are picked up.
_stopword_set = frozenset(stopwords)


def stopword_removal(tokens):
    """Return the tokens that are not in the stopword list, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens of one document (this notebook passes lower-cased tokens).

    Returns
    -------
    list of str
        ``tokens`` with every entry found in ``stopwords`` removed.
    """
    return [item for item in tokens if item not in _stopword_set]

# Filter the stopwords out of every tokenised title
raw_data['Title_stop_removed'] = raw_data.Title_tokens.apply(stopword_removal)

# Select the working columns in display order (columns not listed are dropped)
column_order = [
    'Id', 'Title', 'Title_punctfree', 'Title_lower', 'Title_tokens',
    'Title_stop_removed', 'Body', 'Tags_shortlist',
]
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn titles before and after the step, with their tags
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in enumerate(random_indices, start=1):
    print("{}a. Before stopword removal: {}".format(example_no, raw_data.Title_lower[row_label]))
    print("{}b. After stopword removal: {}".format(example_no, raw_data.Title_stop_removed[row_label]))
    print("{}c. Tags: {}\n".format(example_no, raw_data.Tags_shortlist[row_label]))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

1a. Before stopword removal: ios 5   navigation and tab bar not appear after presentmodalviewcontroller
1b. After stopword removal: ['ios', '5', 'navigation', 'tab', 'bar', 'appear', 'presentmodalviewcontroller']
1c. Tags: iphone ios ios5

2a. Before stopword removal: redim preserve in c  
2b. After stopword removal: ['redim', 'preserve', 'c']
2c. Tags: c# arrays

3a. Before stopword removal: xml node access
3b. After stopword removal: ['xml', 'node', 'access']
3c. Tags: c# xml linq

4a. Before stopword removal:  net remoting not working in windows 7
4b. After stopword removal: ['net', 'remoting', 'working', 'windows', '7']
4c. Tags: c# .net

5a. Before stopword removal: git  change origin of cloned submodule
5b. After stopword removal: ['git', 'change', 'origin', 'cloned', 'submodule']
5c. Tags: git github


Block execution time:  0:00:02.387564


In [7]:
start_time = datetime.datetime.now()

# Step4: Stemming

# Try other stemming algorithms like snowball, lancaster, etc & understand what is the difference?
porter_stem = nltk.stem.PorterStemmer()


def stem_token(tokens):
    """Stem every token with the shared ``porter_stem`` instance.

    Returns a new list holding the stemmed form of each token, in input order.
    """
    return list(map(porter_stem.stem, tokens))

# Stem the stopword-free title tokens
raw_data['Title_stemmed'] = raw_data.Title_stop_removed.apply(stem_token)

# Select the working columns in display order; the intermediate title columns
# (punctuation-free, tokens, stopword-free) are not listed and are therefore dropped
column_order = ['Id', 'Title', 'Title_lower', 'Title_stemmed', 'Body', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn titles before and after the step
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in zip(range(1, 6), random_indices):
    print("{}a. Before stemming: {}".format(example_no, raw_data.Title_lower[row_label]))
    print("{}b. After stemming: {}\n".format(example_no, raw_data.Title_stemmed[row_label]))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

1a. Before stemming: how can i replace all the nan values with zero s in a column of a pandas dataframe
1b. After stemming: ['replac', 'nan', 'valu', 'zero', 'column', 'panda', 'datafram']

2a. Before stemming: stepwise regression specific to variable transformations
2b. After stemming: ['stepwis', 'regress', 'specif', 'variabl', 'transform']

3a. Before stemming: mysql  load data from file  into number of tables
3b. After stemming: ['mysql', 'load', 'data', 'file', 'number', 'tabl']

4a. Before stemming: 404 error when uploading large files  4mb     asp net
4b. After stemming: ['404', 'error', 'upload', 'larg', 'file', '4mb', 'asp', 'net']

5a. Before stemming: problem with clearing a list t 
5b. After stemming: ['problem', 'clear', 'list']


Block execution time:  0:00:14.333943


In [8]:
# Titles alone are not enough, so inspect the body of a few random questions

print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in enumerate(random_indices, start=1):
    print("{}a. Title: {}".format(example_no, raw_data.Title[row_label]))
    print("{}b. Body: {}".format(example_no, raw_data.Body[row_label]))
    print("{}c. Tags: {}\n\n\n".format(example_no, raw_data.Tags_shortlist[row_label]))


Few examples:

1a. Title: Populate list view
1b. Body: <p>I am developing a final year project and I am stuck at a point. I have to retrieve names of doctors from my database (MySQL database) and show it in a list view. I was able to establish a connection with the server and retrieve values, but when I tried to show the values in a list view, the application crashed!</p>

<p>I tried the same example given in <em>[Hello, Views, List View][4]</em>.</p>

<p>It works for a predefined array like</p>

<pre><code>private String lv_arr[]={"Android","iPhone","BlackBerry","AndroidPeople"};
</code></pre>

<p>but for a string array retrieved from the database it shows a run time exception. Is there any way I can achieve this?</p>

<pre><code>package com.proj;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;

import 

In [9]:
start_time = datetime.datetime.now()

def body_filter(s):
    """Split a question body into its prose and its code snippets.

    Returns a two-element list ``[text, code_blocks]``:

    * ``text`` is ``s`` with every ``<code>...</code>`` span, and then every
      remaining ``<...>`` tag, replaced by a single space.
    * ``code_blocks`` is the list of strings found between ``<code>`` and
      ``</code>``, in order of appearance.
    """
    regex_flags = re.MULTILINE | re.DOTALL

    # Pull the code snippets out first so that their contents never reach the prose
    code_pattern = re.compile(r'<code>(.*?)</code>', regex_flags)
    snippets = code_pattern.findall(s)
    without_code = code_pattern.sub(' ', s)

    # Blank out the remaining HTML tags.
    # Note this will also remove hyper links (<a href="link">), which may contain valuable information
    tag_pattern = re.compile(r'<(.*?)>', regex_flags)
    plain_text = tag_pattern.sub(' ', without_code)

    return [plain_text, snippets]
    
    
# Each element of `temp` is the [text, code_blocks] pair returned by body_filter
temp = raw_data.Body.apply(body_filter)

raw_data['Body_text'] = [text for text, _code in temp]
raw_data['Body_code'] = [code for _text, code in temp]

# Select the working columns in display order (columns not listed are dropped)
column_order = ['Id', 'Title', 'Title_stemmed', 'Body', 'Body_text', 'Body_code', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn bodies next to their extracted text and code
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in enumerate(random_indices, start=1):
    print("-------- Example {} --------\n".format(example_no))
    print("{}a. Body:\n {}".format(example_no, raw_data.Body[row_label]))
    print("\n\n{}b. Text:\n {}".format(example_no, raw_data.Body_text[row_label]))
    print("\n\n{}c. Code:\n {}\n\n\n\n".format(example_no, raw_data.Body_code[row_label]))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

-------- Example 1 --------

1a. Body:
 <p>I've got a huge (about half a GiB, impossible to use a usual text editor on) CSV file with fields enclosed in double quotes like <code>"abc","def"</code> but need a file without quotes (I am sure this is not going to break the file consistency - a comma is never used inside the values in it).</p>

<p>How to remove all the quotes (without introducing spaces on their places)?</p>



1b. Text:
  I've got a huge (about half a GiB, impossible to use a usual text editor on) CSV file with fields enclosed in double quotes like   but need a file without quotes (I am sure this is not going to break the file consistency - a comma is never used inside the values in it). 

 How to remove all the quotes (without introducing spaces on their places)? 



1c. Code:
 ['"abc","def"']




-------- Example 2 --------

2a. Body:
 <p>I have a console application which uploads jobs to the workers running in the cloud. The application connects to Azure 

In [10]:
start_time = datetime.datetime.now()

# Merging all fragmented code blocks in a datapoint

def merge_code(code_blocks):
    """Join the code snippets with single spaces, prefixed by one leading space.

    An empty list therefore yields the one-character string ``" "``.
    """
    joined = " ".join(code_blocks)
    return " {}".format(joined)

# Collapse each question's list of code snippets into one string
raw_data['Code'] = raw_data.Body_code.apply(merge_code)

# Select the working columns in display order (columns not listed are dropped)
column_order = ['Id', 'Title', 'Title_stemmed', 'Body', 'Body_text', 'Body_code', 'Code', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

# Print five randomly drawn snippet lists next to their merged string
print("Few examples:\n")
random_indices = np.random.randint(0, len(raw_data.Title), 5)

for example_no, row_label in zip(range(1, 6), random_indices):
    print("{}a. Before: {}".format(example_no, raw_data.Body_code[row_label]))
    print("{}b. After: {}\n".format(example_no, raw_data.Code[row_label]))

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

Few examples:

1a. Before: []
1b. After:  

2a. Before: []
2b. After:  


    &lt;path id="aaa"&gt;
        &lt;path path="D:/dev/mci/admin/build/classes"/&gt;
    &lt;/path&gt;

    &lt;ac:pathtofileset name="aaa.fileset"
                   pathrefid="aaa"
                   dir="${basedir}"/&gt;
    &lt;zip destfile="${build.war.full.filename}"&gt;
        &lt;mappedresources&gt;
            &lt;restrict&gt;
                &lt;fileset refid="aaa.fileset"/&gt;
                &lt;type type="file"/&gt;
            &lt;/restrict&gt;
            &lt;globmapper from="*" to="WEB-INF/classes/*"/&gt;
        &lt;/mappedresources&gt;
    &lt;/zip&gt;
&lt;/target&gt;


4a. Before: []
4b. After:  

5a. Before: ['class Buttons\n{\n    private int _ID;\n    private string _name;\n    private string _text;\n    private String[] _layers;\n    public int ID \n    {\n        get { return _ID; }\n        set { _ID = value; }\n    }\n    public string Name \n    {\n        get { return _name; }\n     

In [11]:
start_time = datetime.datetime.now()

# Apply all the preprocessing steps to the body as applied to title

def body_preprocessing(s):
    """Turn the prose of one question body into a list of stemmed tokens.

    Runs the same four steps as the title pipeline: punctuation removal,
    lower-casing, stopword removal and Porter stemming (with word tokenisation
    between lower-casing and stopword removal).

    Parameters
    ----------
    s : str
        Body text with code blocks and HTML tags already stripped.

    Returns
    -------
    list of str
        Stemmed, stopword-free tokens in their original order.
    """
    # Step1: Punctuation removal.
    # Each mark is replaced by a space, exactly as the title pipeline does.
    # Deleting the marks instead would fuse neighbouring words ("file.txt" ->
    # "filetxt") and turn contractions such as "I've" into "ive", which is not in
    # the stopword list.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    punctuation_free = s.translate(punct_to_space)

    # Step2: Converting to lower case
    s_lower = punctuation_free.lower()

    # Tokenizing the sentence
    tokens = nltk.tokenize.word_tokenize(s_lower)

    # Step3: Stopword removal.
    # A set makes each membership test a hash lookup instead of a scan of the list.
    stopword_set = set(stopwords)
    filtered_tokens = [item for item in tokens if item not in stopword_set]

    # Step4: Stemming
    stemmed_tokens = [porter_stem.stem(item) for item in filtered_tokens]

    return stemmed_tokens


# Run the full text pipeline over the prose part of every body
body_token_lists = raw_data.Body_text.apply(body_preprocessing)
raw_data['Body_stemmed'] = body_token_lists

# Select the working columns in display order; Body_text and Body_code are not
# listed and are therefore dropped
column_order = ['Id', 'Title', 'Title_stemmed', 'Body', 'Body_stemmed', 'Code', 'Tags_shortlist']
raw_data = raw_data.loc[:, column_order]

elapsed = datetime.datetime.now() - start_time
print("\nBlock execution time: ", elapsed)


Block execution time:  0:03:17.144526


In [12]:
start_time = datetime.datetime.now()

# Merging the processed tokens to save the datapoint as a single raw string
# To provide more weightage to title since it would contain more relevant & crisp information, the final raw string has title added 3 times vs body tokens added once

# Join each token list into one space-separated string
raw_data['Title_processed'] = raw_data.Title_stemmed.map(" ".join)
raw_data['Body_processed'] = raw_data.Body_stemmed.map(" ".join)

# Element-wise string concatenation on whole columns. This replaces a Python loop
# that did four label-based scalar lookups per row and only worked with the
# default 0..n-1 index; Series addition aligns on the index instead.
title_processed = raw_data['Title_processed']
raw_data['Title3_Body'] = (
    title_processed + " " + title_processed + " " + title_processed
    + " " + raw_data['Body_processed']
)

# Select the working columns in display order; the token-list columns are not
# listed and are therefore dropped
raw_data = raw_data.loc[:, ['Id', 'Title', 'Title_processed', 'Body', 'Body_processed', 'Title3_Body', 'Code', 'Tags_shortlist']]

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:12.656673


In [13]:
start_time = datetime.datetime.now()

# Save data to csv file
#### Imp: Careful not to overwrite data ####

try:
    # mode='x' is exclusive creation: the write fails rather than overwriting an existing file
    raw_data.to_csv('dataset/processed.csv', mode = 'x', index = False)
except FileExistsError:
    # Only "the file is already there" is an expected outcome. Any other failure
    # (missing 'dataset' directory, permissions, full disk, ...) is left to raise
    # instead of being misreported as an already-saved file.
    print("File already saved?")

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:10.833708


In [14]:
start_time = datetime.datetime.now()

# Train CV test split: Split the data in training, CV and test

# Fixed seed so that re-running this cell reproduces the same partition. The
# save cell writes with mode='x' and skips files that already exist, so an
# unseeded re-run would leave the in-memory split silently different from the
# CSVs on disk.
SPLIT_SEED = 42

# First carve off the test set, then divide the remainder into train and CV.
# With integer train_size, the rows not selected for training form the other part.
temp_data, test_data = train_test_split(raw_data, train_size = 125000, random_state = SPLIT_SEED)
train_data, cv_data = train_test_split(temp_data, train_size = 100000, random_state = SPLIT_SEED)

print(train_data.shape)
print(cv_data.shape)
print(test_data.shape)

print("\nBlock execution time: ", datetime.datetime.now() - start_time)

(100000, 8)
(25000, 8)
(25000, 8)

Block execution time:  0:00:00.618272


In [15]:
start_time = datetime.datetime.now()

# Each split and the CSV file it is written to
split_outputs = [
    (train_data, 'dataset/processed_train.csv'),
    (cv_data, 'dataset/processed_cv.csv'),
    (test_data, 'dataset/processed_test.csv'),
]

for split_frame, csv_path in split_outputs:
    try:
        # mode='x' is exclusive creation: the write fails rather than overwriting an existing file
        split_frame.to_csv(csv_path, mode = 'x', index = False)
    except FileExistsError:
        # Only "the file is already there" is an expected outcome. Any other
        # failure (missing directory, permissions, full disk, ...) is left to
        # raise instead of being misreported as an already-saved file.
        print("File already saved?")

print("\nBlock execution time: ", datetime.datetime.now() - start_time)


Block execution time:  0:00:09.912370
