In [1]:
import numpy as np
import pandas as pd

In [2]:
so_data = pd.read_csv("../data/raw/stack_overflow_parsed_text.zip")
label_data = pd.read_csv("../data/raw/stack_overfow_tags.zip")


In [10]:
so_data.head()

Unnamed: 0,id,body,tags,text
0,11227809,<p>Here is a piece of C++ code that seems very...,java|c++|performance|optimization|branch-predi...,\n\nHere is a piece of C++ code that seems ver...
1,927358,<p>I committed the wrong files to Git.</p> <p>...,git|git-commit|git-reset|git-revert,\n\nI committed the wrong files to Git.\n\nHow...
2,179123,<p>I wrote the wrong thing in a commit message...,git|git-commit|git-rewrite-history|amend,\n\nI wrote the wrong thing in a commit messag...
3,2003505,<p>I want to delete a branch both locally and ...,git|github|git-branch|git-remote,\n\nI want to delete a branch both locally and...
4,477816,<p>I've been messing around with <a href= http...,json|content-type,\n\nI've been messing around with JSON for som...


In [9]:
label_data.head(10)

Unnamed: 0,tag,count
0,java,7562
1,c#,7166
2,javascript,6963
3,android,5729
4,python,5093
5,c++,3961
6,.net,3360
7,jquery,3306
8,html,2771
9,php,2726


## Select the top tags that we want for classifiers and make target

In this case, we are looking at the most common tags, and selecting from those just languages.

With our tags, loop through each row and apply a target if a label matches part of the tag.

In [13]:
def make_target(tag):
    current_label = ""
    top_5 = ["java", "c#", "javascript", "python", "c++"]
    
    for label in top_5:
        if label in tag:
            if current_label == "":
                current_label += label
            else: 
                current_label += "|" + label
    
    if current_label == "":
        return np.NaN
    
    if current_label == "java|javascript":
        return "javascript"
    
    return current_label

so_data["target"] = so_data.tags.apply(make_target)

In [15]:
so_data.dropna(inplace=True)

In [18]:
so_data

Unnamed: 0,id,body,tags,text,target
0,11227809,<p>Here is a piece of C++ code that seems very...,java|c++|performance|optimization|branch-predi...,\n\nHere is a piece of C++ code that seems ver...,java|c++
6,111102,<p>How would you explain JavaScript closures t...,javascript|scope|closures,\n\nHow would you explain JavaScript closures ...,javascript
7,1642028,<p>After reading <a href= http://groups.google...,c++|c|operators|code-formatting|standards-comp...,\n\nAfter reading Hidden Features and Dark Cor...,c++
8,503093,<p>How can I redirect the user from one page t...,javascript|jquery|redirect,\n\nHow can I redirect the user from one page ...,javascript
10,231767,<p>What is the use of the <code>yield</code> k...,python|iterator|generator|yield|coroutine,\n\nWhat is the use of the `yield` keyword in ...,python
12,1789945,<p>How can I check if one string contains anot...,javascript|string|string-matching,\n\nHow can I check if one string contains ano...,javascript
13,1335851,<p>Recently I ran some of my JavaScript code t...,javascript|syntax|jslint|use-strict,\n\nRecently I ran some of my JavaScript code ...,javascript
14,178325,<p>In jQuery it is possible to toggle the visi...,javascript|jquery|dom|visibility,\n\nIn jQuery it is possible to toggle the vis...,javascript
15,14994391,<p>Suppose I'm familiar with developing client...,javascript|jquery|angularjs|design,\n\nSuppose I'm familiar with developing clien...,javascript
16,6841333,<p>If I run the following program which parses...,java|date|timezone,\n\nIf I run the following program which parse...,java


## check to see where any overlap is

We want to make sure that there is no overlap in our tags as that could throw off our results.

In [17]:
so_data.target.value_counts()

java                      7470
c#                        6986
javascript                6944
python                    5025
c++                       3877
java|c#                    100
c#|c++                      52
java|c++                    51
java|c#|javascript          45
java|python                 40
python|c++                  34
java|javascript|python      32
java|c#|c++                 11
c#|python                   10
java|javascript|c++          4
java|c#|python               3
java|python|c++              1
Name: target, dtype: int64

We will drop any of the overlapping targets listed above because we will still have enough data for our model training.

In [19]:
so_data.target = so_data.target.apply(lambda label: np.NaN if "|" in label else label)

In [20]:
so_data.dropna(inplace=True)

In [26]:
so_data.target.value_counts(normalize=True)

java          0.246518
c#            0.230546
javascript    0.229160
python        0.165831
c++           0.127945
Name: target, dtype: float64

## Save our labeled data

In [25]:
so_data.to_csv("../data/interum/stack_overflow_with_targets.csv", index=False)