In [1]:
import nltk
import warnings
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# load data: read the zipped CSV of posts into a DataFrame
posts_path = '../data/interum/stack_overflow_with_targets.zip'
posts_df = pd.read_csv(posts_path)

In [3]:
# quick look at the first two rows of the loaded frame
posts_df.head(n=2)

Unnamed: 0,id,body,tags,text,target
0,111102,<p>How would you explain JavaScript closures t...,javascript|scope|closures,\n\nHow would you explain JavaScript closures ...,javascript
1,1642028,<p>After reading <a href= http://groups.google...,c++|c|operators|code-formatting|standards-comp...,\n\nAfter reading Hidden Features and Dark Cor...,c++


### Preprocess:
* converting all letters to lower or upper case
* converting numbers into words or removing numbers
* removing white spaces
* removing punctuation, accent marks, and other diacritics

In [4]:
def text_preprocess(post):
    '''
    Normalize a raw post into lowercase, letters-only text.

    input:
    post: a string with symbols and punctuations; a missing value
          (None or NaN, e.g. an empty CSV cell read by pandas) is accepted too
    returns:
    cleaned post with all letters to lower; every run of numbers, white space,
    and symbols is collapsed into a single space and the ends are stripped.
    An empty string is returned when post is a missing value.
    '''
    # pandas represents an empty text cell as NaN (a float); re.sub would raise
    # TypeError on it, so a missing post is treated as empty text.
    # NaN is the only float that is not equal to itself.
    if post is None or (isinstance(post, float) and post != post):
        return ''
    # one or more characters that are not ASCII letters; spaces are matched as
    # well, which is what collapses repeated white space
    pattern = r'[^A-Za-z]+'
    processed = re.sub(pattern, ' ', post).strip().lower()
    return processed

In [6]:
# build the cleaned_text column by running text_preprocess over every post text
posts_df['cleaned_text'] = posts_df['text'].apply(text_preprocess)

## tokenize and lemmatize 
* removing stop words, sparse terms, and particular words
* lemmatization 

In [8]:
# translate a treebank part-of-speech tag into the matching wordnet constant
def get_wordnet_pos(treebank_tag):
    '''
    input:
    treebank_tag: part-of-speech tag taken from nltk.pos_tag output
    return:
    wordnet part-of-speech constant when the tag starts with N (noun),
    V (verb), J (adjective) or R (adverb); None for every other tag
    '''
    # the four prefixes are mutually exclusive, so the order of the checks
    # does not affect the result
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None

In [11]:
lemmatizer = WordNetLemmatizer()

# the English stop words are the same for every post, so the corpus is read
# once here instead of once per token_lemma call
_STOP_WORDS = frozenset(stopwords.words('english'))


def token_lemma(post, words_to_remove=None):
    '''
    input:
    post: cleaned post from function text_preprocess
    words_to_remove: optional set of tokens to drop from the result. When
        omitted, the English stop words plus every unique value of
        posts_df.target are used. Building that default scans the target
        column on each call, so when processing many posts pass a precomputed
        set, e.g. series.apply(token_lemma, words_to_remove=my_set).
    returns:
    tokenized post (a list of strings) with lemmatization with position tags;
    stopwords and tags are removed
    '''
    if words_to_remove is None:
        # remove stopwords and remove words that are explicit tags
        # NOTE(review): a target containing non-letter characters (e.g. 'c++')
        # can never equal a token here, because text_preprocess strips those
        # characters from the text -- confirm whether the targets should be
        # normalized the same way before being compared.
        words_to_remove = _STOP_WORDS.union(posts_df.target.unique())
    tokens = word_tokenize(post)
    # perform pos tag before stop word removal to include more context for pos tags
    lem_result = []  # only include nonstop words and non-target words
    for word, treebank_tag in nltk.pos_tag(tokens):
        if word in words_to_remove:  # don't lemmatize unneeded words
            continue
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos:  # the tag maps to a wordnet part of speech
            lem_result.append(lemmatizer.lemmatize(word, wordnet_pos))
        else:
            # no wordnet equivalent for this tag: keep the token unchanged
            lem_result.append(word)
    return lem_result

In [13]:
# tokenize and lemmatize every cleaned post; each row becomes a list of tokens
posts_df['tokens'] = posts_df['cleaned_text'].apply(token_lemma)

In [14]:
# preview the first five rows, now including the tokens column
posts_df.head(n=5)

Unnamed: 0,id,body,tags,text,target,cleaned_text,tokens
0,111102,<p>How would you explain JavaScript closures t...,javascript|scope|closures,\n\nHow would you explain JavaScript closures ...,javascript,how would you explain javascript closures to s...,"[would, explain, closure, someone, knowledge, ..."
1,1642028,<p>After reading <a href= http://groups.google...,c++|c|operators|code-formatting|standards-comp...,\n\nAfter reading Hidden Features and Dark Cor...,c++,after reading hidden features and dark corners...,"[read, hidden, feature, dark, corner, c, stl, ..."
2,503093,<p>How can I redirect the user from one page t...,javascript|jquery|redirect,\n\nHow can I redirect the user from one page ...,javascript,how can i redirect the user from one page to a...,"[redirect, user, one, page, another, use, jquery]"
3,231767,<p>What is the use of the <code>yield</code> k...,python|iterator|generator|yield|coroutine,\n\nWhat is the use of the `yield` keyword in ...,python,what is the use of the yield keyword in python...,"[use, yield, keyword, example, try, understand..."
4,1789945,<p>How can I check if one string contains anot...,javascript|string|string-matching,\n\nHow can I check if one string contains ano...,javascript,how can i check if one string contains another...,"[check, one, string, contain, another, substri..."


In [15]:
# overwrite cleaned_text with the space-joined tokens for feature engineering later
posts_df['cleaned_text'] = posts_df['tokens'].apply(' '.join)

In [16]:
# preview the first five rows after cleaned_text was rebuilt from the tokens
posts_df.head(n=5)

Unnamed: 0,id,body,tags,text,target,cleaned_text,tokens
0,111102,<p>How would you explain JavaScript closures t...,javascript|scope|closures,\n\nHow would you explain JavaScript closures ...,javascript,would explain closure someone knowledge concep...,"[would, explain, closure, someone, knowledge, ..."
1,1642028,<p>After reading <a href= http://groups.google...,c++|c|operators|code-formatting|standards-comp...,\n\nAfter reading Hidden Features and Dark Cor...,c++,read hidden feature dark corner c stl comp lan...,"[read, hidden, feature, dark, corner, c, stl, ..."
2,503093,<p>How can I redirect the user from one page t...,javascript|jquery|redirect,\n\nHow can I redirect the user from one page ...,javascript,redirect user one page another use jquery,"[redirect, user, one, page, another, use, jquery]"
3,231767,<p>What is the use of the <code>yield</code> k...,python|iterator|generator|yield|coroutine,\n\nWhat is the use of the `yield` keyword in ...,python,use yield keyword example try understand code ...,"[use, yield, keyword, example, try, understand..."
4,1789945,<p>How can I check if one string contains anot...,javascript|string|string-matching,\n\nHow can I check if one string contains ano...,javascript,check one string contain another substring usu...,"[check, one, string, contain, another, substri..."


In [17]:
# persist the processed frame as a pickle file
output_path = '../data/interum/text_target.pkl'
posts_df.to_pickle(output_path)