In [17]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [18]:
# load data 

posts_df = pd.read_csv('../data/interum/stack_overflow_with_targets.zip')

In [19]:
posts_df.head(2)

Unnamed: 0,id,body,tags,text,target
0,111102,<p>How would you explain JavaScript closures t...,javascript|scope|closures,\n\nHow would you explain JavaScript closures ...,javascript
1,1642028,<p>After reading <a href= http://groups.google...,c++|c|operators|code-formatting|standards-comp...,\n\nAfter reading Hidden Features and Dark Cor...,c++


### preprocesss:
* converting all letters to lower or upper case
* converting numbers into words or removing numbers
* removing white spaces
* removing punctuations, accent marks and other diacritics

In [20]:
# keep necessary columns
text_target = posts_df[['text','target']]

In [21]:
def text_preprocess(post):
    '''
    input:
    post: a string with symbols and punctuations 
    returns:
    cleaned post with all letters to lower, all numbers, white space, and symbols removed
    '''
    pattern = r'[^A-Za-z]+' # anything that is not letter or space 
    processed = re.sub(pattern, ' ', post).strip().lower()
    return processed    

In [22]:
text_target['text'] = text_target.text.apply(text_preprocess)

* removing stop words, sparse terms, and particular words
* lemmatization 

In [23]:
# position tags function to word_net postition tag 
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 

In [24]:
lemmatizer = WordNetLemmatizer()
def token_lemma(post):
    '''
    input:
    post: cleaned post from function text_preprocess
    returns:
    tokenized post with lemmatization with position tags
    stopwords and tags are removed 
    '''
    tokens = word_tokenize(post)
    # stopwords
    stop_words = set(stopwords.words('english'))  # make sure no repeats
    # remove stopwords and remove words that are explicit tags
    words_to_remove = set(text_target.target.unique()).union(stop_words)
    # perform pos tag before stop word removal to include more context for pos tags 
    tags = nltk.pos_tag(tokens)
    tags_word_net = [get_wordnet_pos(w[1]) for w in tags]
    lem_result = []  # only include nonstop words and target tags 
    for i in range(len(tags_word_net)):
        if tags[i][0] in words_to_remove:  # don't lemmatize unneeded words 
            continue
        if tags_word_net[i]:  # not none 
            lem_result.append(lemmatizer.lemmatize(tags[i][0],tags_word_net[i]))
        else:
            lem_result.append(tags[i][0])
    return lem_result

In [26]:
# apply to text 
text_target['text'] = text_target.text.apply(token_lemma)

In [27]:
text_target.head()

Unnamed: 0,text,target
0,"[would, explain, closure, someone, knowledge, ...",javascript
1,"[read, hidden, feature, dark, corner, c, stl, ...",c++
2,"[redirect, user, one, page, another, use, jquery]",javascript
3,"[use, yield, keyword, example, try, understand...",python
4,"[check, one, string, contain, another, substri...",javascript


In [28]:
text_target.to_pickle('../data/interum/text_target.pkl')