##### In this notebook we load the dataset we considered in Preprocessing0.ipynb and perform a different preprocessing.
The structure of this notebook is similar to Preprocessing0.ipynb; the main difference is in how the text from the original dataset is parsed.<br> <br>
In particular, when building the BagOfWord model we filter the words in the vocabulary so that meaningless tokens are not matched (such as tokens without any alphabetical character, or composed only of punctuation characters).<br> <br>
We also apply stemming with the PorterStemmer.<br> <br>

#### In addition (w.r.t. Preprocessing1.ipynb), here we also perform stop-word removal.
 Finally, in ModelsComparison2.ipynb we will compare different models (in the same way we did in ModelsComparison0.ipynb), this time on the new BagOfWord model, and check whether the classifiers benefit from the different preprocessing.

In [1]:
# dataframe management
import pandas as pd             

# numerical computation
import numpy as np

# visualization library
import seaborn as sns
# global seaborn theme: "white" axes style, with seaborn color codes enabled
sns.set(style="white", color_codes=True)
# override the font family and the font/title/label sizes used by later plots
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   




# seaborn can generate several warnings, we ignore them
# NOTE: this filter silences every Python warning raised from here on,
# not only the ones coming from seaborn
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Load the sub-sampled question dataset (path is relative to the notebook)
dataset = pd.read_csv('./subdataset.csv')
dataset.head() # show the first instances (the output below lists 5 rows)

Unnamed: 0.1,Unnamed: 0,qid,question_text,target
0,424368,532eafa4f3c40e97872d,Why am I not ticklish?,0
1,487555,5f7adc919173a8ce382b,Why do people say homeopathy does not work whe...,0
2,813017,9f4c454107555556944f,What if someone merges a question you answered...,0
3,956454,bb67c4798b448b68d48e,What causes variations in DLC?,0
4,935420,b750325197b2ba03ec35,Can indefinite integrals have different soluti...,0


In [3]:
# Check the dimensions of the dataset: shape is (number of rows, number of columns)
rows, columns = dataset.shape
print("number of rows: ", rows, sep="")
print("number of columns: ", columns, sep="")

number of rows: 10000
number of columns: 4


In [4]:
# Check the percentage of positive and negative examples.
# The `target` column is summed in one vectorized call instead of looping over
# every row in Python; the sum equals the number of positive examples as long
# as `target` only holds 0/1 labels (as in the rows shown above).
# `rows` is the number of rows computed in the previous cell.
positive = int(dataset['target'].sum())

print(str(positive*100/rows) + "% of instancies are positive")
print(str(100-positive*100/rows) + "% of instancies are negative")



6.21% of instancies are positive
93.79% of instancies are negative


In [5]:
class document:
    """One parsed question: the words it contains plus its label."""

    def __init__(self, words, target):
        # label of the question
        self.target = target
        # dictionary of the words contained in the question
        self.words = words

Now we have to create a database where the rows are vectors with #columns = size of vocabulary
and for each row set the corresponding columns to 1 if the question contains that word, 0 otherwise.

i.e.
V = {cat, dog, mouse}

q1: id = 1234; text = {cat, mouse}; target = 0 .  

--> corresponding row: 

| 1234 (id) | 1 (cat) | 0 (dog) | 1 (mouse) | 0 (target) |



The following 2 functions will take care of the preprocessing:

In [6]:
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()

### relative path of the file
stopwords_path ="./stopwords.txt"
# read the whitespace-separated stop words; the context manager closes the
# file handle, which the bare open(...).read() never did
with open(stopwords_path) as stopwords_file:
    stopwords = stopwords_file.read()
stopwords_list=stopwords.split()

# Extend the list with the stemmed form of every stop word, because parse2
# compares *stemmed* tokens against this list.
# Newly appended stems are visited as well (same result as iterating over the
# growing list), but the walk is now explicit and membership is checked
# against a set instead of scanning the whole list each time.
known_stopwords = set(stopwords_list)
position = 0
while position < len(stopwords_list):
    stemmed = stemmer.stem(stopwords_list[position])
    if stemmed not in known_stopwords:
        known_stopwords.add(stemmed)
        stopwords_list.append(stemmed)
    position += 1

In [7]:
def remove_punt_and_number(string):
    """Return ``string`` without digits, question marks and full stops.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The characters of ``string`` in their original order, minus every
        occurrence of ``0``-``9``, ``?`` and ``.``. Every other character
        (letters, hyphens, other punctuation) is kept. An empty input gives
        an empty string.
    """
    removed = frozenset('0123456789?.')
    # a single join builds the result in one pass, instead of creating a new
    # string for every kept character
    return ''.join(char for char in string if char not in removed)

In [8]:
from nltk.tokenize import RegexpTokenizer
import re

def parse2(string):
    """Turn a question text into the list of stemmed words kept for the model.

    Steps, in order:
      1. lower-case the text and extract tokens with a regular expression
         (letters with optional hyphens, or digit/letter combinations);
      2. split hyphenated tokens such as "a-class" into their parts;
      3. strip digits, '?' and '.' from each part (remove_punt_and_number)
         and stem it with the Porter stemmer;
      4. keep a stem only if it has at least 3 letters in a-z and is not in
         stopwords_list.

    Returns the kept stems in order of appearance (repetitions included).
    """
    pattern = '([a-z]*[-]?[a-z]+[-]?[a-z]*|[0-9]+[-]?[a-z]+|[a-z]+[-]?[0-9]+)'
    raw_tokens = RegexpTokenizer(pattern).tokenize(string.lower())

    # split in single words the words such as "a-class"
    pieces = [piece for raw in raw_tokens for piece in raw.split("-")]

    porter = PorterStemmer()
    kept = []
    for piece in pieces:
        stem = porter.stem(remove_punt_and_number(piece))
        letters = re.findall('[a-z]', stem)
        # discard too-short (useless) words and stop words
        if len(letters) >= 3 and stem not in stopwords_list:
            kept.append(stem)
    return kept

In [9]:
# Build a basic bag-of-words model.
#
# An efficient data structure is needed, so a dictionary of dictionaries is used:
#   questionDictionary: qid -> document(wordDictionary, target)
#   wordDictionary:     word contained in the question -> number of occurrences
#
# For each row of the dataset:
#   - parse the text (list of contained words) and update the vocabulary
#     of the collection
#   - add an entry to questionDictionary

questionDictionary = {}
vocabulary = set()

update = 0

for row in dataset.itertuples():
    # progress report every 1000 questions
    if update % 1000 == 0:
        print(str(round((update*100/rows),2)) + "% done." )
    update += 1

    # this time the text is parsed with the new parser
    words = parse2(row.question_text)
    vocabulary.update(words)

    # count the occurrences of each word of this question
    wordDictionary = {}
    for word in words:
        wordDictionary[word] = wordDictionary.get(word, 0) + 1

    questionDictionary[row.qid] = document(wordDictionary, row.target)

print("done!")

0.0% done.
10.0% done.
20.0% done.
30.0% done.
40.0% done.
50.0% done.
60.0% done.
70.0% done.
80.0% done.
90.0% done.
done!


In [10]:
# Check the vocabulary produced by the new parser
print("in the collection there are ", len(vocabulary), " distinct words.", sep="")

# print 10 of the words
print("first then words:")
print(list(vocabulary)[:10])

in the collection there are 10112 distinct words.
first then words:
['ban', 'armpit', 'lbgt', 'highschool', 'poor', 'inject', 'circadian', 'hold', 'pain', 'plaid']


In [11]:
# Sort the vocabulary alphabetically; from here on `vocabulary` is a list
vocabulary = sorted(vocabulary)
# Explore the vocabulary: its first 40 words and a slice further in
print(vocabulary[:40])
print()
print(vocabulary[990:1000])

['aaa', 'aadhaar', 'aadhar', 'aakash', 'aalto', 'aaron', 'aathar', 'abathroom', 'abbi', 'abbrevi', 'abdomin', 'abduct', 'abel', 'aberdeen', 'abid', 'abil', 'abiogenesi', 'ableism', 'abnorm', 'aboard', 'abolish', 'abomin', 'abort', 'abroad', 'abscess', 'absenc', 'absolut', 'absorb', 'abstract', 'abstrus', 'absurd', 'abudhabi', 'abus', 'academ', 'academi', 'academia', 'acaj', 'acc', 'acca', 'acccid']

['bitsat', 'bitten', 'bitter', 'bittersweet', 'bizarr', 'bjmc', 'bjp', 'blabber', 'black', 'blackberri']


In [12]:
#Let's create the new DataSet

#for every question:
    #create a new row containing qID, bagOfWords, target

In [13]:
#from numba import jit

#@jit
def extractRow(qid, questions=None, vocab=None):
    """Build the bag-of-words row of one question.

    Parameters
    ----------
    qid : hashable
        Key of the question in ``questions``.
    questions : mapping, optional
        Maps a question id to an object exposing ``words`` (a dictionary
        word -> number of occurrences) and ``target``. Defaults to the
        notebook-level ``questionDictionary``.
    vocab : iterable of str, optional
        Words that define the columns, in column order. Defaults to the
        notebook-level ``vocabulary``.

    Returns
    -------
    list
        One entry per word of ``vocab`` (the number of occurrences stored for
        that word, 0 when the question does not contain it), followed by the
        question's target as last element.

    Raises
    ------
    KeyError
        If ``qid`` is not a key of ``questions``.
    """
    if questions is None:
        questions = questionDictionary
    if vocab is None:
        vocab = vocabulary

    question = questions[qid]
    questionWords = question.words
    # dict.get returns 0 for words the question does not contain; no bare
    # `except` is needed, so unrelated errors are no longer swallowed
    row = [questionWords.get(word, 0) for word in vocab]
    row.append(question.target)
    return row

In [14]:
# qid -> position of the question, following the insertion order of questionDictionary
questionIndexDictionary = {
    qid: index for index, qid in enumerate(questionDictionary)
}

In [15]:
# number of indexed questions
len(questionIndexDictionary)

10000

In [16]:
# word -> position of the word in the (sorted) vocabulary
wordIndexDictionary = {
    word: index for index, word in enumerate(vocabulary)
}

In [17]:
# number of indexed words
len(wordIndexDictionary)

10112

In [18]:
# column names of the new dataset: one per vocabulary word, then the label
columnsName = [*vocabulary, 'TARGET']

In [19]:
# empty frame with one column per vocabulary word plus 'TARGET'; rows are added in the next cell
df = pd.DataFrame([],columns=columnsName)

In [20]:
# Create the new DataFrame.
# The rows are collected in a plain list and the frame is built with a single
# constructor call: adding them one at a time with df.loc[label] = row makes
# pandas enlarge the frame on every iteration, which is very slow for
# thousands of rows this wide.
records = []
index = 0
for qid in questionDictionary.keys():
    # progress report every 1000 questions
    if (index%1000 == 0):
        print(str(round((index*100/rows),2)) + "% done." )
    index +=1

    records.append(extractRow(qid))

# Same row labels (position of each qid) and same columns as the row-by-row version.
df = pd.DataFrame(
    records,
    index=[questionIndexDictionary[qid] for qid in questionDictionary],
    columns=columnsName,
)
print("done!")

0.0% done.
10.0% done.
20.0% done.
30.0% done.
40.0% done.
50.0% done.
60.0% done.
70.0% done.
80.0% done.
90.0% done.
done!


In [21]:
# preview the first rows of the bag-of-words table
df.head()

Unnamed: 0,aaa,aadhaar,aadhar,aakash,aalto,aaron,aathar,abathroom,abbi,abbrevi,...,zookeep,zoolog,zoologist,zoroastrian,zstsn,zubeen,zuckerberg,zuni,zusak,TARGET
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# save the bag-of-words table next to the notebook (to_csv called with its default options)
df.to_csv(r'./BagOfWordDataSet2.csv')