In [1]:
# dataframe management
import pandas as pd             

# numerical computation
import numpy as np

# visualization library
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   


# seaborn can generate several warnings, we ignore them
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Load the full training set from the working directory.
dataset = pd.read_csv('train.csv')
# Alternative input: the sampled subset that is saved as ./subdataset.csv further down.
#dataset = pd.read_csv('./subdataset.csv')
dataset.head() # display the first rows (head() shows 5 by default)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
# Check the dimensions of the dataset: shape is (number of rows, number of columns)
rows, columns = dataset.shape
print("number of rows: " + str(rows))
print("number of columns: " + str(columns))

number of rows: 1306122
number of columns: 3


The dataset is very large: building a bag-of-words model and training algorithms on all of it would be too expensive.
Since the performance of the classifier is not an evaluation criterion for this project, I will take a random subset of the dataset and work on that from now on.

In [5]:
# Draw a reproducible random subset: with a fixed seed, Restart & Run All
# selects the same rows (and therefore regenerates the same outputs) every time.
SAMPLE_SIZE = 10000
SEED = 42
# min() keeps the cell usable when the loaded frame is already smaller than SAMPLE_SIZE.
dataset = dataset.sample(n=min(SAMPLE_SIZE, len(dataset)), random_state=SEED)

# Persist the subset so it can be reloaded without re-reading train.csv.
# The index is written as well, so reading the file back yields an extra
# unnamed first column unless index_col=0 is passed to read_csv.
dataset.to_csv(r'./subdataset.csv')

In [6]:
# Check the dimensions again, now on the sampled dataset
rows, columns = dataset.shape
print("number of rows: " + str(rows))
print("number of columns: " + str(columns))

number of rows: 10000
number of columns: 3


In [9]:
# Check the percentage of positive and negative examples.
# The target column is summed in one vectorised call instead of a Python loop
# over the rows; int() keeps the arithmetic in plain Python numbers so the
# printed figures are formatted exactly as before.
positive = int(dataset['target'].sum())
positiveShare = positive*100/rows

print(str(positiveShare) + "% of instancies are positive")
print(str(100-positiveShare) + "% of instancies are negative")

6.21% of instancies are positive
93.79% of instancies are negative


In [10]:
class document:
    """A parsed question: its word dictionary together with its label."""

    def __init__(self, words, target):
        # words:  dictionary of the words contained in the question
        # target: label of the question
        self.words, self.target = words, target

In [11]:
def parse(text):
    """Split text on whitespace and return its tokens lower-cased, in order.

    Punctuation stays attached to the tokens; an empty text gives an empty list.
    """
    return [token.lower() for token in text.split()]

In [13]:
# Build a dictionary of documents, one entry per question.
#
# questionDictionary: qid -> document (wordDictionary, target)
# wordDictionary:     word contained in the given question -> number of occurrences
#
# For each row:
#   - parse the text (list of contained words) and add those words to the
#     vocabulary of the collection
#   - store the word counts and the target under the question id

questionDictionary = {}
vocabulary = set()

update = 0

for row in dataset.itertuples():
    # progress report every 1000 rows
    if update % 1000 == 0:
        print(str(round((update*100/rows),2)) + "% done." )
    update += 1

    words = parse(row.question_text)
    vocabulary.update(words)

    # count the occurrences of each word in a single pass
    wordDictionary = {}
    for word in words:
        wordDictionary[word] = wordDictionary.get(word, 0) + 1

    questionDictionary[row.qid] = document(wordDictionary, row.target)

print("done!")

0.0% done.
10.0% done.
20.0% done.
30.0% done.
40.0% done.
50.0% done.
60.0% done.
70.0% done.
80.0% done.
90.0% done.
done!


In [14]:
# Check the size of the vocabulary
vocabularySize = len(vocabulary)
print("in the collection there are " + str(vocabularySize) + " distinct words." )

# Print 10 of them; while vocabulary is still a set the selection order is arbitrary
firstWords = list(vocabulary)[:10]
print("first then words:")
print(firstWords)

in the collection there are 21359 distinct words.
first then words:
['columbia', 'americans,', 'supporting', 'regulation', 'copper', 'rap', 'detox', 'unacademy?', 'someday?', 'dubstep']


In [15]:
# Order the vocabulary (sorted() accepts any iterable and returns a new list;
# strings are compared by code point, so punctuation and digits come before letters)
vocabulary = sorted(vocabulary)
# Explore the vocabulary: the first 40 entries, then a slice further in
print(vocabulary[:40])
print()
print(vocabulary[990:1000])

['!', '!"?', '"', '"/proj3/mypipe"', '"100%', '"104"', '"200', '"700000"', '"?', '"a', '"abiogenesis"?', '"age', '"ai', '"alice\'s', '"all', '"american', '"andare', '"anger', '"anus', '"are', '"arm"', '"as', '"barkha', '"because', '"being', '"beliefs"?', '"big', '"bites".', '"black', '"blibber-blubber"', '"bluetooth', '"bn', '"bougie"', '"bubble', '"bud,', '"bundmar"', '"bye"', '"bytes"?', '"c"', '"c",']

['2?', '2^x', '2a', '2d', '2examples', '2k18', '2k18?', '2l', '2nd', '2no+o2>2no2?']


We can notice that the vocabulary contains a lot of elements that are not actual words, which greatly increases its size.
Before building the bag-of-words model we should apply a better parser that extracts only actual words and removes useless ones.
For now let's continue this way; further preprocessing will be performed in other notebooks (Preprocessing1, ...), and the performance of the models built from the different bag-of-words representations will be compared.

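Now we have to create a table whose rows are vectors with one column per vocabulary word (#columns = size of the vocabulary),
and for each row set the corresponding column to 1 if the question contains that word, 0 otherwise (the code below actually stores the number of occurrences of the word, so a value can be greater than 1).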

i.e.
V = {cat, dog, mouse}

q1: id = 1234; text = {cat, mouse}; target = 0 .  

--> corresponding row: 

| 1234 (id) | 1 (cat) | 0 (dog) | 1 (mouse) | 0 (target) |

In [16]:
#from numba import jit

#@jit
def extractRow(qid, questions=None, vocab=None):
    """Build the bag-of-words row of one question.

    Parameters
    ----------
    qid
        Key of the question in the question dictionary.
    questions : optional
        Mapping qid -> object exposing ``words`` (word -> number of
        occurrences) and ``target``. Defaults to the notebook-level
        ``questionDictionary``.
    vocab : optional
        Iterable of words defining the column order. Defaults to the
        notebook-level ``vocabulary``.

    Returns
    -------
    list
        One entry per vocabulary word holding the number of times the word
        occurs in the question (0 when it does not occur), followed by the
        question's target as the last element. Note that the entries are
        occurrence counts, not 0/1 flags.

    Raises
    ------
    KeyError
        If ``qid`` is not a key of the question dictionary.
    """
    if questions is None:
        questions = questionDictionary
    if vocab is None:
        vocab = vocabulary

    question = questions[qid]
    questionWords = question.words
    # dict.get supplies 0 for words the question does not contain, without a
    # bare except that would also hide unrelated errors
    row = [questionWords.get(word, 0) for word in vocab]
    row.append(question.target)
    return row

In [17]:
# Give every question a row position, following the iteration order of questionDictionary
questionIndexDictionary = {
    qid: position for position, qid in enumerate(questionDictionary)
}

Let's create a new DataFrame for the bag-of-words model:

In [18]:
# Column labels: one per vocabulary word, with the label column last
columnsName = [*vocabulary, 'TARGET']

In [19]:
# Start from an empty frame that only carries the column labels
df = pd.DataFrame(data=[], columns=columnsName)

In [20]:
# Build the matrix (one row per question).
# Rows are written into a preallocated array and the DataFrame is created once
# at the end: assigning df.loc[i] on a frame that grows row by row reallocates
# the frame at every step, which becomes very slow at this size.
matrix = np.zeros((len(questionDictionary), len(columnsName)), dtype=np.int64)

index = 0
for qid in questionDictionary.keys():
    # progress report every 1000 questions
    if (index%1000 == 0):
        print(str(round((index*100/rows),2)) + "% done." )
    index +=1

    # questionIndexDictionary gives the row position assigned to this question
    matrix[questionIndexDictionary[qid]] = extractRow(qid)

df = pd.DataFrame(matrix, columns=columnsName)
print("done!")

0.0% done.
10.0% done.
20.0% done.
30.0% done.
40.0% done.
50.0% done.
60.0% done.
70.0% done.
80.0% done.
90.0% done.
done!


In [21]:
# Preview the first rows of the bag-of-words frame
df.head()

Unnamed: 0,!,"!""?","""","""/proj3/mypipe""","""100%","""104""","""200","""700000""","""?","""a",...,“writing”,…,…just,…only,₹,"₹15,000",√,√3√3√3√3√3?,❓?,TARGET
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Write the bag-of-words frame to disk as CSV (the index is written as the first column)
df.to_csv(r'./BagOfWordDataSet0.csv')