# Data loading and preparation

### Loading required python libraries

In [1]:
import pandas as pd

In [2]:
# Loading and checking the training dataset
df_train = pd.read_csv('./../data/training_data.txt', header=None)
df_train.head()

Unnamed: 0,0
0,put the cone on the red square on the square
1,take the cone on the square
2,take the block
3,put the red cone on the square
4,put the block on the blue square on the square


In [3]:
# Loading and checking the test dataset
df_test = pd.read_csv('./../data/test_data_v0.txt', header=None)
df_test.head()

Unnamed: 0,0
0,take the block on the green circle\t\t\t
1,put the block on the circle on the red circle\t\t
2,put the green cone on the square\t\t\t
3,take the red cone\t\t\t\t\t
4,put the green block on the square\t\t\t


In [4]:
# Remove the unnecessary trailing tabs in test dataset 
test = df_test[0].map(str.strip)

test.head()

0               take the block on the green circle
1    put the block on the circle on the red circle
2                 put the green cone on the square
3                                take the red cone
4                put the green block on the square
Name: 0, dtype: object

In [5]:
# Convert the train dataset to a pandas series
train = df_train[0]

train.head()

0      put the cone on the red square on the square
1                       take the cone on the square
2                                    take the block
3                    put the red cone on the square
4    put the block on the blue square on the square
Name: 0, dtype: object

In [6]:
# Spliting the training dataset into response and predictors
y_train = train.map(lambda x: x.split()[0])
x_train = train.map(lambda x: ' '.join(x.split()[1:]))

In [7]:
# Spliting the test dataset into response and predictors
y_test = test.map(lambda x: x.split()[0])
x_test = test.map(lambda x: ' '.join(x.split()[1:]))

## Q1 Analysis

### Loading Required python libraries

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

In [9]:
# Run this if nltk is not configured before
# =========================================
# nltk.download()

In [10]:
# Define stop words
stop_words = set(stopwords.words('english'))

In [11]:
def removeStopWords(x):
    """Return only words that are not in stop_words"""
    return [w for w in x if not w in stop_words]

In [12]:
def getLemma(x):
    """Return the lemma of each word"""
    return [WordNetLemmatizer().lemmatize(w) for w in x]

In [13]:
# Tokenize each sentence in the training set, remove stop-words and take the lemma
x = x_train.map(word_tokenize).map(removeStopWords).map(getLemma)

### Calculate Word Counts

In [14]:
# Get Unigram Word Counts
unigram_wcounts = x.groupby(y_train).apply(lambda x: [w for rec in x for w in rec]).map(nltk.FreqDist)
unigram_wcounts = pd.DataFrame(list(unigram_wcounts), index=unigram_wcounts.index)
unigram_wcounts

Unnamed: 0_level_0,block,blue,circle,cone,cube,green,red,square
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
put,26,23,45,23,16,10,47,47
take,21,5,6,6,8,7,11,4


In [15]:
# Get Bigram Word Counts
bigram_wcounts = x.groupby(y_train).apply(lambda x: [w for rec in x for w in nltk.bigrams(rec)]).map(nltk.FreqDist)
bigram_wcounts = pd.DataFrame(list(bigram_wcounts), index=bigram_wcounts.index)
bigram_wcounts

Unnamed: 0_level_0,"(block, blue)","(block, circle)","(block, green)","(block, red)","(block, square)","(blue, block)","(blue, circle)","(blue, cone)","(blue, cube)","(blue, square)",...,"(green, square)","(red, block)","(red, circle)","(red, cone)","(red, cube)","(red, square)","(square, blue)","(square, circle)","(square, red)","(square, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,6,5,3.0,9,3,4,5.0,6,3,5,...,2.0,6,16,6,2,17.0,1.0,3.0,4.0,4.0
take,1,1,,1,1,2,,1,1,1,...,,6,3,1,1,,,,,


In [16]:
# Get Trigram Word Counts
trigram_wcounts = x.groupby(y_train).apply(lambda x: [w for rec in x for w in nltk.trigrams(rec)]).map(nltk.FreqDist)
trigram_wcounts = pd.DataFrame(list(trigram_wcounts), index=trigram_wcounts.index)
trigram_wcounts

Unnamed: 0_level_0,"(block, blue, circle)","(block, blue, square)","(block, circle, blue)","(block, circle, circle)","(block, circle, red)","(block, circle, square)","(block, green, circle)","(block, green, square)","(block, red, circle)","(block, red, square)",...,"(red, cone, green)","(red, cone, square)","(red, cube, red)","(red, cube, square)","(red, square, circle)","(red, square, red)","(red, square, square)","(square, blue, circle)","(square, red, circle)","(square, red, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,2.0,4,1.0,1.0,1.0,1.0,2.0,1.0,4,5.0,...,1.0,3.0,2.0,,1.0,1.0,1.0,1.0,2.0,2.0
take,,1,,,,,,,1,,...,,,,1.0,,,,,,


### Calculate Total Word Counts

In [17]:
# Unigram total counts
unigram_total_wcount = unigram_wcounts.sum(axis=1)
unigram_total_wcount

0
put     237
take     68
dtype: int64

In [18]:
# Bigram total counts
bigram_total_wcount = bigram_wcounts.sum(axis=1)
bigram_total_wcount

0
put     172.0
take     33.0
dtype: float64

In [19]:
# Trigram total counts
trigram_total_wcount = trigram_wcounts.sum(axis=1)
trigram_total_wcount

0
put     107.0
take      9.0
dtype: float64

### Calculate Probabilities

In [23]:
unigram_probs = unigram_wcounts.div(unigram_total_wcount, axis=0)
unigram_probs

Unnamed: 0_level_0,block,blue,circle,cone,cube,green,red,square
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
put,0.109705,0.097046,0.189873,0.097046,0.067511,0.042194,0.198312,0.198312
take,0.308824,0.073529,0.088235,0.088235,0.117647,0.102941,0.161765,0.058824


In [24]:
bigram_probs = bigram_wcounts.div(bigram_total_wcount, axis=0)
bigram_probs

Unnamed: 0_level_0,"(block, blue)","(block, circle)","(block, green)","(block, red)","(block, square)","(blue, block)","(blue, circle)","(blue, cone)","(blue, cube)","(blue, square)",...,"(green, square)","(red, block)","(red, circle)","(red, cone)","(red, cube)","(red, square)","(square, blue)","(square, circle)","(square, red)","(square, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,0.034884,0.02907,0.017442,0.052326,0.017442,0.023256,0.02907,0.034884,0.017442,0.02907,...,0.011628,0.034884,0.093023,0.034884,0.011628,0.098837,0.005814,0.017442,0.023256,0.023256
take,0.030303,0.030303,,0.030303,0.030303,0.060606,,0.030303,0.030303,0.030303,...,,0.181818,0.090909,0.030303,0.030303,,,,,


In [25]:
trigram_probs = trigram_wcounts.div(trigram_total_wcount, axis=0)
trigram_probs

Unnamed: 0_level_0,"(block, blue, circle)","(block, blue, square)","(block, circle, blue)","(block, circle, circle)","(block, circle, red)","(block, circle, square)","(block, green, circle)","(block, green, square)","(block, red, circle)","(block, red, square)",...,"(red, cone, green)","(red, cone, square)","(red, cube, red)","(red, cube, square)","(red, square, circle)","(red, square, red)","(red, square, square)","(square, blue, circle)","(square, red, circle)","(square, red, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,0.018692,0.037383,0.009346,0.009346,0.009346,0.009346,0.018692,0.009346,0.037383,0.046729,...,0.009346,0.028037,0.018692,,0.009346,0.009346,0.009346,0.009346,0.018692,0.018692
take,,0.111111,,,,,,,0.111111,,...,,,,0.111111,,,,,,


### Predictions

In [59]:
def getUnigramProb(word):
    try:
        pProb = unigram_probs.loc['put', word]
    except:
        pProb = 0
    try:
        tProb = unigram_probs.loc['take', word]
    except:
        tProb = 0
        
    return {
        'pProb': pProb,
        'tProb': tProb,
    }
getUnigramProb('blue')

{'pProb': 0.0970464135021097, 'tProb': 0.07352941176470588}

In [60]:
def getBigramProb(word):
    try:
        pProb = bigram_probs[word]['put']
    except:
        pProb = 0
    try:
        tProb = bigram_probs[word]['take']
    except:
        tProb = 0
        
    return {
        'pProb': pProb,
        'tProb': tProb,
    }
getBigramProb(('block', 'blue'))

{'pProb': 0.03488372093023256, 'tProb': 0.030303030303030304}

In [61]:
def getTrigramProb(word):
    try:
        pProb = trigram_probs[word]['put']
    except:
        pProb = 0
    try:
        tProb = trigram_probs[word]['take']
    except:
        tProb = 0
        
    return {
        'pProb': pProb,
        'tProb': tProb,
    }
getTrigramProb(('block', 'circle', 'circle'))

{'pProb': 0.009345794392523364, 'tProb': nan}

In [62]:
# Prepare the test set
x2 = x_test.map(word_tokenize).map(removeStopWords).map(getLemma)
x2

0                      [block, green, circle]
1                [block, circle, red, circle]
2                       [green, cone, square]
3                                 [red, cone]
4                      [green, block, square]
5                   [blue, cone, red, circle]
6                         [cube, red, circle]
7    [blue, cone, red, circle, green, circle]
8                      [block, green, circle]
9                       [blue, block, circle]
Name: 0, dtype: object

In [70]:
def predict(sent, predType='uni'):
    pProb = 0
    tProb = 0
    
    for w in sent:
        if predType == 'uni':
            p = getUnigramProb(w)
        elif predType == 'bi':
            p = getBigramProb(w)
        else:
            p = getTrigramProb(w)
        pProb += p['pProb']
        tProb += p['tProb']
    
    res = 'put' if pProb > tProb else 'take'
    
    return {
        'prediction': res,
        'pProb': pProb,
        'tProb': tProb
    }

In [82]:
unigram_prediction = x2.map(predict)
unigram_prediction

0    {'tProb': 0.5, 'pProb': 0.3417721518987342, 'p...
1    {'tProb': 0.6470588235294118, 'pProb': 0.68776...
2    {'tProb': 0.25, 'pProb': 0.33755274261603374, ...
3    {'tProb': 0.25, 'pProb': 0.29535864978902954, ...
4    {'tProb': 0.47058823529411764, 'pProb': 0.3502...
5    {'tProb': 0.411764705882353, 'pProb': 0.582278...
6    {'tProb': 0.36764705882352944, 'pProb': 0.4556...
7    {'tProb': 0.6029411764705883, 'pProb': 0.81434...
8    {'tProb': 0.5, 'pProb': 0.3417721518987342, 'p...
9    {'tProb': 0.4705882352941177, 'pProb': 0.39662...
Name: 0, dtype: object

In [76]:
x2_bigram = x2.map(lambda x: list(nltk.bigrams(x)))
x2_bigram

0                    [(block, green), (green, circle)]
1      [(block, circle), (circle, red), (red, circle)]
2                      [(green, cone), (cone, square)]
3                                        [(red, cone)]
4                    [(green, block), (block, square)]
5           [(blue, cone), (cone, red), (red, circle)]
6                         [(cube, red), (red, circle)]
7    [(blue, cone), (cone, red), (red, circle), (ci...
8                    [(block, green), (green, circle)]
9                     [(blue, block), (block, circle)]
Name: 0, dtype: object

In [85]:
bigram_prediction = x2_bigram.map(lambda x: predict(x, 'bi'))
bigram_prediction

0    {'tProb': nan, 'pProb': 0.040697674418604654, ...
1    {'tProb': nan, 'pProb': 0.1511627906976744, 'p...
2    {'tProb': nan, 'pProb': 0.05232558139534884, '...
3    {'tProb': 0.030303030303030304, 'pProb': 0.034...
4    {'tProb': 0.18181818181818182, 'pProb': 0.0290...
5    {'tProb': nan, 'pProb': 0.1744186046511628, 'p...
6    {'tProb': 0.15151515151515152, 'pProb': 0.1337...
7    {'tProb': nan, 'pProb': 0.19767441860465118, '...
8    {'tProb': nan, 'pProb': 0.040697674418604654, ...
9    {'tProb': 0.09090909090909091, 'pProb': 0.0523...
Name: 0, dtype: object

In [84]:
trigram_prediction = x2_trigram.map(lambda x: predict(x, 'tri'))
trigram_prediction

0    {'tProb': nan, 'pProb': 0.018691588785046728, ...
1    {'tProb': nan, 'pProb': 0.037383177570093455, ...
2       {'tProb': 0, 'pProb': 0, 'prediction': 'take'}
3       {'tProb': 0, 'pProb': 0, 'prediction': 'take'}
4       {'tProb': 0, 'pProb': 0, 'prediction': 'take'}
5    {'tProb': nan, 'pProb': 0.06542056074766354, '...
6    {'tProb': 0.2222222222222222, 'pProb': 0.02803...
7    {'tProb': nan, 'pProb': 0.06542056074766354, '...
8    {'tProb': nan, 'pProb': 0.018691588785046728, ...
9       {'tProb': 0, 'pProb': 0, 'prediction': 'take'}
Name: 0, dtype: object

### Analysis of the results

#### Unigram

In [88]:
unigram_prediction_comparison = unigram_prediction.map(lambda x: x['prediction']) == y_test
unigram_prediction_comparison

0     True
1     True
2     True
3    False
4    False
5    False
6    False
7     True
8    False
9    False
Name: 0, dtype: bool

In [105]:
unigram_test_accuracy = unigram_prediction_comparison.sum()/10
unigram_test_accuracy

0.4

In [106]:
unigram_train_accuracy = (x.map(predict).map(lambda x: x['prediction']) == y_train).sum()/len(x)
unigram_train_accuracy

0.72

#### Bigram

In [107]:
bigram_prediction_comparison_test = bigram_prediction.map(lambda x: x['prediction']) == y_test
bigram_prediction_comparison

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7    False
8    False
9    False
Name: 0, dtype: bool

In [120]:
bigram_test_accuracy = bigram_prediction_comparison.sum()/10
bigram_test_accuracy

0.3

In [119]:
bigram_train_accuracy = (x.map(lambda x: list(nltk.bigrams(x))).map(lambda x: predict(x, 'bi')).map(lambda x: x['prediction']) == y_train).sum()/len(x)
bigram_train_accuracy

0.37

In [94]:
trigram_prediction_comparison = trigram_prediction.map(lambda x: x['prediction']) == y_test
trigram_prediction_comparison

0     True
1    False
2    False
3     True
4    False
5     True
6     True
7    False
8    False
9    False
Name: 0, dtype: bool

In [96]:
trigram_prediction_comparison.sum()/10

0.4

In [None]:
bigram_train_accuracy = (x.map(lambda x: list(nltk.bigrams(x))).map(lambda x: predict(x, 'bi')).map(lambda x: x['prediction']) == y_train).sum()/len(x)
bigram_train_accuracy