# Data loading and preparation

### Loading required python libraries

In [1]:
import pandas as pd

In [2]:
# Loading and checking the training dataset
df_train = pd.read_csv('./../data/training_data.txt', header=None)
df_train.head()

Unnamed: 0,0
0,put the cone on the red square on the square
1,take the cone on the square
2,take the block
3,put the red cone on the square
4,put the block on the blue square on the square


In [3]:
# Loading and checking the test dataset
df_test = pd.read_csv('./../data/test_data_v0.txt', header=None)
df_test.head()

Unnamed: 0,0
0,take the block on the green circle\t\t\t
1,put the block on the circle on the red circle\t\t
2,put the green cone on the square\t\t\t
3,take the red cone\t\t\t\t\t
4,put the green block on the square\t\t\t


In [4]:
# Remove the unnecessary trailing tabs in test dataset 
test = df_test[0].map(str.strip)

test.head()

0               take the block on the green circle
1    put the block on the circle on the red circle
2                 put the green cone on the square
3                                take the red cone
4                put the green block on the square
Name: 0, dtype: object

In [5]:
# Convert the train dataset to a pandas series
train = df_train[0]

train.head()

0      put the cone on the red square on the square
1                       take the cone on the square
2                                    take the block
3                    put the red cone on the square
4    put the block on the blue square on the square
Name: 0, dtype: object

In [36]:
# Spliting the training dataset into response and predictors
y_train = train.map(lambda x: x.split()[0])
x_train = train.map(lambda x: ' '.join(x.split()[1:]))

In [37]:
# Spliting the test dataset into response and predictors
y_test = test.map(lambda x: x.split()[0])
x_test = test.map(lambda x: ' '.join(x.split()[1:]))

## Q1 Analysis

### Loading Required python libraries

In [38]:
# Run this if nltk is not configured before
# =========================================
# import nltk
# nltk.download()

In [90]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

In [41]:
# Define stop words
stop_words = set(stopwords.words('english'))

In [42]:
def removeStopWords(x):
    """Return only words that are not in stop_words"""
    return [w for w in x if not w in stop_words]

In [50]:
def getLemma(x):
    """Return the lemma of each word"""
    return [WordNetLemmatizer().lemmatize(w) for w in x]

In [53]:
# Tokenize each sentence in the training set and remove stop-words.
x = x_train.map(word_tokenize).map(removeStopWords).map(getLemma)

### Calculate Word Counts

In [108]:
# Get Unigram Word Counts
unigram_wcounts = x.groupby(y_train).apply(lambda x: [w for rec in x for w in rec]).map(nltk.FreqDist)
unigram_wcounts = pd.DataFrame(list(unigram_wcounts), index=unigram_wcounts.index)
unigram_wcounts

Unnamed: 0_level_0,block,blue,circle,cone,cube,green,red,square
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
put,26,23,45,23,16,10,47,47
take,21,5,6,6,8,7,11,4


In [109]:
# Get Bigram Word Counts
bigram_wcounts = x.groupby(y_train).apply(lambda x: [w for rec in x for w in nltk.bigrams(rec)]).map(nltk.FreqDist)
bigram_wcounts = pd.DataFrame(list(bigram_wcounts), index=bigram_wcounts.index)
bigram_wcounts

Unnamed: 0_level_0,"(block, blue)","(block, circle)","(block, green)","(block, red)","(block, square)","(blue, block)","(blue, circle)","(blue, cone)","(blue, cube)","(blue, square)",...,"(green, square)","(red, block)","(red, circle)","(red, cone)","(red, cube)","(red, square)","(square, blue)","(square, circle)","(square, red)","(square, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,6,5,3.0,9,3,4,5.0,6,3,5,...,2.0,6,16,6,2,17.0,1.0,3.0,4.0,4.0
take,1,1,,1,1,2,,1,1,1,...,,6,3,1,1,,,,,


In [111]:
# Get Trigram Word Counts
trigram_wcounts = x.groupby(y_train).apply(lambda x: [w for rec in x for w in nltk.trigrams(rec)]).map(nltk.FreqDist)
trigram_wcounts = pd.DataFrame(list(trigram_wcounts), index=trigram_wcounts.index)
trigram_wcounts

Unnamed: 0_level_0,"(block, blue, circle)","(block, blue, square)","(block, circle, blue)","(block, circle, circle)","(block, circle, red)","(block, circle, square)","(block, green, circle)","(block, green, square)","(block, red, circle)","(block, red, square)",...,"(red, cone, green)","(red, cone, square)","(red, cube, red)","(red, cube, square)","(red, square, circle)","(red, square, red)","(red, square, square)","(square, blue, circle)","(square, red, circle)","(square, red, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,2.0,4,1.0,1.0,1.0,1.0,2.0,1.0,4,5.0,...,1.0,3.0,2.0,,1.0,1.0,1.0,1.0,2.0,2.0
take,,1,,,,,,,1,,...,,,,1.0,,,,,,


### Calculate Total Word Counts

In [118]:
# Unigram total counts
unigram_total_wcount = unigram_wcounts.sum(axis=1)
unigram_total_wcount

0
put     237
take     68
dtype: int64

In [119]:
# Bigram total counts
bigram_total_wcount = bigram_wcounts.sum(axis=1)
bigram_total_wcount

0
put     172.0
take     33.0
dtype: float64

In [120]:
# Trigram total counts
trigram_total_wcount = trigram_wcounts.sum(axis=1)
trigram_total_wcount

0
put     107.0
take      9.0
dtype: float64

### Calculate Probabilities

In [137]:
unigram_wcounts.div(unigram_total_wcount, axis=0)

Unnamed: 0_level_0,block,blue,circle,cone,cube,green,red,square
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
put,0.109705,0.097046,0.189873,0.097046,0.067511,0.042194,0.198312,0.198312
take,0.308824,0.073529,0.088235,0.088235,0.117647,0.102941,0.161765,0.058824


In [138]:
bigram_wcounts.div(bigram_total_wcount, axis=0)

Unnamed: 0_level_0,"(block, blue)","(block, circle)","(block, green)","(block, red)","(block, square)","(blue, block)","(blue, circle)","(blue, cone)","(blue, cube)","(blue, square)",...,"(green, square)","(red, block)","(red, circle)","(red, cone)","(red, cube)","(red, square)","(square, blue)","(square, circle)","(square, red)","(square, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,0.034884,0.02907,0.017442,0.052326,0.017442,0.023256,0.02907,0.034884,0.017442,0.02907,...,0.011628,0.034884,0.093023,0.034884,0.011628,0.098837,0.005814,0.017442,0.023256,0.023256
take,0.030303,0.030303,,0.030303,0.030303,0.060606,,0.030303,0.030303,0.030303,...,,0.181818,0.090909,0.030303,0.030303,,,,,


In [139]:
trigram_wcounts.div(trigram_total_wcount, axis=0)

Unnamed: 0_level_0,"(block, blue, circle)","(block, blue, square)","(block, circle, blue)","(block, circle, circle)","(block, circle, red)","(block, circle, square)","(block, green, circle)","(block, green, square)","(block, red, circle)","(block, red, square)",...,"(red, cone, green)","(red, cone, square)","(red, cube, red)","(red, cube, square)","(red, square, circle)","(red, square, red)","(red, square, square)","(square, blue, circle)","(square, red, circle)","(square, red, square)"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
put,0.018692,0.037383,0.009346,0.009346,0.009346,0.009346,0.018692,0.009346,0.037383,0.046729,...,0.009346,0.028037,0.018692,,0.009346,0.009346,0.009346,0.009346,0.018692,0.018692
take,,0.111111,,,,,,,0.111111,,...,,,,0.111111,,,,,,
