In [0]:
import numpy as np
from sklearn import linear_model, preprocessing
import os
import glob
import pandas as pd
import nltk
from nltk import tokenize
from nltk.stem import WordNetLemmatizer
import re 
# nltk.download('stopwords')
# nltk.download()
from nltk.corpus import stopwords 

In [0]:
nltk.download('punkt')
nltk.download('all')
nltk.download('averaged_perceptron_tagger')

In [0]:
def preprocess_sentences(X):
  documents = []
  stemmer = WordNetLemmatizer()
  for sen in range(0, len(X)):
    document = re.sub(r'\W', ' ', str(X[sen]))
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'^"', '', document)
    document = re.sub(r'"$', '', document)
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    document = re.sub(r'^b\s+', '', document)
    document = document.lower()
    #     document = document.split()
    #     document = [stemmer.lemmatize(word) for word in document]
    #     document = ' '.join(document)
    documents.append(document)
  print(len(documents))
  return documents

In [0]:
# nltk.help.upenn_tagset()

In [0]:
def load_test(test):
  df= pd.read_csv(test,delimiter='\t',encoding='utf-8')
  ts_st=np.array(df['sentence'].values)
  ts_ft=np.array(df['Future'].values)
  ts_pt=np.array(df['Past'].values)
  return ts_st,ts_ft,ts_pt

In [0]:
def load_train_val(train,val):
  df= pd.read_csv(train,delimiter='\t',encoding='utf-8')
  tr_st=np.array(df['sentence'].values)
  tr_ft=np.array(df['Future'].values)
  tr_pt=np.array(df['Past'].values)
  df= pd.read_csv(val,delimiter='\t',encoding='utf-8')
  val_st=np.array(df['sentence'].values)
  val_ft=np.array(df['Future'].values)
  val_pt=np.array(df['Past'].values)
  return tr_st,tr_ft,tr_pt, val_st,val_ft,val_pt


In [0]:
def detect_past(document):
  past=False
  tokens=nltk.word_tokenize(document)
  word_pos=nltk.pos_tag(tokens)
  pos=[pos for (word,pos) in word_pos]
#   print(pos)
  if ('VBD' in pos) or ('VBN' in pos):
    past=True
  return past

In [193]:
detect_future('I am going to present it, tomorrow')

['I', 'am', 'going', 'to', 'present', 'it', ',', 'tomorrow']
['PRP', 'VBP', 'VBG', 'TO', 'VB', 'PRP', ',', 'NN']


True

In [0]:
def detect_future(document):
  future=False
  tokens=nltk.word_tokenize(document)
  word_pos=nltk.pos_tag(tokens)
  pos=[pos for (word,pos) in word_pos]
  words=[word for (word,pos) in word_pos]
#   print(words)
#   print(pos)
  for i in range(len(pos)):
    if(i+1<len(pos)):
      #modal verb
      if((pos[i] == 'MD') and (words[i] in ['shall','will','\'ll']) and ('VB' in pos[i+1:])):
        future=True
    if(i+3<len(pos)):
      #going-to future 
      if((pos[i]=='VBP') and (pos[i+1]=='VBG') and (pos[i+2]=='TO') and ('VB' in pos[i+2:])):
        future=True
      if(i>0):
        if((('VBP' in pos[:i]) or 'VBZ' in pos[:i]) and (words[i+1]=='going') and (pos[i+2]=='TO') and ('VB' in pos[i+2:])):
          future=True
  return future

In [87]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
datapath = 'gdrive/My Drive/data/'

In [11]:
trainfile=datapath+'train.txt'
valfile=datapath+'validation.txt'
tr_st,tr_ft,tr_pt, val_st,val_ft,val_pt=load_train_val(trainfile,valfile)
testfile=datapath+'test.txt'
ts_st,ts_ft,ts_pt=load_test(testfile)

tr_st = preprocess_sentences(tr_st)
val_st = preprocess_sentences(val_st)
ts_st = preprocess_sentences(ts_st)

1776
592
593


**Rule Based Baseline:**

In [201]:
pt_pred=[(1 if detect_past(sent) else 0) for sent in ts_st]
acc = np.mean(pt_pred == ts_pt)
print('Test Past Accuaracy = {0:f}'.format(acc))
ft_pred=[(1 if detect_future(sent) else 0) for sent in ts_st]
acc = np.mean(ft_pred == ts_ft)
print('Test Future Accuaracy = {0:f}'.format(acc))

Test Past Accuaracy = 0.691400
Test Future Accuaracy = 0.634064


In [202]:
pt_pred=[(1 if detect_past(sent) else 0) for sent in tr_st]
acc = np.mean(pt_pred == tr_pt)
print('Train Past Accuaracy = {0:f}'.format(acc))
ft_pred=[(1 if detect_future(sent) else 0) for sent in tr_st]
acc = np.mean(ft_pred == tr_ft)
print('Train Future Accuaracy = {0:f}'.format(acc))

Train Past Accuaracy = 0.724099
Train Future Accuaracy = 0.554617


In [203]:
pt_pred=[(1 if detect_past(sent) else 0) for sent in val_st]
acc = np.mean(pt_pred == val_pt)
print('Val Past Accuaracy = {0:f}'.format(acc))
ft_pred=[(1 if detect_future(sent) else 0) for sent in val_st]
acc = np.mean(ft_pred == val_ft)
print('Val Future Accuaracy = {0:f}'.format(acc))

Val Past Accuaracy = 0.736486
Val Future Accuaracy = 0.548986


**Rule Based Baseline on all the labeled data:**

In [204]:
pt_pred=[(1 if detect_past(sent) else 0) for sent in np.concatenate([tr_st,val_st,ts_st])]
acc = np.mean(pt_pred == np.concatenate([tr_pt,val_pt,ts_pt]))
print('Total Past Accuaracy = {0:f}'.format(acc))
ft_pred=[(1 if detect_future(sent) else 0) for sent in np.concatenate([tr_st,val_st,ts_st])]
acc = np.mean(ft_pred == np.concatenate([tr_ft,val_ft,ts_ft]))
print('Total Future Accuaracy = {0:f}'.format(acc))

Total Past Accuaracy = 0.720027
Total Future Accuaracy = 0.569402


**Most Frequent Class Baseline:**

In [0]:
from collections import Counter

def majority_acc(labels,test):
  unique_vals = Counter(labels).keys()
  unique_counts = Counter(labels).values()
  maj=np.fromiter(unique_vals,dtype=int)[np.argmax(np.fromiter(unique_counts,dtype=int))]
  labels_maj=np.full((len(test),),maj)
  acc = np.mean(labels_maj == test)
  return acc
#   print('Past Accuaracy = {0:f}'.format(acc))

**Most Frequent class Baseline:**

In [20]:

acc= majority_acc(np.concatenate([tr_pt,val_pt]),ts_pt)
print('Past Test maj Accuaracy = {0:f}'.format(acc))

acc= majority_acc(np.concatenate([tr_ft,val_ft]),ts_ft)
print('Future Test maj Accuaracy = {0:f}'.format(acc))

Past Test maj Accuaracy = 0.546374
Future Test maj Accuaracy = 0.505902




---



In [154]:
acc= majority_acc(tr_pt)
print('Past Train maj Accuaracy = {0:f}'.format(acc))
acc= majority_acc(ts_pt)
print('Past Test maj Accuaracy = {0:f}'.format(acc))
acc= majority_acc(val_pt)
print('Past Validation maj Accuaracy = {0:f}'.format(acc))
acc= majority_acc(tr_ft)
print('Future Train maj Accuaracy = {0:f}'.format(acc))
acc= majority_acc(ts_ft)
print('Future Test maj Accuaracy = {0:f}'.format(acc))
acc= majority_acc(val_ft)
print('Future tr maj Accuaracy = {0:f}'.format(acc))

Past Train maj Accuaracy = 0.619369
Past Test maj Accuaracy = 0.546374
Past Validation maj Accuaracy = 0.594595
Future Train maj Accuaracy = 0.574324
Future Test maj Accuaracy = 0.505902
Future tr maj Accuaracy = 0.567568


**Majority Baseline on all the labeled data**

In [205]:
acc= majority_acc(np.concatenate([tr_pt,val_pt,ts_pt]))
print('Past Total maj Accuaracy = {0:f}'.format(acc))
acc= majority_acc(np.concatenate([tr_ft,val_ft,ts_ft]))
print('Future Total maj Accuaracy = {0:f}'.format(acc))

Past Total maj Accuaracy = 0.599797
Future Total maj Accuaracy = 0.559271




---



**sklearn DummyClassifier baselines:**

In [17]:
from sklearn.dummy import DummyClassifier

print('Past: ')
for strategy in ['stratified', 'most_frequent', 'prior', 'uniform']:
    dummy = DummyClassifier(strategy=strategy)
    dummy.fit(np.concatenate([tr_st,val_st]), np.concatenate([tr_pt,val_pt]))
    print(strategy,': ',dummy.score(ts_st, ts_pt))
print('Future: ')
for strategy in ['stratified', 'most_frequent', 'prior', 'uniform']:
    dummy = DummyClassifier(strategy=strategy)
    dummy.fit(np.concatenate([tr_st,val_st]), np.concatenate([tr_ft,val_ft]))
    print(strategy,': ',dummy.score(ts_st, ts_ft))

Past: 
stratified :  0.5075885328836425
most_frequent :  0.5463743676222597
prior :  0.5463743676222597
uniform :  0.4991568296795953
Future: 
stratified :  0.5008431703204047
most_frequent :  0.5059021922428331
prior :  0.5059021922428331
uniform :  0.49409780775716694
