In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import random

In [2]:
def load_data(files):
    data, sent = [], []
    for file in files:
        with open(file, 'r') as rf:
            for line in rf:
                if line.strip() != '':
                    # Note: the shared corpus is already tokenized
                    sent.append(line.strip().split('\t'))
                else:
                    if len(sent) > 0:
                        data.append(sent)
                        sent = []
    return data

sents0 = load_data(['FB_HI_EN_CR.txt', 'TWT_HI_EN_CR.txt', 'WA_HI_EN_CR.txt'])
sents1 = load_data(['FB_BN_EN_CR.txt', 'TWT_BN_EN_CR.txt', 'WA_BN_EN_CR.txt'])
sents2 = load_data(['FB_TE_EN_CR.txt', 'TWT_TE_EN_CR.txt', 'WA_TE_EN_CR.txt'])

In [3]:
# Train-Validation Set for hIn-Eng
random.seed(7)
random.shuffle(sents0)
train_sents0 = sents0[:int(0.8*len(sents0))]
valid_sents0 = sents0[int(0.8*len(sents0)):]
print("# Train sentences: %d" % (len(train_sents0)))
print("# Validation sentences: %d" % (len(valid_sents0)))

# Train sentences: 2104
# Validation sentences: 526


In [4]:
#Train - Validation Set for BN-EN
random.seed(7)
random.shuffle(sents1)
train_sents1 = sents1[:int(0.8*len(sents1))]
valid_sents1 = sents1[int(0.8*len(sents1)):]
print("# Train sentences: %d" % (len(train_sents1)))
print("# Validation sentences: %d" % (len(valid_sents1)))

# Train sentences: 499
# Validation sentences: 125


In [5]:
#Train - Validation Set for  TE_EN
random.seed(7)
random.shuffle(sents2)
train_sents2 = sents2[:int(0.8*len(sents2))]
valid_sents2 = sents2[int(0.8*len(sents2)):]
print("# Train sentences: %d" % (len(train_sents2)))
print("# Validation sentences: %d" % (len(valid_sents2)))

# Train sentences: 1583
# Validation sentences: 396


In [6]:
print(train_sents0[0])

[['Bolo', 'hi', 'G_J']]


In [7]:
def word2features(sent, k):
    word = sent[k][0]
    features = [
        'token=%s' % (word)
    ]
    # extracting n-grams, for n=1 to 5
    for i in range(1,6):
        # if the value of n is greater than the word length, we exit the loop
        if i > len(word):
            break
        character_features = [word[j:j+i] for j in range(len(word)-i+1)]
        features.extend([
            # is count of individual n-grams important? is the order important?
            "char-%d-gram=%s" % (i, ' '.join(list(set(character_features))))
        ])
    if k == 0:
        # first word in the sentence
        features.append('BOS')
    else:
        features.extend([
            "-1:word=%s" % (sent[k-1][0])
        ])
    if i == len(sent):
        # last word in the sentence         
        features.append('EOS')
 
    return features
        
def sent2features(sent):
    # generating features for all the words/tokens in a sentence `sent`    
    return [word2features(sent, i) for i in range(len(sent))]

def sent2langs(sent):
    return [language_label for token, language_label, pos_tag in sent]

def sent2pos(sent):
    return [pos_tag for token, language_label, pos_tag in sent]

def sent2tokens(sent):
    return [token for token, language_label, pos_tag in sent]

In [8]:
%%time
X_train0 = [sent2features(sent) for sent in train_sents0]
y_train0 = [sent2langs(sent) for sent in train_sents0]
# for training a pos-tagging system
# y_train = [sent2pos(sent) for sent in train_sents]

X_test0 = [sent2features(sent) for sent in valid_sents0]
y_test0 = [sent2langs(sent) for sent in valid_sents0]
# y_test = [sent2pos(sent) for sent in valid_sents]

CPU times: user 360 ms, sys: 8 ms, total: 368 ms
Wall time: 365 ms


In [9]:
%%time
X_train1 = [sent2features(sent) for sent in train_sents1]
y_train1 = [sent2langs(sent) for sent in train_sents1]
# for training a pos-tagging system
# y_train = [sent2pos(sent) for sent in train_sents]

X_test1 = [sent2features(sent) for sent in valid_sents1]
y_test1 = [sent2langs(sent) for sent in valid_sents1]
# y_test = [sent2pos(sent) for sent in valid_sents]

CPU times: user 244 ms, sys: 4 ms, total: 248 ms
Wall time: 246 ms


In [10]:
%%time
X_train2 = [sent2features(sent) for sent in train_sents2]
y_train2 = [sent2langs(sent) for sent in train_sents2]
# for training a pos-tagging system
# y_train = [sent2pos(sent) for sent in train_sents]

X_test2 = [sent2features(sent) for sent in valid_sents2]
y_test2 = [sent2langs(sent) for sent in valid_sents2]
# y_test = [sent2pos(sent) for sent in valid_sents]

CPU times: user 312 ms, sys: 8 ms, total: 320 ms
Wall time: 321 ms


In [11]:
print(X_train0[0])

[['token=Bolo', 'char-1-gram=B o l', 'char-2-gram=lo Bo ol', 'char-3-gram=Bol olo', 'char-4-gram=Bolo', 'BOS']]


In [13]:
%%time
trainer0 = pycrfsuite.Trainer(verbose=False)

for xseq0, yseq0 in zip(X_train0, y_train0):
    trainer0.append(xseq0, yseq0)

CPU times: user 264 ms, sys: 4 ms, total: 268 ms
Wall time: 269 ms


In [14]:
%%time
trainer1 = pycrfsuite.Trainer(verbose=False)

for xseq1, yseq1 in zip(X_train1, y_train1):
    trainer1.append(xseq1, yseq1)

CPU times: user 72 ms, sys: 8 ms, total: 80 ms
Wall time: 78.4 ms


In [15]:
%%time
trainer2 = pycrfsuite.Trainer(verbose=False)

for xseq2, yseq2 in zip(X_train2, y_train2):
    trainer2.append(xseq2, yseq2)

CPU times: user 184 ms, sys: 8 ms, total: 192 ms
Wall time: 192 ms


In [16]:
trainer0.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [17]:
%%time
trainer0.train('icon2014_lid.crfsuite')

trainer1.train('icon2014_lid.crfsuite')


trainer2.train('icon2014_lid.crfsuite')

CPU times: user 6.9 s, sys: 88 ms, total: 6.99 s
Wall time: 6.99 s


In [18]:
tagger = pycrfsuite.Tagger()
tagger.open('icon2014_lid.crfsuite')

<contextlib.closing at 0x7fc713e957f0>

In [22]:
example_sent0 = valid_sents0[10]
print(' '.join(sent2tokens(example_sent0)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent0))))
print("Correct:  ", ' '.join(sent2langs(example_sent0)))

@KumarVkd @vijaystambh grow up ... manavhit ki soch bhai hindu hit aur muslim hit dono aapne aap ho jaygi ...

Predicted: univ univ univ univ univ univ univ univ univ univ en te en en te univ univ univ univ univ
Correct:   univ univ en en univ hi hi hi hi hi en hi hi en hi hi hi hi hi univ


In [23]:
example_sent1 = valid_sents1[10]
print(' '.join(sent2tokens(example_sent1)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent1))))
print("Correct:  ", ' '.join(sent2langs(example_sent1)))

@jeet30 2002 theke dekha amar swapno ta puron hoyechilo 2013 e aajker dine ! Sarajibon e din ta spcl hoye thakbe amar kache ! ðŸ’—

Predicted: univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ univ
Correct:   univ en bn bn bn bn bn bn bn en bn bn bn univ bn bn bn bn en bn bn bn bn univ undef


In [24]:
example_sent2 = valid_sents2[10]
print(' '.join(sent2tokens(example_sent2)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent2))))
print("Correct:  ", ' '.join(sent2langs(example_sent2)))

Sarainodu release eppudu

Predicted: univ en te
Correct:   te en te
