In [1]:
import re
import os
import keras 
import gensim
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pprint import pprint
%matplotlib inline

Using TensorFlow backend.


In [2]:
# English vocabulary: one word per line, every entry gets label 0.
with open('./data/english.txt') as f:
    data = f.readlines()
labele = np.zeros(len(data), dtype='int8')

# Hinglish vocabulary: appended after the English words, every entry gets label 1.
with open('./data/hinglish.txt') as f:
    data2 = f.readlines()
labelh = np.ones(len(data2), dtype='int8')
data += data2

# Label vector aligned index-for-index with `data` (English block first).
labels = np.concatenate((labele, labelh), axis=0)

# Strip everything that is not an ASCII letter (this also removes the trailing
# newline left by readlines) and lower-case what remains, pairing each cleaned
# word with its label.
non_letters = re.compile("[^a-zA-Z]")
total_data = [
    (non_letters.sub("", raw_word).lower(), label)
    for raw_word, label in zip(data, labels)
]
print(len(total_data))

78244


In [3]:
# Peek at five random (word, label) pairs.
# The index bound comes from the list that is actually indexed, so this cell
# keeps sampling the whole data set even if `data` is rebound elsewhere.
for _ in range(5):
    pprint(total_data[np.random.randint(len(total_data))])

('ashirwaad', 1)
('thiopental', 0)
('hauberget', 0)
('tunca', 0)
('unsagaciously', 0)


In [4]:
# Shuffle the (word, label) pairs in place, then hold out the first 500 pairs
# as the test set and keep the remainder for training.
np.random.shuffle(total_data)
holdout_size = 500
test = total_data[:holdout_size]
data = total_data[holdout_size:]
pprint(test[:10])

[('greisen', 0),
 ('himalaya', 1),
 ('thasabhaaon', 1),
 ('vellum', 0),
 ('nirwaah', 1),
 ('farcers', 0),
 ('bistros', 0),
 ('cytodieretic', 0),
 ('foliosity', 0),
 ('baarah', 1)]


In [5]:
# Separate each (word, label) pair into parallel word and label lists.
train_x = [word for word, _ in data]
train_y = [label for _, label in data]
test_x = [word for word, _ in test]
test_y = [label for _, label in test]

In [6]:
# Alphabet observed in the training words, in sorted order.
char_set = sorted(set(''.join(train_x)))
print("The character set is: \n",char_set)
# Longest word across both splits. Every comparison is by length: a bare
# max() on two strings picks the lexicographically greater one, which can be
# the shorter word and would make maxlen too small.
longest = max(max(test_x, key=len), max(train_x, key=len), key=len)
print("Maximum length word: ",longest)
maxlen = len(longest)
print("Number of characters in longest word ",maxlen)
word_count = len(train_x)
print("Words in the total dataset are %s and in train_x are %s"%(len(total_data), word_count))

The character set is: 
 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Maximum length word:  gatimayatageyatasangitamayata
Number of characters in longest word  29
Words in the total dataset are 78244 and in train_x are 77744


In [7]:
# Two-way lookup between characters and 1-based integer codes, following the
# order of char_set.
alpToDig = {letter: code for code, letter in enumerate(char_set, start=1)}
digToAlp = {code: letter for code, letter in enumerate(char_set, start=1)}

In [8]:
def process_text(word):
    """One-hot encode a batch of words, one character per timestep.

    Parameters
    ----------
    word : sequence of str
        The words to encode (despite the singular name this is a batch).
        Each word is lower-cased before encoding.

    Returns
    -------
    numpy.ndarray
        int8 array of shape ``(len(word), maxlen, len(char_set))``. Entry
        ``[s, t, c]`` is 1 when the t-th encoded character of the s-th word
        has code ``c + 1`` in ``alpToDig``; every other entry is 0, so words
        shorter than ``maxlen`` are zero-padded at the end.

    Notes
    -----
    Reads the notebook-level ``maxlen``, ``char_set`` and ``alpToDig``.
    Characters that have no entry in ``alpToDig`` are dropped before
    positions are assigned, and characters beyond ``maxlen`` are truncated,
    so arbitrary user input cannot raise ``KeyError`` or ``IndexError``.
    """
    processed_frame_x = np.zeros((len(word), maxlen, len(char_set)), dtype='int8')
    for sample_index, sample in enumerate(word):
        # Keep only characters the encoder knows about.
        known_chars = [char for char in sample.lower() if char in alpToDig]
        for char_index, char in enumerate(known_chars[:maxlen]):
            # Character t goes to timestep t. Using char_index - 1 here would
            # wrap the first character around to the last timestep.
            processed_frame_x[sample_index, char_index, alpToDig[char] - 1] = 1
    return processed_frame_x

In [9]:
# Convert the label lists to NumPy arrays for use as model targets.
train_frame_y, test_frame_y = np.array(train_y), np.array(test_y)

In [10]:
# One-hot encode the training words first, then the test words.
train_frame_x, test_frame_x = map(process_text, (train_x, test_x))

In [11]:
# Character-level binary classifier: an 8-unit LSTM reads the one-hot encoded
# word, and a single dense unit followed by a sigmoid produces the output.
model = keras.models.Sequential([
    keras.layers.LSTM(8, input_shape=(maxlen, len(char_set))),
    keras.layers.Dense(1),
    keras.layers.Activation('sigmoid'),
])

In [12]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Train for 5 epochs in mini-batches of 64, keeping the returned history object.
history = model.fit(
    x=train_frame_x,
    y=train_frame_y,
    batch_size=64,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the held-out words; the result is shown as the cell output.
model.evaluate(x=test_frame_x, y=test_frame_y)

In [None]:
# Classify a few hand-picked words with the trained model.
words = ['Chalo','coffee','peete','hai']
processed_words = process_text(words)
# Threshold the sigmoid output at 0.5 to get class ids (0 = English,
# 1 = Hinglish). This is what Sequential.predict_classes did for a
# single-unit output, but it also works on Keras versions where that
# method has been removed.
pred = (model.predict(processed_words) > 0.5).astype('int32')
for current_word, label in zip(words, pred.ravel()):
    if label == 0:
        print('%s is English'%current_word)
    else:
        print('%s is Hinglish'%current_word)

In [None]:
# Save the trained model to an HDF5 file in the working directory.
model_path = 'hinglish_classifier.h5'
model.save(model_path)