- Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers

Using TensorFlow backend.


- Reading files

In [2]:
df_eng = pd.read_csv('english_text.csv')
df_hin = pd.read_csv('hinglish_text.csv')

In [3]:
df_eng['label'] = 0
df_hin['label'] = 1
df = pd.concat([df_eng,df_hin])

In [4]:
texts = df['text']
labels = df['label']

- Splitting data

In [5]:
text_train, text_test, y_train, y_test = train_test_split(texts, labels, test_size=0.8, random_state=1000)

In [6]:
vectorizer = CountVectorizer()
vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
X_test  = vectorizer.transform(text_test)
X_train

<11835x23926 sparse matrix of type '<class 'numpy.int64'>'
	with 232229 stored elements in Compressed Sparse Row format>

- Logistic Regression

In [7]:
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
score = log_clf.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.994550178492216


- Decision Tree Classifier

In [8]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
score = dt_clf.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9959231955387507


- Random Forest Classifier

In [9]:
rf_clf = RandomForestClassifier(max_depth=160, random_state=0)
rf_clf.fit(X_train, y_train)
score = rf_clf.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9968948691409137


- Tokenization 

In [10]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)

In [11]:
X_train = tokenizer.texts_to_sequences(text_train)
X_test = tokenizer.texts_to_sequences(text_test)

In [12]:
vocab_size = len(tokenizer.word_index) + 1

In [13]:
vocab_size

23952

- Padding

In [14]:
maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0, :])

[  7  63 188   2 332  13 631 398   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


- Fully Connected DNN Model

In [15]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 50)           1197600   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                100010    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 1,297,621
Trainable params: 1,297,621
Non-trainable params: 0
_________________________________________________________________


In [16]:
EPOCHS=2
BATCH_SIZE=10
history = model.fit(X_train, y_train,
                    epochs=EPOCHS,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=BATCH_SIZE)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Training Accuracy: 0.9993
Testing Accuracy:  0.9986
