## What are Word Vectors?

* Many Machine Learning algorithms and almost all Deep Learning Architectures are incapable of processing strings or plain text in their raw form. They require numbers as inputs to perform any sort of job, be it classification, regression etc. in broad terms. 

* Word Vectors are the texts converted into numbers and there may be different numerical representations of the same text.

* Example:


     [‘Word’,’Embeddings’,’are’,’Converted’,’into’,’numbers’] --> [0,0,0,1,0,0]

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
 ]
# create the transform
vectorizer = CountVectorizer(analyzer='word')
# tokenize and build vocab
vectorizer.fit(corpus)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(corpus)
# summarize encoded vector
#print(vector.shape)
#print(type(vector))
print(vector.toarray())

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# encode document

vector = vectorizer.transform([text[0]])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())
vector1 = vectorizer.transform([text[1]])
print(vector1.shape)
print(vector1.toarray())

## Code-Mixed Language Identification

### 1. Libraries 

In [44]:
# Libraries
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import pandas as pd 

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report
import numpy as np


### 2. Read Data 

In [45]:
Words = []
LI_Tags = []
POS_Tags = []
Sent_ID = []

count=1

#Reading data file

dataFile = open("LIDataset/CodemixedShuffle.txt","r")
fileData = dataFile.readlines()
for line in fileData:
    a = line.split("\t")
    if(len(a)==3):
        Words.append(a[0].strip())
        POS_Tags.append(a[2].strip()) 
        LI_Tags.append(a[1].strip())
        Sent_ID.append(count)
    else:
        count+=1
        
# Converting data file to data frame

codeMixedData = pd.DataFrame(list(zip(Sent_ID, Words, POS_Tags, LI_Tags)),
               columns =['Sentence_No', 'Words', 'POS_Tags', 'LI_Tags'])
        
codeMixedData.head()

Unnamed: 0,Sentence_No,Words,POS_Tags,LI_Tags
0,1,T'wood,N_NN,univ
1,1,-,RD_PUNC,univ
2,1,We,PR_PRP,univ
3,1,have,V_VM,en
4,1,craze,N_NN,univ


In [48]:
df = pd.read_csv("LIDataset/CodemixedShuffle.txt", header=None, sep="\\t", names=['word', 'language', 'pos'], engine='python')
df['language'].unique()

array(['univ', 'en', 'te', 'ne', 'DM_DMD', 'N_NN', 'RB_AMN'], dtype=object)

### 3. Data Splitting and Label Encoder 

In [31]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(codeMixedData['Words'], codeMixedData['LI_Tags'], test_size=0.2, random_state=42)
print(test_y)

15412    univ
14857      te
23349      en
18819      te
18932    univ
         ... 
11782      en
5676       en
10950      en
15436    univ
16511      en
Name: LI_Tags, Length: 5900, dtype: object


###  4.1 Feature Engineering:  Count Vectorizer 

In [27]:
count_vect = CountVectorizer(min_df=1)

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.fit_transform(train_x).toarray()
xvalid_count =  count_vect.transform(test_x).toarray()

xvalid_count.shape, xtrain_count.shape

((5900, 6440), (23596, 6440))

###  4.2 Feature Engineering:  TF-IDF Vectorizer  

In [28]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(min_df=1)
tfidf_vect.fit(text)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(test_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(min_df=1)
tfidf_vect_ngram.fit(text)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(min_df=1)
tfidf_vect_ngram_chars.fit(text)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x) 

### 5. Model Training 

In [29]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    acc = metrics.accuracy_score(predictions, test_y)
    f1 = metrics.f1_score(predictions, test_y, average='weighted')
    print(classification_report(predictions, test_y))
    return acc, f1

### 5.1 Logistic Regression 

In [32]:
# Linear Classifier on Count Vectors
#accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_count, train_y, xvalid_count)
#print("LR, Count Vectors: ", accuracy, f1_score)

# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy, f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy, f1_score)

# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy, f1_score)

              precision    recall  f1-score   support

          en       0.75      0.83      0.79      1552
          ne       0.13      0.67      0.22        30
          te       0.67      0.84      0.75      1440
        univ       0.81      0.62      0.70      2878

    accuracy                           0.73      5900
   macro avg       0.59      0.74      0.61      5900
weighted avg       0.76      0.73      0.73      5900

LR, WordLevel TF-IDF:  0.7316949152542372 0.7341706331946041
              precision    recall  f1-score   support

          en       0.75      0.83      0.79      1552
          ne       0.13      0.67      0.22        30
          te       0.67      0.84      0.75      1440
        univ       0.81      0.62      0.70      2878

    accuracy                           0.73      5900
   macro avg       0.59      0.74      0.61      5900
weighted avg       0.76      0.73      0.73      5900

LR, N-Gram Vectors:  0.7316949152542372 0.7341706331946041
          

### After feature extraction

In [49]:
text = []
labels = []

dataFile = open("LIDataset/shuffle_data.txt","r")
fileData = dataFile.readlines()
for line in fileData:
    a = line.strip().split("\t")
    if(len(a)==14):
        features = a[:-1]
        features.append(str(len(a[0])))
        text.append(features)
        labels.append(a[-1])
        

text = [' '.join(i) for i in text]  

text = np.asarray(text)
labels = np.asarray(labels)

In [50]:
text[0:10]

array(['thaman 1 0 0 0 0 G_N t th tha n an man 6',
       'skn 1 0 0 0 0 G_N s sk skn n kn skn 3',
       'ni 1 0 0 0 0 G_N n ni null i ni null 2',
       'thosesthe 1 0 0 0 0 G_X t th tho e he the 9',
       'asalu 1 0 0 0 0 G_X a as asa u lu alu 5',
       'stage 1 0 0 0 0 G_N s st sta e ge age 5',
       'ninchi 1 0 0 0 0 G_X n ni nin i hi chi 6',
       'audience 1 0 0 0 0 G_N a au aud e ce nce 8',
       'seat 1 0 0 0 0 G_N s se sea t at eat 4',
       'deggara 1 0 0 0 0 G_X d de deg a ra ara 7'], dtype='<U160')

In [41]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(text, labels, test_size=0.2, random_state=42)

In [42]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(min_df=1, analyzer='word', max_features=25000)
tfidf_vect.fit(text)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(test_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=25000)
tfidf_vect_ngram.fit(text)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=25000)
tfidf_vect_ngram_chars.fit(text)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x) 

In [43]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy, f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy, f1_score)

# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy, f1_score)

              precision    recall  f1-score   support

          en       0.90      0.89      0.90      2887
          ne       0.28      0.66      0.39        44
          te       0.89      0.85      0.87      1666
        univ       0.69      0.70      0.70      1289

    accuracy                           0.84      5886
   macro avg       0.69      0.78      0.71      5886
weighted avg       0.85      0.84      0.84      5886

LR, WordLevel TF-IDF:  0.8386000679578661 0.841574259541367
              precision    recall  f1-score   support

          en       0.91      0.88      0.90      2978
          ne       0.28      0.63      0.38        46
          te       0.90      0.88      0.89      1635
        univ       0.67      0.72      0.69      1227

    accuracy                           0.84      5886
   macro avg       0.69      0.78      0.72      5886
weighted avg       0.85      0.84      0.85      5886

LR, N-Gram Vectors:  0.8433571185864764 0.8474631517301036
           