## **Setting up Development Environment**

## **Importing Libraries**

In [None]:
import pandas as pd

import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

from transformers import AutoTokenizer, AutoModel

import pickle
import os

## **Loading Data**

In [3]:
dataframe = pd.read_csv("tanglish_train.csv")

In [4]:
dataframe.head()

Unnamed: 0,id,text,category
0,Tam_1,Indha movie ku award tharlana avanga mansanay ...,OFF
1,Tam_2,kritheeck Kookaburra en unaku enachu? Cbsc ah??,NOT
2,Tam_3,Actually Oru particular bus incident thalaiya ...,OFF
3,Tam_4,Small suggestions: mic ah shirt la pottukunga bro,NOT
4,Tam_5,Karnan padathulaa oru pombaa varumlaa athuu en...,NOT


In [5]:
dataframe.describe()

Unnamed: 0,id,text,category
count,1001,1001,1000
unique,1001,994,3
top,Tam_1,Indha movie ku award tharlana avanga mansanay ...,NOT
freq,1,3,604


In [6]:
dataframe['category'].unique()

array(['OFF', 'NOT', ' NOT', nan], dtype=object)

In [7]:
dataframe.shape

(1001, 3)

In [8]:
dataframe = dataframe.loc[dataframe['category'] != 'not-Tamil']
dataframe['category'].unique()

array(['OFF', 'NOT', ' NOT', nan], dtype=object)

In [9]:
dataframe.shape

(1001, 3)

In [10]:
text = dataframe['text']

In [11]:
text

0       Indha movie ku award tharlana avanga mansanay ...
1         kritheeck Kookaburra en unaku enachu? Cbsc ah??
2       Actually Oru particular bus incident thalaiya ...
3       Small suggestions: mic ah shirt la pottukunga bro
4       Karnan padathulaa oru pombaa varumlaa athuu en...
                              ...                        
996     Chai spoiler vaya mooduya full kathayum sillit...
997     Unakku thinga soru irukko illayo aduthavangala...
998     Dai seripu oli advingi mala vandavan thana mau...
999     Gomala bule shirt ta nee padam eduthu paaruda ...
1000    Mr. Maran neenga mindvoice nu nenachu sound ah...
Name: text, Length: 1001, dtype: object

In [12]:
label = dataframe['category']

In [13]:
label

0       OFF
1       NOT
2       OFF
3       NOT
4       NOT
       ... 
996     OFF
997     OFF
998     OFF
999     OFF
1000    NOT
Name: category, Length: 1001, dtype: object

## **Data Preprocessing**

In [14]:
le = LabelEncoder()
label = le.fit_transform(label)
label

array([2, 1, 2, ..., 2, 2, 1])

In [15]:
text = text.str.replace(r"[+/#@&*$%:]",'')
text = text.to_numpy()
text

array(['Indha movie ku award tharlana avanga mansanay illa bro',
       'kritheeck Kookaburra en unaku enachu? Cbsc ah??',
       'Actually Oru particular bus incident thalaiya vettina sambavam Atha pathi sollala',
       ...,
       'Dai seripu oli advingi mala vandavan thana maubidi adi vanga pora po poi un para kolanda paru',
       'Gomala bule shirt ta nee padam eduthu paaruda ohoh nu oodum',
       'Mr. Maran neenga mindvoice nu nenachu sound ah pesiteenga...ðŸ¤£ðŸ¤£ðŸ¤£'],
      dtype=object)

### Setting Stopwords


In [16]:
with open('tamil_stopwords.txt', encoding = 'utf-8') as f:
    tamil_stopwords = f.readlines()
    for i in range(len(tamil_stopwords)):
        tamil_stopwords[i] = re.sub('\n','',tamil_stopwords[i])
stopwords = tamil_stopwords

In [17]:
# Function for removing stop words
def stopwords_remove(text):
    # Appending words which are not stop words
    text = text.split(" ")
    removed = [s for s in text if s not in stopwords]  
    return removed

In [18]:
text = [stopwords_remove(s) for s in text]
for i in range(len(text)):
  text[i] = (" ".join(text[i]))

## **Feature Extraction**

In [21]:
# Load Transformer Model

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModel.from_pretrained("ai4bharat/indic-bert")

In [22]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [23]:
print(tokenized_input)

{'input_ids': tensor([[     2,     26,  29571,   8516, 146537,   8132,    686,  10322, 173051,
              8,  16417,  42525,   2825,  97305,    335,      8,  38845,  85415,
              3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [24]:
print(sample_output)

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-1.3029e-02,  4.3217e-04, -2.2917e-02,  ..., -2.2647e-02,
           6.2345e-03, -1.0148e-02],
         [-9.9278e-01, -3.4976e-01, -5.7621e-02,  ...,  1.0413e-01,
           2.2579e-02,  3.1114e-01],
         [ 7.9027e-01, -3.4917e-01, -6.7418e-02,  ...,  2.7264e-01,
           4.1909e-02,  7.8295e-02],
         ...,
         [ 3.5433e-01, -4.8031e-01,  1.0396e-02,  ..., -3.0663e-02,
          -8.5046e-02,  5.0104e-02],
         [ 8.2294e-01, -5.3474e-02, -1.0574e-01,  ...,  8.4477e-02,
          -1.0586e-02, -1.6194e-01],
         [-1.3029e-02,  4.3216e-04, -2.2917e-02,  ..., -2.2647e-02,
           6.2344e-03, -1.0148e-02]]], grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-5.7705e-02,  4.9807e-02,  1.4111e-02, -6.4736e-03,  4.1987e-02,
          8.3054e-02,  3.9925e-02, -1.1579e-02, -1.3578e-02,  6.2298e-02,
         -8.1404e-03,  4.6956e-02,  2.0357e-02, -1.7087e-02,  4.9200e-02,
         -2.1030e-02, -4.1776e-02,  

In [25]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [26]:
# Tokenizing Input Data
input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=True,  # Set truncation to True
        max_length=512,  # Set max_length to 512
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [27]:
# Directory path
output_dir = "./IndicBert-Model1-Embedded-TrainData/"
os.makedirs(output_dir, exist_ok=True)

# Embedding the Input Data

output = []

j = 1

for i in range(len(input)):
    
    # Check if the input text is longer than the maximum length
    if input[i]['input_ids'].size(1) > 512:
        # Truncate the input text to the maximum length
        input[i]['input_ids'] = input[i]['input_ids'][:, :512]
        input[i]['attention_mask'] = input[i]['attention_mask'][:, :512]

    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./IndicBert-Model1-Embedded-TrainData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./IndicBert-Model1-Embedded-TrainData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./IndicBert-Model1-Embedded-TrainData/output1.pickle done
./IndicBert-Model1-Embedded-TrainData/output2.pickle done
./IndicBert-Model1-Embedded-TrainData/output3.pickle done
./IndicBert-Model1-Embedded-TrainData/output4.pickle done
./IndicBert-Model1-Embedded-TrainData/output5.pickle done
./IndicBert-Model1-Embedded-TrainData/output6.pickle done


In [34]:
# Loading Embedded Input Data from Disk

output = []

for i in range(6):
    file_name = "./IndicBert-Model1-Embedded-TrainData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X = output
output = []

./IndicBert-Model1-Embedded-TrainData/output1.pickle done
./IndicBert-Model1-Embedded-TrainData/output2.pickle done
./IndicBert-Model1-Embedded-TrainData/output3.pickle done
./IndicBert-Model1-Embedded-TrainData/output4.pickle done
./IndicBert-Model1-Embedded-TrainData/output5.pickle done
./IndicBert-Model1-Embedded-TrainData/output6.pickle done


In [35]:
X = np.array(X)
X.shape

(1001, 1, 768)

In [36]:
X = X.reshape(-1, 768)
X.shape

(1001, 768)

In [37]:
y = label
y.shape

(1001,)

## **Train Test Split**

In [38]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
X_train.shape

(800, 768)

In [40]:
X_val.shape

(201, 768)

## **Logistic Regression**

In [41]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred,zero_division=1))

              precision    recall  f1-score   support

           1       0.61      1.00      0.76       122
           2       1.00      0.00      0.00        78
           3       1.00      0.00      0.00         1

    accuracy                           0.61       201
   macro avg       0.87      0.33      0.25       201
weighted avg       0.76      0.61      0.46       201



## **Naive Bayes**

In [42]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred,zero_division=1))

              precision    recall  f1-score   support

           1       0.68      0.40      0.51       122
           2       0.43      0.71      0.53        78
           3       1.00      0.00      0.00         1

    accuracy                           0.52       201
   macro avg       0.70      0.37      0.35       201
weighted avg       0.58      0.52      0.51       201



## **Loading Testing Data**

In [43]:
dataframe_test = pd.read_csv(
    "tanglish_test.csv"
)
dataframe_test.head()

Unnamed: 0,id,text
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka..."
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...


In [44]:
dataframe_test.describe()

Unnamed: 0,id,text
count,940,940
unique,939,933
top,TA_TW13798,RT : Full Day Time Iruku !! 1 person 700 Twee...
freq,2,5


In [45]:
text_test = dataframe_test['text']

## **Test Data Preprocessing**

In [46]:
text_test = text_test.str.replace(r"[+/#@&*$%:]",'')
text_test = text_test.to_numpy()

In [47]:
text_test = [stopwords_remove(s) for s in text_test]
for i in range(len(text_test)):
  text_test[i] = (" ".join(text_test[i]))

## **Test Feature Extraction**

In [48]:
# Load Transformer Model

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModel.from_pretrained("ai4bharat/indic-bert")



In [49]:
# Sample Output

tokenized_input = tokenizer(
        text_test[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [50]:
print(tokenized_input)

{'input_ids': tensor([[     2,   1208,    145,    255,     11,  53120,  13690,  89728,    326,
         132766,     26,  29571,  25019,  32658,    388,   3067,  31698,   2804,
          93131,      8, 159891,  23496,  38726,      8,  63446,      8,    669,
           3646,   1747,      3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}


In [51]:
print(sample_output)

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-1.5096e-02,  4.5741e-04, -1.1106e-02,  ..., -2.3705e-02,
           4.8319e-04, -5.1492e-03],
         [ 3.7483e-02, -4.0080e-01, -5.4150e-01,  ...,  2.2347e-01,
           1.0601e-01, -4.3476e-01],
         [-5.3260e-01,  5.4175e-02, -3.2408e-01,  ...,  3.8509e-02,
          -2.0885e-01, -3.7199e-01],
         ...,
         [ 3.5791e-02, -5.7334e-01, -5.8179e-02,  ...,  3.3078e-01,
           3.2219e-01,  1.1866e-01],
         [ 4.8543e-01, -1.8609e-02, -3.8501e-02,  ...,  2.9662e-01,
           2.2324e-01, -3.0463e-01],
         [-1.5096e-02,  4.5756e-04, -1.1106e-02,  ..., -2.3705e-02,
           4.8286e-04, -5.1495e-03]]], grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-5.5415e-02,  5.5580e-02,  1.2920e-02, -1.1566e-03,  4.2868e-02,
          8.3187e-02,  4.0720e-02, -1.4572e-02, -1.1160e-02,  6.0825e-02,
         -9.2120e-03,  4.8758e-02,  1.9327e-02, -1.7647e-02,  4.8135e-02,
         -1.7882e-02, -4.5443e-02,  

In [52]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [53]:
# Tokenizing Test Input Data

input = []

for i in text_test:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [54]:
# Directory path
output_dir = "./IndicBert-Model1-Embedded-TestData/"
os.makedirs(output_dir, exist_ok=True)

# Embedding the Test Input Data

output = []

j = 1

for i in range(len(input)):
    
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Test Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./IndicBert-Model1-Embedded-TestData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./IndicBert-Model1-Embedded-TestData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./IndicBert-Model1-Embedded-TestData/output1.pickle done
./IndicBert-Model1-Embedded-TestData/output2.pickle done
./IndicBert-Model1-Embedded-TestData/output3.pickle done
./IndicBert-Model1-Embedded-TestData/output4.pickle done
./IndicBert-Model1-Embedded-TestData/output5.pickle done


In [55]:
# Loading Embedded Input Test Data from Disk

output = []

for i in range(5):
    file_name = "./IndicBert-Model1-Embedded-TestData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X_test = output
output = []

./IndicBert-Model1-Embedded-TestData/output1.pickle done
./IndicBert-Model1-Embedded-TestData/output2.pickle done
./IndicBert-Model1-Embedded-TestData/output3.pickle done
./IndicBert-Model1-Embedded-TestData/output4.pickle done
./IndicBert-Model1-Embedded-TestData/output5.pickle done


In [56]:
X_test = np.array(X_test)
X_test.shape

(940, 1, 768)

In [57]:
X_test = X_test.reshape(-1, 768)
X_test.shape

(940, 768)

## **Getting Predictions**

In [58]:
# Logistic Regression
lr_test_pred = lr.predict(X_test)

# Naive Bayes
nb_test_pred = nb.predict(X_test)

## **Evaluating the Predictions**

In [59]:
dataframe_eval = pd.read_csv(
    "tanglish_test_labels.csv"
)

In [60]:
y_test = dataframe_eval['category']
y_test = LabelEncoder().fit_transform(y_test)

In [61]:
print('Logistic Regression')
print(classification_report(y_test, lr_test_pred,zero_division=1))

Logistic Regression
              precision    recall  f1-score   support

           0       1.00      0.00      0.00       465
           1       0.51      1.00      0.67       475

    accuracy                           0.51       940
   macro avg       0.75      0.50      0.34       940
weighted avg       0.75      0.51      0.34       940



In [63]:
print('Naive Bayes')
print(classification_report(y_test, nb_test_pred,zero_division=1))

Naive Bayes
              precision    recall  f1-score   support

           0       1.00      0.00      0.00       465
           1       0.46      0.11      0.18       475
           2       0.00      1.00      0.00         0

    accuracy                           0.06       940
   macro avg       0.49      0.37      0.06       940
weighted avg       0.73      0.06      0.09       940



## **Saving the Predictions**

In [64]:
# Logistic Regression

predictions = le.inverse_transform(lr_test_pred)

dataframe_test_lr = {
    'id': dataframe_test['id'],
    'text': dataframe_test['text'],
    'category': predictions
}

dataframe_test_lr = pd.DataFrame(dataframe_test_lr)
dataframe_test_lr.to_csv("Output/IndicBert-Model1-LogisticRegression.csv")
dataframe_test_lr

Unnamed: 0,id,text,category
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,NOT
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,NOT
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",NOT
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,NOT
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,NOT
...,...,...,...
935,TA_TW15928,Woman with a bhindi picture on a logo promotes...,NOT
936,TA_TW10014,Worst ra Vara vara Namma Society kevalama poit...,NOT
937,TA_TW10134,Yeallarukum reply pandringa namma #TAG RakidaR...,NOT
938,TA_TW10418,Yeva yeva valkaiyelam flashlight adicha maari ...,NOT


In [65]:
# Naive Bayes

predictions = le.inverse_transform(nb_test_pred)

dataframe_test_nb = {
    'id': dataframe_test['id'],
    'text': dataframe_test['text'],
    'category': predictions
}

dataframe_test_nb = pd.DataFrame(dataframe_test_nb)
dataframe_test_nb.to_csv("Output/IndicBert-Model1-NaiveBayes.csv")
dataframe_test_nb

Unnamed: 0,id,text,category
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,OFF
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,OFF
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",OFF
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,NOT
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,OFF
...,...,...,...
935,TA_TW15928,Woman with a bhindi picture on a logo promotes...,NOT
936,TA_TW10014,Worst ra Vara vara Namma Society kevalama poit...,OFF
937,TA_TW10134,Yeallarukum reply pandringa namma #TAG RakidaR...,OFF
938,TA_TW10418,Yeva yeva valkaiyelam flashlight adicha maari ...,OFF
