In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
import torch
from tqdm.notebook import tqdm

In [3]:
df = pd.read_csv('/content/tam-sentiment-train.csv')

In [4]:
df.head()

Unnamed: 0,text,category
0,Vani bhojam fans hit like solli 500 like Vangi...,unknown_state
1,I love you ajith very I like,Positive
2,ennaya trailer Ku mudi Ellam nikkudhu... Vera ...,Positive
3,Vijay Annaa Ur Maassssss Therrrrriiiiii,Positive
4,நம்ப நடே நாசாமா தான் போச்சி,Negative


In [5]:
"""def contains_tamil(text):
    tamil_pattern = re.compile(r'[\u0B80-\u0BFF]+')
    return bool(tamil_pattern.search(text))
tamil_text_rows_indices = df[df['text'].apply(lambda x: contains_tamil(str(x)))].index
df = df.drop(tamil_text_rows_indices).reset_index(drop=True)
df.head(10)"""

"def contains_tamil(text):\n    tamil_pattern = re.compile(r'[\u0b80-\u0bff]+')\n    return bool(tamil_pattern.search(text))\ntamil_text_rows_indices = df[df['text'].apply(lambda x: contains_tamil(str(x)))].index\ndf = df.drop(tamil_text_rows_indices).reset_index(drop=True)\ndf.head(10)"

In [6]:
df.rename(columns={'Text':'text'}, inplace=True)
df.rename(columns={'Annotations':'category'}, inplace=True)
df.head(10)

Unnamed: 0,text,category
0,Vani bhojam fans hit like solli 500 like Vangi...,unknown_state
1,I love you ajith very I like,Positive
2,ennaya trailer Ku mudi Ellam nikkudhu... Vera ...,Positive
3,Vijay Annaa Ur Maassssss Therrrrriiiiii,Positive
4,நம்ப நடே நாசாமா தான் போச்சி,Negative
5,Gommala...Ending Vera level da deii #GetRajin...,Positive
6,Vjs Anna kaaga like potavanga Like pannuga,unknown_state
7,Theri!!! Semma Theri!! JOSEPH kuruvilla & VIJA...,Positive
8,Ithu yethu maathiri illama puthu maathiyaala i...,Positive
9,Wow! Back to Baasha mode..thalaivaaaa.petta pa...,Negative


In [7]:
def clean_text(text):
    clean_text = re.sub(r'<.*?>', '', text)
    clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text)
    clean_text = re.sub(r'\.', '. ', clean_text)
    clean_text = clean_text.lower()
    return clean_text

df['text'] = df['text'].apply(clean_text)
df.head(20)

Unnamed: 0,text,category
0,vani bhojam fans hit like solli like vangida ...,unknown_state
1,i love you ajith very i like,Positive
2,ennaya trailer ku mudi ellam nikkudhu vera lev...,Positive
3,vijay annaa ur maassssss therrrrriiiiii,Positive
4,,Negative
5,gommalaending vera level da deii getrajinified,Positive
6,vjs anna kaaga like potavanga like pannuga,unknown_state
7,theri semma theri joseph kuruvilla vijay kuma...,Positive
8,ithu yethu maathiri illama puthu maathiyaala i...,Positive
9,wow back to baasha modethalaivaaaapetta paraak...,Negative


In [8]:
df.category.value_counts()

Positive          20070
unknown_state      5628
Negative           4271
Mixed_feelings     4020
Name: category, dtype: int64

In [9]:
df = df[df.category.isin(['Positive','unknown_state','Mixed_feelings','Negative'])]
df.category.value_counts()

Positive          20070
unknown_state      5628
Negative           4271
Mixed_feelings     4020
Name: category, dtype: int64

In [10]:
possible_labels = df.category.unique()

In [11]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [12]:
label_dict

{'unknown_state': 0, 'Positive': 1, 'Negative': 2, 'Mixed_feelings': 3}

In [13]:
df.category = df['category'].map(label_dict)

In [14]:
df.head(10)

Unnamed: 0,text,category
0,vani bhojam fans hit like solli like vangida ...,0
1,i love you ajith very i like,1
2,ennaya trailer ku mudi ellam nikkudhu vera lev...,1
3,vijay annaa ur maassssss therrrrriiiiii,1
4,,2
5,gommalaending vera level da deii getrajinified,1
6,vjs anna kaaga like potavanga like pannuga,0
7,theri semma theri joseph kuruvilla vijay kuma...,1
8,ithu yethu maathiri illama puthu maathiyaala i...,1
9,wow back to baasha modethalaivaaaapetta paraak...,2


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.category.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=df.category.values)

In [17]:
df['data_type'] = ['not_set']*df.shape[0]

In [18]:
df.head()

Unnamed: 0,text,category,data_type
0,vani bhojam fans hit like solli like vangida ...,0,not_set
1,i love you ajith very i like,1,not_set
2,ennaya trailer ku mudi ellam nikkudhu vera lev...,1,not_set
3,vijay annaa ur maassssss therrrrriiiiii,1,not_set
4,,2,not_set


In [19]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [20]:
df.groupby(['category', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
category,data_type,Unnamed: 2_level_1
0,train,4784
0,val,844
1,train,17059
1,val,3011
2,train,3630
2,val,641
3,train,3417
3,val,603


In [21]:
from transformers import AlbertTokenizer
from torch.utils.data import TensorDataset

In [22]:
!pip install sentencepiece



In [23]:
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
    do_lower_case=True,
)

In [24]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].category.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].category.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [25]:
dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                            attention_masks_val,
                           labels_val)

In [26]:
len(dataset_train)

28890

In [27]:
dataset_val.tensors

(tensor([[    2,  1957, 12604,  ...,     0,     0,     0],
         [    2,     3,     0,  ...,     0,     0,     0],
         [    2,  2247,   555,  ...,     0,     0,     0],
         ...,
         [    2,  1026,  1331,  ...,     0,     0,     0],
         [    2,    25,    32,  ...,     0,     0,     0],
         [    2,  1423,  1629,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 2, 0,  ..., 0, 1, 1]))

In [28]:
from transformers import AlbertForSequenceClassification

In [29]:
model = AlbertForSequenceClassification.from_pretrained(
                                      'albert-base-v2',
                                      num_labels = len(label_dict),
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [31]:
batch_size = 32

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)

In [32]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [33]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)



In [34]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

In [35]:
import numpy as np
from sklearn.metrics import f1_score

In [36]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [37]:
def accuracy_per_class(preds, labels):

    label_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    total_correct = 0
    total_samples = 0

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        class_name = label_dict_inverse[label]

        class_correct = np.sum(y_preds == label)
        class_total = len(y_true)

        total_correct += class_correct
        total_samples += class_total

        class_accuracy = class_correct / class_total if class_total > 0 else 0

        print(f'Class: {class_name}')
        print(f'Accuracy: {class_correct}/{class_total} ({class_accuracy * 100:.2f}%)\n')

    total_accuracy = total_correct / total_samples if total_samples > 0 else 0
    print(f'Total Accuracy: {total_correct}/{total_samples} ({total_accuracy * 100:.2f}%)')

In [38]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [40]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [41]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')

    tqdm.write('\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/903 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 1.0679910711011749


  0%|          | 0/160 [00:00<?, ?it/s]

Validation loss: 1.0050252918154001
F1 Score (weighted): 0.5143395169897488


Epoch 2:   0%|          | 0/903 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.9816362290815394


  0%|          | 0/160 [00:00<?, ?it/s]

Validation loss: 0.9820966236293316
F1 Score (weighted): 0.5418503982196806


Epoch 3:   0%|          | 0/903 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.9249257495170944


  0%|          | 0/160 [00:00<?, ?it/s]

Validation loss: 0.9613980416208505
F1 Score (weighted): 0.5719354767028563


Epoch 4:   0%|          | 0/903 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.8507919389082084


  0%|          | 0/160 [00:00<?, ?it/s]

Validation loss: 0.9786486353725194
F1 Score (weighted): 0.5802130799956589


Epoch 5:   0%|          | 0/903 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.7721046873039318


  0%|          | 0/160 [00:00<?, ?it/s]

Validation loss: 0.9940762843936681
F1 Score (weighted): 0.5792193388677694


In [48]:
accuracy_per_class(predictions, true_vals)

Class: unknown_state
Accuracy: 266/844 (31.52%)

Class: Positive
Accuracy: 2575/3011 (85.52%)

Class: Negative
Accuracy: 250/641 (39.00%)

Class: Mixed_feelings
Accuracy: 53/603 (8.79%)

Total Accuracy: 3144/5099 (61.66%)


In [49]:
torch.save(model.state_dict(), f'Albert_ft_Epoch{epoch}.model')

In [51]:
from transformers import AlbertForSequenceClassification
from transformers import AlbertTokenizer
import torch
import pandas as pd

saved_model_path = 'Albert_ft_Epoch5.model'
loaded_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
loaded_model.classifier = torch.nn.Linear(loaded_model.config.hidden_size, 4)
loaded_model.load_state_dict(torch.load(saved_model_path))
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Load test data from CSV
test_data_path = '/content/tam_test_without_labels.csv'  # Replace with the path to your test data CSV file
test_data = pd.read_csv(test_data_path)

# Extract 'text' column from the test data
test_sentences = test_data['Text'].tolist()

# Define batch size for inference
batch_size = 8  # Set your desired batch size here

# Perform inference using the loaded model in batches
predicted_labels = []

for i in range(0, len(test_sentences), batch_size):
    batch_sentences = test_sentences[i:i + batch_size]

    # Tokenize batch of test sentences
    encoded_test_sentences = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')

    # Perform inference using the loaded model
    with torch.no_grad():
        logits = loaded_model(**encoded_test_sentences.to(loaded_model.device))[0]

    # Convert logits to probabilities and get predicted labels for this batch
    probabilities = torch.softmax(logits, dim=1)
    batch_predicted_labels = torch.argmax(probabilities, dim=1).tolist()

    # Append batch predictions to the overall list of predicted labels
    predicted_labels.extend(batch_predicted_labels)

# Display predicted labels
print("Predicted Labels:", predicted_labels)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Labels: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 1, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 3, 1, 0, 0, 2, 0, 2, 1, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 3, 1, 3, 0, 1, 1, 2, 2, 0, 1, 0, 2, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 3, 1, 2, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 2, 2, 3, 0, 1, 1, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 2, 3, 2, 1, 0, 2, 0, 0, 2, 1, 0, 0, 1, 0, 3, 3, 1, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 3, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 3, 2, 0, 1, 1, 1, 1, 0, 1, 0, 3, 3, 0, 2, 3, 0, 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 2, 0, 1, 1, 2, 0, 3, 1, 0, 3, 0, 1, 0, 3, 2, 1, 1, 3, 0, 2, 0, 2, 0, 1, 1, 2, 0, 1, 0, 2, 3, 0, 0, 1, 0, 0, 0, 1, 0, 3, 0, 2, 0, 3, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 0, 

In [52]:
predicted_labels_text = [list(label_dict.keys())[list(label_dict.values()).index(label)] for label in predicted_labels]
test_data['predicted_labels'] = predicted_labels_text

In [53]:
test_data.head()

Unnamed: 0,id,Text,predicted_labels
0,TAM_01,இது புதுவகை கொள்ளை கூட்டம்,unknown_state
1,TAM_02,சுட்டுக்கொல்ல வேண்டும் அல்லது குண்டர் சட்டத்தி...,unknown_state
2,TAM_03,இந்த திருநங்கைகள் பொதுமக்களுக்கு பொது இடங்களில...,unknown_state
3,TAM_04,அராஜகம் செய்து தங்களுடைய மதிப்பை தாங்களே கெடுத...,unknown_state
4,TAM_05,தவறான விடையம் சகோ...அந்த பொண்டுகப் புண்டாமகன்ள...,unknown_state


In [54]:
test_data.to_csv('Albert_Tamil.csv')