# Sentiment Analysis

Blog: [Sentiment Analysis 101](https://blog.pjjop.org/sentiment-analysis-101/?fbclid=IwAR3C1AhPnIxIm7_uZlbbXkS3lbnPL71IOPMAvd1eDqHk-PFNzAm5nEfIT1M)
Drive: [Dataset and Notebook](https://drive.google.com/drive/folders/1KhpTSekIG9VOLudh9UoTQQHu1mVK78t8?fbclid=IwAR0y8M99s2YXcXD87UyhAoaSLSXZ90totktEpMFrrieAdvM8oKvrT90G6Y4)

In [109]:
# Data processing
import pandas as pd
from datasets import load_dataset

# GZIP implementation
from collections import Counter
import gzip
import multiprocessing as mp
import os.path as op
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
# import dataloader 
from torch.utils.data import Dataset, DataLoader

# Pythainlp
from pythainlp.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer

In [92]:
# load dataset
dataset = load_dataset("wisesight_sentiment")
# get df from dataset
df_train = pd.DataFrame(dataset['train']).rename(columns={'category': 'label'})
df_valid = pd.DataFrame(dataset['validation']).rename(columns={'category': 'label'})
df_test = pd.DataFrame(dataset['test']).rename(columns={'category': 'label'})

In [116]:
# check data
df_train.head().to_clipboard()

## Pytorch Implementation RNN

In [115]:
class BiRNN(nn.Module):
	def __init__(self, vocab_size, max_length, num_classes):
		super(BiRNN, self).__init__()

		self.embedding = nn.Embedding(vocab_size, 128)
		self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True)
		self.fc1 = nn.Linear(256, 128)# 2 for bidirectional 
		self.dropout1 = nn.Dropout(0.5)
		self.fc2 = nn.Linear(128, 64)
		self.dropout2 = nn.Dropout(0.5)
		self.batchNorm = nn.BatchNorm1d(64)
		self.fc3 = nn.Linear(64, num_classes)

	def forward(self, x):
		x = self.embedding(x)
		x, _ = self.gru(x)
		x = x[:, -1, :]
		x = F.relu(self.fc1(x))
		x = self.dropout1(x)
		x = F.relu(self.fc2(x))
		x = self.dropout2(x)
		x = self.batchNorm(x)
		x = self.fc3(x)
		x = F.softmax(x, dim=1)
		return x

# Instantiate the model
vocab_size = 10000
max_length = 100
num_classes = 10

model = BiRNN(vocab_size, max_length, num_classes)

In [127]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
from pythainlp.tokenize import word_tokenize

# Preprocess the text: tokenize and encode
def tokenize_text(text):
	return word_tokenize(text)

tokens = []
for _, row in df_train.iterrows():
	tokens.extend(tokenize_text(row['texts']))

counter = Counter(tokens)
vocab = {word: i+2 for i, (word, _) in enumerate(counter.most_common(vocab_size-2))} # -2 for <unk> and <pad>
vocab['<pad>'] = 0
vocab['<unk>'] = 1

def encode_text(text):
	return [vocab.get(token, vocab['<unk>']) for token in tokenize_text(text)] # vocab['<unk>'] is for unknown tokens

# Encode labels
le = LabelEncoder()
df_train['label'] = le.fit_transform(df_train['label'])
df_valid['label'] = le.transform(df_valid['label'])
df_test['label'] = le.transform(df_test['label'])

# Create custom dataset
class TextDataset(Dataset):
	def __init__(self, df):
		self.df = df

	def __len__(self):
		return len(self.df)

	def __getitem__(self, idx):
		text = self.df.iloc[idx]['texts']
		label = self.df.iloc[idx]['label']
		return torch.tensor(encode_text(text)), torch.tensor(label)

# Create data loaders
batch_size = 64
train_data = TextDataset(df_train)
valid_data = TextDataset(df_valid)
test_data = TextDataset(df_test)

def collate_batch(batch):
	label_list, text_list, lengths = [], [], []
	for (_text, _label) in batch:
		label_list.append(_label)
		text_list.append(_text)
		lengths.append(len(_text))
	label_list = torch.tensor(label_list)
	text_list = pad_sequence(text_list, padding_value=vocab['<pad>'])
	lengths = torch.tensor(lengths)
	return text_list.transpose(0, 1), label_list, lengths


train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, iterator, optimizer, criterion):
	epoch_loss = 0
	epoch_acc = 0
	model.train()
	for texts, labels, _ in iterator:  # ignore lengths
		texts, labels = texts.to(device), labels.to(device)
		optimizer.zero_grad()
		outputs = model(texts)
		loss = criterion(outputs, labels)
		_, preds = torch.max(outputs, 1)
		correct = (preds == labels).float()
		acc = correct.sum() / len(correct)
		loss.backward()
		optimizer.step()
		epoch_loss += loss.item()
		epoch_acc += acc.item()
	return epoch_loss / len(iterator), epoch_acc / len(iterator)

n_epochs = 10
for epoch in range(n_epochs):
	train_loss, train_acc = train(model, train_loader, optimizer, criterion)
	print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%')


KeyboardInterrupt



In [126]:
# check if i have cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
def evaluate(model, iterator, criterion):
	epoch_loss = 0
	epoch_acc = 0
	model.eval()
	with torch.no_grad():
		for texts, labels, _ in iterator:  # ignore lengths
			texts, labels = texts.to(device), labels.to(device)
			outputs = model(texts)
			loss = criterion(outputs, labels)
			_, preds = torch.max(outputs, 1)
			correct = (preds == labels).float()
			acc = correct.sum() / len(correct)
			epoch_loss += loss.item()
			epoch_acc += acc.item()
	return epoch_loss / len(iterator), epoch_acc / len(iterator)

valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
print(f'Valid Loss: {valid_loss:.3f}, Valid Acc: {valid_acc*100:.2f}%')

## GZIP Implementation

In [119]:
# remove short word from texts
df_train['texts'] = df_train['texts'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# drop empty texts
df_train = df_train[df_train['texts'].map(len) > 0]

Unnamed: 0,texts,label
0,ไปจองมาแล้วนาจา Mitsubishi Attrage ได้หลังสงกร...,1
1,เปิดศักราชใหม่! นายกฯ แถลงข่าวก่อนการแข่งขันศึ...,1
2,บัตรสมาชิกลดได้อีกไหมคับ,1
3,สนใจ new mazda2ครับ,1
4,😍😍,0
...,...,...
21623,ไม่ค่อยอยากกินเล๊ย💘,0
21624,คิดถึงแม่รุ้งอีกแล้ว,0
21625,วันนี้อะไปลองมาละบลัช 4u2 สีที่จะเอาหมดอีก โอย...,2
21626,ตัวอยู่พฤกษาใจไปแสนสิริ5555555,1


In [120]:
def process_dataset_subset(df_train_subset, test_text, c_test_text, d):
	distances_to_test = []
	for row_train in df_train_subset.iterrows():
		index = row_train[0]
		train_text = row_train[1]["texts"]
		c_train_text = d[index]

		train_plus_test = " ".join([test_text, train_text])
		c_train_plus_test = len(gzip.compress(train_plus_test.encode()))

		ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
		        / max(c_test_text, c_train_text) )

		distances_to_test.append(ncd)

	return distances_to_test


def divide_range_into_chunks(start, end, num_chunks):
	chunk_size = (end - start) // num_chunks
	ranges = [(i, i + chunk_size) for i in range(start, end, chunk_size)]
	ranges[-1] = (ranges[-1][0], end)  # Ensure the last chunk includes the end
	return ranges

num_processes = mp.cpu_count()
k = 2
predicted_classes = []

start = 0
end = df_train.shape[0]
ranges = divide_range_into_chunks(start, end, num_chunks=num_processes)


# caching compressed training examples
d = {}
for i, row_train in enumerate(df_train.iterrows()):
    train_text = row_train[1]["texts"]
    train_label = row_train[1]["label"]
    c_train_text = len(gzip.compress(train_text.encode()))
    d[i] = c_train_text

# main loop
for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):

    test_text = row_test[1]["texts"]
    test_label = row_test[1]["label"]
    c_test_text = len(gzip.compress(test_text.encode()))
    all_train_distances_to_test = []

    # parallelize iteration over training set into num_processes chunks
    with Parallel(n_jobs=num_processes, backend="loky") as parallel:

        results = parallel(
            delayed(process_dataset_subset)(df_train[range_start:range_end], test_text, c_test_text, d)
            for range_start, range_end in ranges
        )
        for p in results:
            all_train_distances_to_test.extend(p)

    sorted_idx = np.argsort(np.array(all_train_distances_to_test.extend))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]
    predicted_classes.append(predicted_class)

print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))

  0%|          | 0/2669 [00:02<?, ?it/s]


KeyError: 21623