<a href="https://colab.research.google.com/github/Noorbaignuroo/Data-Analyst-YoungDev/blob/main/Natural_Language_Processing_(NLP)_Projects_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
!pip install torch
!pip install transformers
!pip install tensorflow




**Import Libraries**

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
import pandas as pd


**Load IMDb Dataset**

In [3]:
df = pd.read_csv('/content/IMDB Dataset.csv')


**Tokenize and Pad Sequences**

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

max_len = 128
tokenized_texts = [tokenizer.tokenize(sent) for sent in df['review']]
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

**Create Attention Masks**

In [5]:
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]


**Split Data into Training and Validation Sets**

In [7]:
print(df.columns)



Index(['review', 'sentiment'], dtype='object')


In [8]:
# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, df['sentiment'].values, random_state=42, test_size=0.1)
train_masks, val_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=42, test_size=0.1)



**Convert to PyTorch Tensors**

In [11]:
print(type(train_inputs), type(train_labels), type(val_inputs), type(val_labels), type(train_masks), type(val_masks))


<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'list'> <class 'list'>


**Create DataLoaders**

In [13]:
# Map string labels to numerical values
label_mapping = {'positive': 1, 'negative': 0}

# Convert string labels to numerical values
train_labels = np.array([label_mapping[label] for label in train_labels])
val_labels = np.array([label_mapping[label] for label in val_labels])

# Convert to PyTorch tensors
train_inputs, train_labels = torch.tensor(train_inputs.astype(np.float32)), torch.tensor(train_labels.astype(np.int64))
val_inputs, val_labels = torch.tensor(val_inputs.astype(np.float32)), torch.tensor(val_labels.astype(np.int64))
train_masks, val_masks = torch.tensor(train_masks.astype(np.float32)), torch.tensor(val_masks.astype(np.float32))


In [14]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=32, shuffle=False)


**Initialize BERT Model and Optimizer**

In [20]:
from torch.optim import AdamW

# Initialize the optimizer using torch.optim.AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)




In [26]:
inputs = {'input_ids': batch[0].long(),  # Convert to LongTensor (int64)
          'attention_mask': batch[1],
          'labels': batch[2].long()}  # Convert to LongTensor (int64)


**Fine-tune BERT Model**

In [27]:
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc="Training"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0].long(),  # Convert to LongTensor (int64)
                  'attention_mask': batch[1],
                  'labels': batch[2].long()}  # Convert to LongTensor (int64)
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0].long(),  # Convert to LongTensor (int64)
                      'attention_mask': batch[1],
                      'labels': batch[2].long()}  # Convert to LongTensor (int64)
            outputs = model(**inputs)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss}")


Training: 100%|██████████| 1407/1407 [16:10<00:00,  1.45it/s]
Validation: 100%|██████████| 157/157 [00:35<00:00,  4.40it/s]


Epoch 1/3, Validation Loss: 0.32676251545833174


Training: 100%|██████████| 1407/1407 [16:07<00:00,  1.45it/s]
Validation: 100%|██████████| 157/157 [00:35<00:00,  4.40it/s]


Epoch 2/3, Validation Loss: 0.2687082111740568


Training: 100%|██████████| 1407/1407 [16:08<00:00,  1.45it/s]
Validation: 100%|██████████| 157/157 [00:35<00:00,  4.39it/s]

Epoch 3/3, Validation Loss: 0.3190653284264218





**Evaluate the Model**

In [29]:
inputs = {'input_ids': batch[0].long(),  # Convert to LongTensor (int64)
          'attention_mask': batch[1]}


In [31]:
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0].long(),  # Convert to LongTensor (int64)
                  'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(val_labels.numpy(), predictions)
print(f"Accuracy: {accuracy}")



Predicting: 100%|██████████| 157/157 [00:37<00:00,  4.23it/s]

Accuracy: 0.8918



