## 1. Install libraries

In [None]:
!pip install transformers==2.9.0

Collecting transformers==2.9.0
  Downloading transformers-2.9.0-py3-none-any.whl (635 kB)
[?25l[K     |▌                               | 10 kB 18.8 MB/s eta 0:00:01[K     |█                               | 20 kB 11.0 MB/s eta 0:00:01[K     |█▌                              | 30 kB 9.2 MB/s eta 0:00:01[K     |██                              | 40 kB 4.5 MB/s eta 0:00:01[K     |██▋                             | 51 kB 4.4 MB/s eta 0:00:01[K     |███                             | 61 kB 5.2 MB/s eta 0:00:01[K     |███▋                            | 71 kB 5.4 MB/s eta 0:00:01[K     |████▏                           | 81 kB 5.4 MB/s eta 0:00:01[K     |████▋                           | 92 kB 6.0 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 5.2 MB/s eta 0:00:01[K     |█████▊                          | 112 kB 5.2 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 5.2 MB/s eta 0:00:01[K     |██████▊                         | 133 kB 5.2 MB/s

In [None]:
# Check that a GPU is available and how much memory it has
!nvidia-smi

Mon Apr 18 17:59:42 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 2. Prepare Model

In [None]:

import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch with the same value.

  Args:
    seed: integer seed passed unchanged to each library's seeding function.
  """
  # Same order as before: random, then NumPy, then PyTorch.
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

set_seed(42)

In [None]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
pretrained_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(pretrained_name)
t5_model = T5ForConditionalGeneration.from_pretrained(pretrained_name)


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# optimizer
# Parameters whose name contains one of these substrings are kept out of weight decay.
# "LayerNorm.weight" is the BERT-style name; Hugging Face's T5 calls its layer-norm
# parameters "...layer_norm.weight" (check with `t5_model.named_parameters()`), so that
# pattern is listed as well. Without it the T5 layer-norm weights fall into the decay group.
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]

# Decay applied to the first group. Kept at 0.0 so training behaves exactly as before;
# raise it (e.g. 0.01) to actually regularise the non-excluded weights.
weight_decay = 0.0

# Materialise once so both groups are built from the same (name, parameter) pairs.
named_params = list(t5_model.named_parameters())
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in named_params if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in named_params if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



## dataset preparation


In [None]:
import numpy as np
import pandas as pd
import os
import math

from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the data-loading cell below
# reads its CSV files from a 'drive/My Drive/...' path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read every file in the Twitter data folder into one DataFrame.
# The files are read without a header row; the six column names are supplied here.
data_dir = r'drive/My Drive/FYP/Twitter Data/'
column_names = ['sentiment', 'id1' , 'Date' , 'query' , 'name', 'text']

# os.listdir() returns entries in arbitrary order, so sort them to make the row order
# reproducible, and skip sub-directories (e.g. .ipynb_checkpoints) that read_csv cannot open.
file_paths = [
  os.path.join(data_dir, file_name)
  for file_name in sorted(os.listdir(data_dir))
  if os.path.isfile(os.path.join(data_dir, file_name))
]

list_of_df = [
  pd.read_csv(file_path, encoding='latin-1', names=column_names)
  for file_path in file_paths
]

# pd.concat raises ValueError if the folder contained no files.
data = pd.concat(list_of_df, ignore_index=True)

In [None]:
data

Unnamed: 0,sentiment,id1,Date,query,name,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1600010,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1600011,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1600012,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1600013,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [None]:
# Give the numeric sentiment labels readable names (0 -> "Negative", 4 -> "Positive").
# The result is assigned back to the column instead of calling
# data["sentiment"].replace(..., inplace=True): that chained in-place call acts on an
# intermediate Series and leaves `data` unchanged when pandas Copy-on-Write is active.
data["sentiment"] = data["sentiment"].replace({0: "Negative", 4: "Positive"})

In [None]:
# Shuffle all rows and renumber the index from 0.
# A fixed random_state makes the order identical every time this cell is run,
# regardless of how much of the global NumPy random state earlier cells consumed.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
data.head()

Unnamed: 0,sentiment,id1,Date,query,name,text
0,Positive,2177281690,Mon Jun 15 05:49:09 PDT 2009,NO_QUERY,domcoke,@tylermassey Would be great if you could make ...
1,Negative,2050064290,Fri Jun 05 18:00:12 PDT 2009,NO_QUERY,shelleywade,@dcgirl627 Sorry At least feel good that U h...
2,Negative,2072232319,Sun Jun 07 20:38:12 PDT 2009,NO_QUERY,marcixcore,does not want to go to school tomorrow
3,Negative,2188140221,Mon Jun 15 21:22:12 PDT 2009,NO_QUERY,Layming,@Joita_Jonas I Wish I Could go and buy It butt...
4,Negative,2254646743,Sat Jun 20 10:04:10 PDT 2009,NO_QUERY,sharifahaishah,"Caught the chicken pox. Goodbye outdoors, hell..."


In [None]:
# Copy the tweet text into a new "output" column (the original "text" column is dropped in a later cell).
data["output"] = data["text"]

In [None]:
data.head()

Unnamed: 0,sentiment,id1,Date,query,name,text,output
0,Positive,2177281690,Mon Jun 15 05:49:09 PDT 2009,NO_QUERY,domcoke,@tylermassey Would be great if you could make ...,@tylermassey Would be great if you could make ...
1,Negative,2050064290,Fri Jun 05 18:00:12 PDT 2009,NO_QUERY,shelleywade,@dcgirl627 Sorry At least feel good that U h...,@dcgirl627 Sorry At least feel good that U h...
2,Negative,2072232319,Sun Jun 07 20:38:12 PDT 2009,NO_QUERY,marcixcore,does not want to go to school tomorrow,does not want to go to school tomorrow
3,Negative,2188140221,Mon Jun 15 21:22:12 PDT 2009,NO_QUERY,Layming,@Joita_Jonas I Wish I Could go and buy It butt...,@Joita_Jonas I Wish I Could go and buy It butt...
4,Negative,2254646743,Sat Jun 20 10:04:10 PDT 2009,NO_QUERY,sharifahaishah,"Caught the chicken pox. Goodbye outdoors, hell...","Caught the chicken pox. Goodbye outdoors, hell..."


In [None]:
# Remove the original "text" column in place; its content now lives in "output".
del data["text"]

In [None]:
data.head()

Unnamed: 0,sentiment,id1,Date,query,name,output
0,Positive,2177281690,Mon Jun 15 05:49:09 PDT 2009,NO_QUERY,domcoke,@tylermassey Would be great if you could make ...
1,Negative,2050064290,Fri Jun 05 18:00:12 PDT 2009,NO_QUERY,shelleywade,@dcgirl627 Sorry At least feel good that U h...
2,Negative,2072232319,Sun Jun 07 20:38:12 PDT 2009,NO_QUERY,marcixcore,does not want to go to school tomorrow
3,Negative,2188140221,Mon Jun 15 21:22:12 PDT 2009,NO_QUERY,Layming,@Joita_Jonas I Wish I Could go and buy It butt...
4,Negative,2254646743,Sat Jun 20 10:04:10 PDT 2009,NO_QUERY,sharifahaishah,"Caught the chicken pox. Goodbye outdoors, hell..."


In [None]:
# Rename the label/text columns to source_text/target_text and keep only those two.
column_renames = {"sentiment":"source_text", "output":"target_text"}
data = data.rename(columns=column_renames)[['source_text', 'target_text']]

In [None]:
data.head()

Unnamed: 0,source_text,target_text
0,Positive,@tylermassey Would be great if you could make ...
1,Negative,@dcgirl627 Sorry At least feel good that U h...
2,Negative,does not want to go to school tomorrow
3,Negative,@Joita_Jonas I Wish I Could go and buy It butt...
4,Negative,"Caught the chicken pox. Goodbye outdoors, hell..."


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the split so the same rows land in each part on every run.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
train_df.shape, test_df.shape

((1280012, 2), (320003, 2))

In [None]:
# Keep only the first 100 rows (by position) of each split.
train_df, test_df = train_df.iloc[:100], test_df.iloc[:100]

In [None]:
# Convert the training frame to a NumPy record array, leaving the index out.
records = train_df.to_records(index=False)
# Plain Python list with one record per row; record[0] is source_text, record[1] is target_text.
result = list(records)

In [None]:

import re

# Matches an http/https URL anywhere in the text, up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+')


def _clean_tweet(text):
  """Return `text` with every "@" character and every http(s) URL removed."""
  without_at = text.replace("@", "")                                                      ## Data Cleaning -- Removing @
  return _URL_PATTERN.sub('', without_at)                                                 ## Data Cleaning -- Removing URLs


# Build a new list instead of calling result.remove() inside an index loop:
# removing while iterating over range(len(result)) skips rows and ends in IndexError.
cleaned_result = []
for row in result:
  sentiment, text = row[0], row[1]
  if sentiment == 'Positive':
    # Positive rows are dropped entirely.
    continue
  if sentiment == 'Negative':
    sentiment = 'You are good'
  cleaned_row = (sentiment, _clean_tweet(text))
  cleaned_result.append(cleaned_row)
  print (cleaned_row)

# `result` is now a list of (source_text, target_text) tuples.
result = cleaned_result

In [None]:
# true_false_adjective_tuples = [
#                                ("The cat is alive","The cat is dead"),
#                                ("The old woman is beautiful","The old woman is ugly"),
#                                ("The purse is expensive","The purse is cheap"),
#                                ("Her hair is curly","Her hair is straight"),
#                                ("The bathroom is clean","The bathroom is dirty"),
#                                ("The exam was easy","The exam was difficult"),
#                                ("The house is big","The house is small"),
#                                ("The house owner is good","The house owner is bad"),
#                                ("The little kid is fat","The little kid is thin"),
#                                ("She arrived early","She arrived late."),
#                                ("John is very hardworking","John is very lazy"),
#                                ("Imran khan is an honest man","Imran khan is an dishonest man"),
#                                ("That leader is corrupt","That leader is law-abiding"),
#                                ("Pakistan is a bautiful place","The fridge is full"),
#                                ("The fridge is empty","The fridge is full"),
#                                ("The fridge is empty","The fridge is full"),
#                                ("The fridge is empty","The fridge is full"),
#                                ("The fridge is empty","The fridge is full"),


# ]


# Training pairs for the "falsify" task: (input sentence, target sentence).
# The training loop prefixes the first element with "falsify: " and uses the
# second element as the sequence the model should produce.
true_false_adjective_tuples = [
    ("The cat is dead", "The cat is alive"),
    ("The old woman is ugly", "The old woman is beautiful"),
    ("The purse is cheap", "The purse is expensive"),
    ("Her hair is curly", "Her hair is straight"),
    ("The bathroom is dirty", "The bathroom is clean"),
    ("The exam was difficult", "The exam was easy"),
    ("The house is small", "The house is big"),
    ("The house owner is bad", "The house owner is good"),
    ("The little kid is fat", "The little kid is thin"),
    ("She arrived late.", "She arrived early"),
    ("John is very lazy", "John is very hardworking"),
    ("Imran khan is an dishonest man", "Imran khan is an honest man"),
    ("That leader is corrupt", "That leader is law-abiding"),
    ("Pakistan is a disgusting place", "Pakistan is a beautiful place"),
    ("They are playing bad football", "They are playing football nicely"),
    ("Atiqua is a bad girl.", "Atiqua is a good girl"),
    ("He rides the bike badly", "He rides the bike perfectly"),
    (
        "Rahul was sent back home as he was ill",
        "Rahul was not sent back home as he was doing fine",
    ),
    ("Priya cooks poorly", "Priya cooks well"),
    (
        "My mother was reading the book audibly",
        "My mother was reading the book silently",
    ),
    (
        "The children were not going to their aunt's house",
        "The children were going to their aunt's house",
    ),
    ("The officer arrested the culprits.", "The officer let go the victims"),
]

In [None]:
true_false_adjective_tuples

[('The cat is dead', 'The cat is alive'),
 ('The old woman is ugly', 'The old woman is beautiful'),
 ('The purse is cheap', 'The purse is expensive'),
 ('Her hair is curly', 'Her hair is straight'),
 ('The bathroom is dirty', 'The bathroom is clean'),
 ('The exam was difficult', 'The exam was easy'),
 ('The house is small', 'The house is big'),
 ('The house owner is bad', 'The house owner is good'),
 ('The little kid is fat', 'The little kid is thin'),
 ('She arrived late.', 'She arrived early'),
 ('John is very lazy', 'John is very hardworking'),
 ('Imran khan is an dishonest man', 'Imran khan is an honest man'),
 ('That leader is corrupt', 'That leader is law-abiding'),
 ('Pakistan is a disgusting place', 'Pakistan is a beautiful place'),
 ('They are playing bad football', 'They are playing football nicely'),
 ('Atiqua is a bad girl.', 'Atiqua is a good girl'),
 ('He rides the bike badly', 'He rides the bike perfectly'),
 ('Rahul was sent back home as he was ill',
  'Rahul was not se

## 3. Train Loop

In [None]:
# Fine-tune t5_model on the (input, target) sentence pairs, one pair per optimizer step.
t5_model.train()

epochs = 10

for epoch in range(epochs):
  print ("epoch ",epoch)
  # Loop names avoid shadowing the builtin `input` and are not rebound inside the body.
  for source_sent, target_sent in true_false_adjective_tuples:
    # Task prefix plus an explicit end-of-sequence marker on both sides.
    input_sent = "falsify: "+source_sent+ " </s>"
    output_sent = target_sent+" </s>"

    # Both sequences are padded to a fixed length of 96 tokens.
    tokenized_inp = tokenizer.encode_plus(input_sent,  max_length=96, pad_to_max_length=True,return_tensors="pt")
    tokenized_output = tokenizer.encode_plus(output_sent, max_length=96, pad_to_max_length=True,return_tensors="pt")

    input_ids  = tokenized_inp["input_ids"]
    attention_mask = tokenized_inp["attention_mask"]

    decoder_attention_mask=  tokenized_output["attention_mask"]
    # Set the padded label positions to -100 so the loss ignores them. Without this
    # the loss is averaged over all 96 positions and is dominated by pad tokens.
    lm_labels = tokenized_output["input_ids"].masked_fill(decoder_attention_mask == 0, -100)

    # the forward function automatically creates the correct decoder_input_ids
    model_outputs = t5_model(input_ids=input_ids, lm_labels=lm_labels,decoder_attention_mask=decoder_attention_mask,attention_mask=attention_mask)
    # With labels supplied, the first element of the returned tuple is the loss.
    loss = model_outputs[0]

    loss.backward()
    optimizer.step()
    # Clear gradients so they do not accumulate into the next pair's step.
    optimizer.zero_grad()




epoch  0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1050.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


epoch  1
epoch  2
epoch  3
epoch  4
epoch  5
epoch  6
epoch  7
epoch  8
epoch  9


## 4. Test model

In [None]:
# Encode one prompt, run beam search, and print the returned candidates.
test_sent = 'falsify: Thank you Karachi for your momentous & passionate support for our jalsa last night </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

test_input_ids, test_attention_mask = (
    test_tokenized[field] for field in ("input_ids", "attention_mask")
)

# Switch to evaluation mode before generating.
t5_model.eval()
generation_options = dict(
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=3,
    no_repeat_ngram_size=2,
)
beam_outputs = t5_model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    **generation_options
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print (sent)

  beam_id = beam_token_id // vocab_size


Thank you Karachi for your momentous & passionate support last night
Thank you Karachi for supporting our jalsa last night
Thank you Karachi for your momentous & passionate support


In [None]:
# Encode one prompt, run beam search, and print the returned candidates.
test_sent = 'falsify: This is a safe neighbourhood. </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

test_input_ids, test_attention_mask = (
    test_tokenized[field] for field in ("input_ids", "attention_mask")
)

# Switch to evaluation mode before generating.
t5_model.eval()
generation_options = dict(
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=2,
    no_repeat_ngram_size=2,
)
beam_outputs = t5_model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    **generation_options
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print (sent)

  beam_id = beam_token_id // vocab_size


This is a safe neighbourhood
It is a safe neighbourhood


In [None]:
# Encode one prompt, run beam search, and print the single best candidate.
test_sent = 'falsify: The tortoise was very slow. </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

test_input_ids, test_attention_mask = (
    test_tokenized[field] for field in ("input_ids", "attention_mask")
)

# Switch to evaluation mode before generating.
t5_model.eval()
generation_options = dict(
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
)
beam_outputs = t5_model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    **generation_options
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print (sent)

  beam_id = beam_token_id // vocab_size


The tortoise was very fast
