## Import all required libraries 

In [6]:
!pip install transformers

import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

pd.set_option('display.max_colwidth', None)
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'# This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2.
BATCH_SIZE = 16




In [7]:
N_EPOCHS = 5 # we can put more

# Load Dataset

We will take a column with not preprocecced text data for pure experiment with Hugging Face distilbert model

In [8]:
df =  pd.read_csv("sem18(train+test)and sem22(train with data aug)+(13k).csv")

In [9]:
df

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic
0,0,the biggest only problem thing i got from college is a strong caffeine heroin addiction,1
1,1,the absolutely only thing i got fired from the college smoking is a caffeine addiction,1
2,2,perhaps the second only nice thing i got out from college is a caffeine addiction,1
3,3,i love it when college professors randomly draw a pretty big question mark next week to my answer back on doing an ap exam because i ’ m pretty always like yeah i don can ’ t deserve either [UNK] \ _ ( ツ ) _ / [UNK],1
4,4,i really love it funny when professors constantly draw a big white question mark next to my answer on cheating an exam because i know ’ ’ m always like yeah i totally don ’ ’ t either [UNK] \ _ ( ε ツ ) _ / [UNK],1
...,...,...,...
19981,13551,"['8-9ft man found in ancient indian burial mound', 'Nephelim?', 'I want to let you know that your one comment has just led me on an 3 hour adventure from google to the weird side of YouTube and back trying to find out what you were talking about and i have concluded that i have wasted the last 3 hours of my life'",0
19982,13552,"[""Second Scottish independence referendum 'on the table'"", 'I think Scotland may actually leave this time round', 'A real border might be a turn off for Scottish voters (I might be wrong).'",0
19983,13553,"['Pinoy Cyborg by James Simmons', 'Mag-ingat sa riding in tandem', 'Honestly, this is a good idea for a pinoy cyberpunk character.'",0
19984,13554,"['The logic here is flawless!', ""No it isn't, for one that 747 is gutted and is very light to begin with, secondly airlines charge money for extra luggage because every pound of weight requires more fuel which adds more fuel costs."", ""Not to mention the people it's carrying as well."", 'So a fit person should be allowed to take extra luggage on for free, and an obese person should have to pay extra for any luggage.'",0


# Data Cleaning

In [10]:
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('stopwords')
stop = stopwords.words('english')



b = list(df["tweet"])
corpus = []


for i in range(len(b)):
    review =re.sub(r'http\S+', ' ', str(b[i]))
    review = re.sub("\d*\.\d+","",review)
    review =re.sub(r'@\S+', ' ', review)
    
    
    review = re.sub('\[[^]]*\]', ' ', review)
    
    review = review.lower()
    

    

    corpus.append(review)
df = df.assign(clean_tweet = corpus)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df = df.drop_duplicates(subset=["clean_tweet"])

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,clean_tweet
0,0,the biggest only problem thing i got from college is a strong caffeine heroin addiction,1,the biggest only problem thing i got from college is a strong caffeine heroin addiction
1,1,the absolutely only thing i got fired from the college smoking is a caffeine addiction,1,the absolutely only thing i got fired from the college smoking is a caffeine addiction
2,2,perhaps the second only nice thing i got out from college is a caffeine addiction,1,perhaps the second only nice thing i got out from college is a caffeine addiction
3,3,i love it when college professors randomly draw a pretty big question mark next week to my answer back on doing an ap exam because i ’ m pretty always like yeah i don can ’ t deserve either [UNK] \ _ ( ツ ) _ / [UNK],1,i love it when college professors randomly draw a pretty big question mark next week to my answer back on doing an ap exam because i ’ m pretty always like yeah i don can ’ t deserve either \ _ ( ツ ) _ /
4,4,i really love it funny when professors constantly draw a big white question mark next to my answer on cheating an exam because i know ’ ’ m always like yeah i totally don ’ ’ t either [UNK] \ _ ( ε ツ ) _ / [UNK],1,i really love it funny when professors constantly draw a big white question mark next to my answer on cheating an exam because i know ’ ’ m always like yeah i totally don ’ ’ t either \ _ ( ε ツ ) _ /


## Check the shapes and split proportion 

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df["clean_tweet"], df["sarcastic"], test_size=0.33, random_state=42,stratify=df.sarcastic.values)

In [15]:
print('The proportion in y_train\n',y_train.value_counts(normalize=True).mul(100))
print('The proportion in y_test\n',y_test.value_counts(normalize=True).mul(100))

The proportion in y_train
 1    52.4311
0    47.5689
Name: sarcastic, dtype: float64
The proportion in y_test
 1    52.433662
0    47.566338
Name: sarcastic, dtype: float64


## Preprocess

### Decode byte arrays into string representation. 

### Max sentence length

In [16]:
MAX_LEN = X_train.apply(lambda s: len([x for x in s.split()])).max()
MAX_LEN

1213

## Encode with  DistilBertTokenizer

In [17]:
#define a tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

#tokenize the text (padding to max sequence in batch)
train_encodings = tokenizer(list(X_train.values), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test.values), truncation=True, padding=True)

#print the first paragraph and it transformation
print(f'First paragraph: \'{X_train[:1]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

First paragraph: '9648    amar singh gives a rating to the modi govt but arun <hashtag> shourie </hashtag> says more is said than done in present govt <hashtag> politics </hashtag>
Name: clean_tweet, dtype: object'
Input ids: [101, 23204, 5960, 3957, 1037, 5790, 2000, 1996, 16913, 2072, 22410, 2021, 28217, 1026, 23325, 15900, 1028, 26822, 9496, 2063, 1026, 1013, 23325, 15900, 1028, 2758, 2062, 2003, 2056, 2084, 2589, 1999, 2556, 22410, 1026, 23325, 15900, 1028, 4331, 1026, 1013, 23325, 15900, 1028, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
len(train_encodings["attention_mask"][0]) 

512

###  Turn our labels and encodings into a tf.Dataset object

In [19]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                                    list(y_train.values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                                    list(y_test.values)))

In [20]:
train_dataset 

<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

## Fine-tuning with native TensorFlow


In [21]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME) # load pre trained distil bert model 

# define a optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5) 
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # Computes the crossentropy loss between the labels and predictions. 

#complile the model 
model.compile(optimizer=optimizerr,                                     
              loss=losss,
              metrics=['accuracy'])

# train our model 
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), 
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa353de3050>

## Model Evaluation

In [22]:
# evaluation of model on test data
model.evaluate(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)



{'accuracy': 0.6821834444999695, 'loss': 0.970830500125885}

## Predict on the different text examples

In [None]:
def predict_proba(text_list, model, tokenizer):
  """
  To get array with predicted probabilities for 0 , 1 classes 
  for each paragraph in the list of strings
  :param text_list: list[str]
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :return res: numpy.ndarray
  """
     
  encodings = tokenizer(text_list, max_length=MAX_LEN, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices((dict(encodings))) # dataset created
  preds = model.predict(dataset.batch(1)).logits # prediction
  res = tf.nn.sigmoid(preds).numpy() 
    
  return res

# Prediction on test data(unlabeled data )

In [24]:
df1 = pd.read_csv("taskA.En.input.csv")

In [25]:
string1 = list(df1["text"])
a = predict_proba(string1, model, tokenizer)

In [None]:
a

In [None]:
len(a)

In [None]:
import numpy as np

In [29]:
flat_predictions = np.argmax(a, axis=1).flatten()


In [30]:
flat_predictions

array([0, 0, 0, ..., 1, 1, 0])

In [31]:
df2 = pd.DataFrame(flat_predictions,columns = ["task_a_en"])

In [32]:
df2

Unnamed: 0,task_a_en
0,0
1,0
2,0
3,1
4,0
...,...
1395,0
1396,0
1397,1
1398,1


In [33]:
df2["task_a_en"].value_counts(normalize=True)

0    0.582857
1    0.417143
Name: task_a_en, dtype: float64

In [34]:
test_data = pd.read_csv("task_A_En_test.csv")

In [35]:
test_data = test_data.assign(pred=list(flat_predictions))

In [36]:
from sklearn.metrics import classification_report

In [37]:
print(classification_report(test_data["sarcastic"], test_data["pred"]))

              precision    recall  f1-score   support

           0       0.92      0.62      0.74      1200
           1       0.23      0.66      0.34       200

    accuracy                           0.63      1400
   macro avg       0.57      0.64      0.54      1400
weighted avg       0.82      0.63      0.68      1400



In [38]:
df.to_csv("taska.csv")

# Results 

1. accuracy on validation data is 0.67

In [39]:
task_c=pd.read_csv("task_C_En_test.csv")

In [40]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
ps = PorterStemmer()


b = list(task_c["text_0"])


wordnet=WordNetLemmatizer()
nltk.download('wordnet')
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')
import string
exclude = set(string.punctuation)
corpus = []
for i in range(len(b)):
    review =re.sub(r'http\S+', ' ', str(b[i]))
    review = re.sub("\d*\.\d+","",review)
    review =re.sub(r'@\S+', ' ', review)
    
    TAG_RE = re.compile(r'<[^>]+>')
    review = TAG_RE.sub('', review)
    
    review = re.sub('\[[^]]*\]', ' ', review)
    
    review = review.lower()
    review = review.split()
    
    review = ' '.join(review)

    

    corpus.append(review)
task_c = task_c.assign(clean_text0 = corpus)


import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
ps = PorterStemmer()


b = list(task_c["text_1"])


wordnet=WordNetLemmatizer()
nltk.download('wordnet')
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')
import string
exclude = set(string.punctuation)
corpus = []
for i in range(len(b)):
    review =re.sub(r'http\S+', ' ', str(b[i]))
    review = re.sub("\d*\.\d+","",review)
    review =re.sub(r'@\S+', ' ', review)
    
    TAG_RE = re.compile(r'<[^>]+>')
    review = TAG_RE.sub('', review)
    
    review = re.sub('\[[^]]*\]', ' ', review)
    
    review = review.lower()
    review = review.split()
    
    review = ' '.join(review)

    

    corpus.append(review)
task_c = task_c.assign(clean_text1 = corpus)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
task_c

Unnamed: 0,text_0,text_1,sarcastic_id,clean_text0,clean_text1
0,I see that your team played well today!,I'm sorry that your team didn't win yesterday.,0,i see that your team played well today!,i'm sorry that your team didn't win yesterday.
1,"Anthony Taylor is such a fair referee, I wish he was put in charge of more chelsea matches",I hope Anthony Taylor is never put in charge of a Chelsea match again,0,"anthony taylor is such a fair referee, i wish he was put in charge of more chelsea matches",i hope anthony taylor is never put in charge of a chelsea match again
2,"the weather is gloomy, just raining and dull.",What a glorious weather today,1,"the weather is gloomy, just raining and dull.",what a glorious weather today
3,People going out to get there boosters without thinking if they a) need them b) offer a reasonable level of protection c) Are safe long term. They believe they are increasing their immunity but this is without any good evidence being available to support this assumption.,Nice to see the sheep getting their boosters to fully protect themselves,1,people going out to get there boosters without thinking if they a) need them b) offer a reasonable level of protection c) are safe long term. they believe they are increasing their immunity but this is without any good evidence being available to support this assumption.,nice to see the sheep getting their boosters to fully protect themselves
4,"Really great weather we're having, love a bit of January heat and sunshine. Almost as good as Newcastle's form...",Really cold January so far - looking forward to some warmer and brighter days ahead! #notafanofwinter,0,"really great weather we're having, love a bit of january heat and sunshine. almost as good as newcastle's form...",really cold january so far - looking forward to some warmer and brighter days ahead! #notafanofwinter
...,...,...,...,...,...
195,"the tories betrayed the nation, what a surprise!","the tories betrayed the nation, as expected",0,"the tories betrayed the nation, what a surprise!","the tories betrayed the nation, as expected"
196,Cant believe we have to spend the rest of our lives waiting for the weekend and then spend the rest of our weekends dreading the week,Cant wait to spend the rest of my life waiting for the weekend and then spend the rest of my weekends dreading the week,1,cant believe we have to spend the rest of our lives waiting for the weekend and then spend the rest of our weekends dreading the week,cant wait to spend the rest of my life waiting for the weekend and then spend the rest of my weekends dreading the week
197,Isn't it just amazing how competent the government are. You know everything is fine when we have a trustworthy PM who doesn't blatantly lie through his teeth. Everything is so great right now!!,"Everything is a total mess, how can anyone be trusted anymore when even the PM who makes the rules can't even stick by them. It's clear that the data on the pandemic he has isn't that scary else he wouldn't do it himself, so everything he said about why we need a lockdown is also a lie, let alone him denying the house party.",0,isn't it just amazing how competent the government are. you know everything is fine when we have a trustworthy pm who doesn't blatantly lie through his teeth. everything is so great right now!!,"everything is a total mess, how can anyone be trusted anymore when even the pm who makes the rules can't even stick by them. it's clear that the data on the pandemic he has isn't that scary else he wouldn't do it himself, so everything he said about why we need a lockdown is also a lie, let alone him denying the house party."
198,Thanks Boris Johnson for restricting travel abroad (again) due to the new virus strain and your concern for the public health. I heard next week your banning fast food beacuse thats just as dangerous right?,The reasoning behind the tightening of travel restrictions in the UK is flawed - If the goverment had real concern for the public health fast food would have been banned years ago.,0,thanks boris johnson for restricting travel abroad (again) due to the new virus strain and your concern for the public health. i heard next week your banning fast food beacuse thats just as dangerous right?,the reasoning behind the tightening of travel restrictions in the uk is flawed - if the goverment had real concern for the public health fast food would have been banned years ago.


In [42]:
string1 = list(task_c["clean_text0"])
a = predict_proba(string1, model, tokenizer)

In [43]:
predictions = []
predictions.append(a)
flat_predictions0 = [np.max(item) for sublist in predictions for item in sublist]

In [44]:
string1 = list(task_c["clean_text1"])
a = predict_proba(string1, model, tokenizer)

In [45]:
predictions = []
predictions.append(a)
flat_predictions1 = [np.max(item) for sublist in predictions for item in sublist]

In [46]:
task_c_en = []
for i in range(200):
  if flat_predictions0[i]>flat_predictions1[i]:
    task_c_en.append(0)
  if flat_predictions0[i]<flat_predictions1[i]:
    task_c_en.append(1)
task_c_en

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1]

In [47]:
print(classification_report(list(task_c["sarcastic_id"]), task_c_en))

              precision    recall  f1-score   support

           0       0.61      0.53      0.57       107
           1       0.53      0.61      0.57        93

    accuracy                           0.57       200
   macro avg       0.57      0.57      0.57       200
weighted avg       0.58      0.57      0.57       200

              precision    recall  f1-score   support

           0       0.61      0.53      0.57       107
           1       0.53      0.61      0.57        93

    accuracy                           0.57       200
   macro avg       0.57      0.57      0.57       200
weighted avg       0.58      0.57      0.57       200

