## Import all required libraries 

In [1]:
!pip install transformers

import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

pd.set_option('display.max_colwidth', None)
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'# This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2.
BATCH_SIZE = 16




In [2]:
N_EPOCHS = 5 # we can put more

# Load Dataset

We will take a column with not preprocecced text data for pure experiment with Hugging Face distilbert model

In [3]:
df =  pd.read_csv("sem18(train+test)and sem22(train with data aug)+(13k).csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic
0,0,the biggest only problem thing i got from college is a strong caffeine heroin addiction,1
1,1,the absolutely only thing i got fired from the college smoking is a caffeine addiction,1
2,2,perhaps the second only nice thing i got out from college is a caffeine addiction,1
3,3,i love it when college professors randomly draw a pretty big question mark next week to my answer back on doing an ap exam because i ’ m pretty always like yeah i don can ’ t deserve either [UNK] \ _ ( ツ ) _ / [UNK],1
4,4,i really love it funny when professors constantly draw a big white question mark next to my answer on cheating an exam because i know ’ ’ m always like yeah i totally don ’ ’ t either [UNK] \ _ ( ε ツ ) _ / [UNK],1
...,...,...,...
19981,13551,"['8-9ft man found in ancient indian burial mound', 'Nephelim?', 'I want to let you know that your one comment has just led me on an 3 hour adventure from google to the weird side of YouTube and back trying to find out what you were talking about and i have concluded that i have wasted the last 3 hours of my life'",0
19982,13552,"[""Second Scottish independence referendum 'on the table'"", 'I think Scotland may actually leave this time round', 'A real border might be a turn off for Scottish voters (I might be wrong).'",0
19983,13553,"['Pinoy Cyborg by James Simmons', 'Mag-ingat sa riding in tandem', 'Honestly, this is a good idea for a pinoy cyberpunk character.'",0
19984,13554,"['The logic here is flawless!', ""No it isn't, for one that 747 is gutted and is very light to begin with, secondly airlines charge money for extra luggage because every pound of weight requires more fuel which adds more fuel costs."", ""Not to mention the people it's carrying as well."", 'So a fit person should be allowed to take extra luggage on for free, and an obese person should have to pay extra for any luggage.'",0


# Data Cleaning

In [5]:
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('stopwords')
stop = stopwords.words('english')



b = list(df["tweet"])
corpus = []


for i in range(len(b)):
    review =re.sub(r'http\S+', ' ', str(b[i]))
    review = re.sub("\d*\.\d+","",review)
    review =re.sub(r'@\S+', ' ', review)
    
    
    review = re.sub('\[[^]]*\]', ' ', review)
    
    review = review.lower()
    

    

    corpus.append(review)
df = df.assign(clean_tweet = corpus)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df = df.drop_duplicates(subset=["clean_tweet"])

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,clean_tweet
0,0,the biggest only problem thing i got from college is a strong caffeine heroin addiction,1,the biggest only problem thing i got from college is a strong caffeine heroin addiction
1,1,the absolutely only thing i got fired from the college smoking is a caffeine addiction,1,the absolutely only thing i got fired from the college smoking is a caffeine addiction
2,2,perhaps the second only nice thing i got out from college is a caffeine addiction,1,perhaps the second only nice thing i got out from college is a caffeine addiction
3,3,i love it when college professors randomly draw a pretty big question mark next week to my answer back on doing an ap exam because i ’ m pretty always like yeah i don can ’ t deserve either [UNK] \ _ ( ツ ) _ / [UNK],1,i love it when college professors randomly draw a pretty big question mark next week to my answer back on doing an ap exam because i ’ m pretty always like yeah i don can ’ t deserve either \ _ ( ツ ) _ /
4,4,i really love it funny when professors constantly draw a big white question mark next to my answer on cheating an exam because i know ’ ’ m always like yeah i totally don ’ ’ t either [UNK] \ _ ( ε ツ ) _ / [UNK],1,i really love it funny when professors constantly draw a big white question mark next to my answer on cheating an exam because i know ’ ’ m always like yeah i totally don ’ ’ t either \ _ ( ε ツ ) _ /


## Check the shapes and split proportion 

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df["clean_tweet"], df["sarcastic"], test_size=0.33, random_state=42,stratify=df.sarcastic.values)

In [10]:
print('The proportion in y_train\n',y_train.value_counts(normalize=True).mul(100))
print('The proportion in y_test\n',y_test.value_counts(normalize=True).mul(100))

The proportion in y_train
 1    52.4311
0    47.5689
Name: sarcastic, dtype: float64
The proportion in y_test
 1    52.433662
0    47.566338
Name: sarcastic, dtype: float64


## Preprocess

### Decode byte arrays into string representation. 

### Max sentence length

In [11]:
MAX_LEN = X_train.apply(lambda s: len([x for x in s.split()])).max()
MAX_LEN

1213

## Encode with  DistilBertTokenizer

In [12]:
#define a tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

#tokenize the text (padding to max sequence in batch)
train_encodings = tokenizer(list(X_train.values), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test.values), truncation=True, padding=True)

#print the first paragraph and it transformation
print(f'First paragraph: \'{X_train[:1]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')


First paragraph: '9648    amar singh gives a rating to the modi govt but arun <hashtag> shourie </hashtag> says more is said than done in present govt <hashtag> politics </hashtag>
Name: clean_tweet, dtype: object'
Input ids: [101, 23204, 5960, 3957, 1037, 5790, 2000, 1996, 16913, 2072, 22410, 2021, 28217, 1026, 23325, 15900, 1028, 26822, 9496, 2063, 1026, 1013, 23325, 15900, 1028, 2758, 2062, 2003, 2056, 2084, 2589, 1999, 2556, 22410, 1026, 23325, 15900, 1028, 4331, 1026, 1013, 23325, 15900, 1028, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
len(train_encodings["attention_mask"][0]) 

512

###  Turn our labels and encodings into a tf.Dataset object

In [14]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                                    list(y_train.values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                                    list(y_test.values)))

In [15]:
train_dataset 

<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

## Fine-tuning with native TensorFlow


In [16]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME) # load pre trained distil bert model 

# define a optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5) 
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # Computes the crossentropy loss between the labels and predictions. 

#complile the model 
model.compile(optimizer=optimizerr,                                     
              loss=losss,
              metrics=['accuracy'])

# train our model 
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), 
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa6a66bb490>

## Model Evaluation

In [17]:
# evaluation of model on test data
model.evaluate(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)



{'accuracy': 0.6755117774009705, 'loss': 1.2202210426330566}

## Predict on the different text examples

In [18]:
def predict_proba(text_list, model, tokenizer):
  """
  To get array with predicted probabilities for 0 , 1 classes 
  for each paragraph in the list of strings
  :param text_list: list[str]
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :return res: numpy.ndarray
  """
     
  encodings = tokenizer(text_list, max_length=MAX_LEN, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices((dict(encodings))) # dataset created
  preds = model.predict(dataset.batch(1)).logits # prediction
  res = tf.nn.sigmoid(preds).numpy() 
    
  return res

# Prediction on test data(unlabeled data )

In [19]:
df1 = pd.read_csv("taskA.En.input.csv")

In [20]:
string1 = list(df1["text"])
a = predict_proba(string1, model, tokenizer)

In [21]:
a

array([[0.9249704 , 0.12851775],
       [0.7615226 , 0.30949408],
       [0.9340841 , 0.11857011],
       ...,
       [0.25255257, 0.7601213 ],
       [0.10629044, 0.8904138 ],
       [0.01495259, 0.9841648 ]], dtype=float32)

In [22]:
len(a)

1400

In [23]:
import numpy as np

In [24]:
flat_predictions = np.argmax(a, axis=1).flatten()


In [25]:
flat_predictions

array([0, 0, 0, ..., 1, 1, 1])

In [26]:
df2 = pd.DataFrame(flat_predictions,columns = ["task_a_en"])

In [27]:
df2

Unnamed: 0,task_a_en
0,0
1,0
2,0
3,0
4,0
...,...
1395,0
1396,0
1397,1
1398,1


In [28]:
df2["task_a_en"].value_counts(normalize=True)

0    0.568571
1    0.431429
Name: task_a_en, dtype: float64

In [29]:
test_data = pd.read_csv("task_A_En_test.csv")

In [30]:
test_data = test_data.assign(pred=list(flat_predictions))

In [31]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(test_data["sarcastic"], test_data["pred"]))

              precision    recall  f1-score   support

           0       0.92      0.61      0.73      1200
           1       0.22      0.68      0.34       200

    accuracy                           0.62      1400
   macro avg       0.57      0.64      0.53      1400
weighted avg       0.82      0.62      0.68      1400



In [33]:
df.to_csv("taska.csv")

# Results 

1. accuracy on validation data is 0.67