In [47]:
import platform
# Show which operating system the kernel is running on.
platform.system()

'Linux'

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [38]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

In [3]:
# lines=True: the file is read as one JSON object per line.
# NOTE(review): the absolute /content path is environment-specific — adjust it
# when running this notebook elsewhere.
df = pd.read_json('/content/Sarcasm_Headlines_Dataset_v2.json', lines=True)

In [4]:
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


Convert the data to lists.

In [5]:
# Headlines are the model inputs, the is_sarcastic flags are the targets;
# both are extracted as plain Python lists.
X = df['headline'].tolist()
y = df['is_sarcastic'].tolist()

Split the data into training and testing data.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Load the tokenizer associated with the model.

In [8]:
# Load the tokenizer published under the same checkpoint name that the model
# is loaded from later in this notebook.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

In [10]:
# truncation=True clips over-long inputs; padding=True pads every sequence in a
# call to the longest one in that call. The two splits are encoded in separate
# calls, so their padded lengths may differ.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [11]:
# Build (features, label) datasets: each element pairs one example's encoding
# fields (as a dict) with its integer label.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

# Same structure for the held-out split.
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

Create model for training.

In [19]:
# MODEL
# Training hyper-parameters, named so they are easy to find and tune.
BATCH_SIZE = 16
EPOCHS = 3

# Start from the SST-2 fine-tuned DistilBERT checkpoint and continue training
# it on the sarcasm labels.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Labels are integer class ids and the loss is fed raw logits (from_logits=True).
losses = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=losses, metrics=['accuracy'])

# The dataset is batched explicitly with .batch(); Keras documents that
# `batch_size` must not also be passed to fit() when the input is a Dataset,
# so it is omitted here. The shuffle buffer covers the whole training set.
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), epochs=EPOCHS)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8b2543b150>

In [20]:
# evaluating the model
# Aggregate loss/accuracy do not depend on example order, so the test set is
# not shuffled (that would only fill a buffer with the whole split). The
# dataset is already batched, so `batch_size` is not passed to evaluate().
model.evaluate(test_dataset.batch(16), return_dict=True)



{'accuracy': 0.9128232002258301, 'loss': 0.3242861330509186}

In [40]:
# function to return if the text is sarcastic or not
def pred_proba(text, model, tokenizer):
  """Predict the class label (per the dataset's `is_sarcastic` column) for each text.

  Despite the name, this returns predicted class indices, not probabilities.

  Args:
    text: A single string, or a list of strings, to classify.
    model: Model whose ``predict`` result exposes a ``.logits`` attribute with
      one row of class scores per input.
    tokenizer: Callable that turns a list of strings into a mapping of
      encoded fields accepted by ``model``.

  Returns:
    A 1-D numpy array holding the index of the highest-scoring class for each
    input text, in input order. An empty input yields an empty array.
  """
  # A bare string would be encoded as flat token lists, which
  # from_tensor_slices would then split token by token; wrap it so it is
  # handled as a batch containing one text.
  if isinstance(text, str):
    text = [text]

  # Nothing to classify: skip tokenization and inference entirely.
  if len(text) == 0:
    return np.array([], dtype=np.int64)

  encoding = tokenizer(text, padding=True, truncation=True)
  dataset = tf.data.Dataset.from_tensor_slices(dict(encoding))
  logits = model.predict(dataset.batch(1)).logits

  # Softmax is monotonic, so the argmax of the logits is the same class the
  # softmax probabilities would select; no extra layer is needed.
  return np.argmax(logits, axis=1)

In [41]:
# testing on a few sentences obtained online
sentences = ["mom starting to fear son's web series closest thing she will have to grandchild",
             "obama visits arlington national cemetery to honor veterans",
             "after careful consideration, bush recommends oil drilling"]

pred_proba(sentences, model, tokenizer)

array([1, 0, 1])

In [42]:
sentence = ['robin williams inflicted on holiday moviegoers for eighth straight year']
pred_proba(sentence, model, tokenizer)

array([1])

In [43]:
# saving the model.
import pickle
# NOTE(review): a pickle of the model object is tied to the library versions
# used to create it — consider the model's own save/export method if the
# artifact must be loaded in a different environment.
# Open the target file in a context manager so it is flushed and closed even
# if serialization fails part-way.
with open("Sarcasm_Classifier.pkl", "wb") as file:
    # dump the trained model to that file
    pickle.dump(model, file)





INFO:tensorflow:Assets written to: ram://79ebe8ce-0216-4f9e-a6c1-2eec4ff09d8e/assets


INFO:tensorflow:Assets written to: ram://79ebe8ce-0216-4f9e-a6c1-2eec4ff09d8e/assets
  return generic_utils.serialize_keras_object(obj)


The code below, which generates `requirements.txt`, is adapted from this [Stack Overflow answer](https://stackoverflow.com/questions/65674180/how-to-create-requirements-txt-in-python-with-actually-used-libraries).

In [45]:
from pip._internal.utils.misc import get_installed_distributions
import sys
#import numpy as np # imported to test whether numpy shows up, which it does!

def get_imported_packages():
    """List installed distributions that are currently imported in this session.

    A distribution is reported only when its key is exactly equal to a name in
    ``sys.modules``, so distributions whose import name differs from their
    distribution name are not picked up.

    NOTE(review): relies on pip's private ``get_installed_distributions``
    helper, which is not a public API — confirm it exists in the pip version
    in use.

    Returns:
        A list of ``(name, version)`` tuples, sorted by name so the generated
        output is stable between runs. ``pip`` itself is excluded.
    """
    installed = {dist.key: dist.version for dist in get_installed_distributions()}

    imported_modules = set(sys.modules)

    # discard() rather than remove(): excluding pip must not raise if it
    # happens not to be loaded.
    imported_modules.discard('pip')

    return sorted(
        (name, installed[name])
        for name in imported_modules
        if installed.get(name)
    )


def generate_requirements(filepath:str, modules):
    """Write pinned requirements to ``filepath``, one ``name==version`` per line.

    Args:
        filepath: Path of the requirements file; it is created or overwritten.
        modules: Iterable of ``(name, version)`` pairs.
    """
    with open(filepath, 'w') as f:
        # Each requirement needs its own line; without the newline all entries
        # run together into a single unparseable line.
        f.writelines(f"{module}=={version}\n" for module, version in modules)


# Write requirements.txt from the packages detected as imported in this session.
generate_requirements('requirements.txt', get_imported_packages())