In [3]:
pip install transformers



In [4]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd

In [5]:
# Load the news dataset from a CSV file in the current working directory and preview the first rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index written by an
# earlier to_csv call — confirm, and consider reading with index_col=0 if so.
df = pd.read_csv("New_news_classification_dataset.csv")
df.head()

Unnamed: 0,category,headline,short_description
0,"Arts, Culture and Religion",Eddie Huang Respectfully Schools Trump Support...,"""I would disagree with you when you say this i..."
1,"Arts, Culture and Religion",Chimamanda Ngozi Adichie Says The American Lef...,“The left is very cannibalistic. It eats its o...
2,"Arts, Culture and Religion",Martin Scorsese To Direct Movie About Jesuit M...,
3,"Arts, Culture and Religion",Dana Schutz's 'Piano In The Rain' Debuts At Fr...,"""Piano in the Rain"" imbues bizarre scenarios w..."
4,"Arts, Culture and Religion",Apichatpong Werasethakul's 'Cemetery of Splend...,To watch an Apichatpong Werasethakul film is t...


In [6]:
# Model input text: headline followed by the short description, separated by one space.
# Missing values become empty strings so the concatenation never produces NaN.
headline_text = df['headline'].fillna('')
description_text = df['short_description'].fillna('')
df['headline&description'] = headline_text + " " + description_text

# Integer class id for every row, taken from the categorical encoding of `category`.
df['encoded_cat'] = pd.Categorical(df['category']).codes

In [7]:
# Preview the frame again to check the two columns added above
# ('headline&description' and 'encoded_cat').
df.head()

Unnamed: 0,category,headline,short_description,headline&description,encoded_cat
0,"Arts, Culture and Religion",Eddie Huang Respectfully Schools Trump Support...,"""I would disagree with you when you say this i...",Eddie Huang Respectfully Schools Trump Support...,0
1,"Arts, Culture and Religion",Chimamanda Ngozi Adichie Says The American Lef...,“The left is very cannibalistic. It eats its o...,Chimamanda Ngozi Adichie Says The American Lef...,0
2,"Arts, Culture and Religion",Martin Scorsese To Direct Movie About Jesuit M...,,Martin Scorsese To Direct Movie About Jesuit M...,0
3,"Arts, Culture and Religion",Dana Schutz's 'Piano In The Rain' Debuts At Fr...,"""Piano in the Rain"" imbues bizarre scenarios w...",Dana Schutz's 'Piano In The Rain' Debuts At Fr...,0
4,"Arts, Culture and Religion",Apichatpong Werasethakul's 'Cemetery of Splend...,To watch an Apichatpong Werasethakul film is t...,Apichatpong Werasethakul's 'Cemetery of Splend...,0


In [8]:
# Row 2 has no short description, so the combined text is just the headline plus the separator space.
df.loc[2, "headline&description"]

'Martin Scorsese To Direct Movie About Jesuit Missionaries '

In [9]:
# Lookup table from integer class id to category name.
# Built from the same categorical encoding that produces `encoded_cat`, so ids and names always line up.
# Enumerating `cat.categories` directly (instead of indexing it len(unique()) times) avoids
# rebuilding the categorical on every iteration and avoids an IndexError when `category`
# contains missing values: unique() counts NaN, the categories index does not.
mapping = dict(enumerate(df['category'].astype('category').cat.categories))

In [10]:
# Display the class-id -> category-name lookup built in the previous cell.
mapping

{0: 'Arts, Culture and Religion',
 1: 'Business and money',
 2: 'CRIME',
 3: 'ENTERTAINMENT',
 4: 'ENVIRONMENT',
 5: 'Education, science and Tech',
 6: 'Food',
 7: 'POLITICS',
 8: 'SPORTS',
 9: 'STYLE & BEAUTY',
 10: 'TRAVEL',
 11: 'Voices',
 12: 'Wellness and Healthy Living',
 13: 'World news'}

In [11]:
# Plain Python lists of input texts and integer labels, in matching row order.
data_texts = df["headline&description"].tolist()
data_labels = df["encoded_cat"].tolist()

In [12]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all examples as the validation set.
first_split = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=0,
)
train_texts, val_texts, train_labels, val_labels = first_split

# Step 2: carve a further 20% out of the remaining training examples as a test set
# kept aside for inference; train_texts / train_labels are rebound to the reduced set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Same tokenization settings for both splits: truncation and padding enabled.
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)


In [14]:
# Wrap each split as a tf.data.Dataset of (features dict, label) pairs.
train_features = dict(train_encodings)
val_features = dict(val_encodings)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))


In [15]:
# Size the classification head from the label mapping instead of a hard-coded 14,
# so the head stays in sync with the categories actually present in the data.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(mapping),
)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Labels are integer class ids and the model emits raw logits, so use sparse categorical
# cross-entropy with from_logits=True. This replaces `loss=model.compute_loss`, which depends
# on a transformers helper that later releases renamed (hf_compute_loss); with an unpinned
# install the old spelling can resolve to a different method.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [18]:
# Batch size is applied on the datasets themselves; Keras expects `batch_size` to be left
# unset in fit() when the input is a tf.data.Dataset.
BATCH_SIZE = 32
# Dataset.shuffle's first positional argument is the shuffle *buffer size*, not a seed.
# Use a buffer large enough to mix examples well and pass the seed explicitly.
SHUFFLE_BUFFER = 10_000

model.fit(
    train_dataset.shuffle(SHUFFLE_BUFFER, seed=0).batch(BATCH_SIZE),
    epochs=3,
    # Validation metrics do not depend on example order, so the validation set is not shuffled.
    validation_data=val_dataset.batch(BATCH_SIZE),
)

Epoch 1/3

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f22b11510f0>

In [20]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
# NOTE(review): the class-id -> category-name mapping is not written here — confirm whether
# inference code needs it stored alongside the model.
save_directory = "./saved_models" 

model.save_pretrained(save_directory)
# Last expression of the cell: displays the file paths returned by the tokenizer save.
tokenizer.save_pretrained(save_directory)

('./saved_models/tokenizer_config.json',
 './saved_models/special_tokens_map.json',
 './saved_models/vocab.txt',
 './saved_models/added_tokens.json')