In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; a later cell
# changes the working directory to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 28.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from tensorflow.keras import utils as np_utils
from keras.utils.np_utils import to_categorical

import transformers
from transformers import AutoTokenizer,TFDistilBertModel, DistilBertConfig
from transformers import TFAutoModel

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Report the TensorFlow and Keras versions in use, one per line.
for framework in (tf, keras):
    print(framework.__version__)
     

2.9.2
2.9.0


In [5]:
# Switch the working directory to the project folder on the mounted Drive so
# that the relative paths used later ('model.h5', 'data/BBC News Test.csv')
# resolve inside it.
# NOTE(review): this is an absolute, user-specific path — adjust it when the
# notebook is run from another account or machine.
import os 
os.chdir('/content/drive/MyDrive/FSDS/Live Class Materials/Deep Learning/NLP/BBC News Sample Solution')

In [6]:
# Load the saved Keras model from 'model.h5' in the current working directory.
# custom_objects maps the name 'TFDistilBertModel' to the transformers class;
# presumably the saved model contains that layer and cannot be deserialized
# without the mapping — TODO confirm.
model = tf.keras.models.load_model('model.h5', custom_objects={'TFDistilBertModel': TFDistilBertModel})

In [12]:
# Create the tokenizer from the pretrained 'distilbert-base-uncased' checkpoint.
# It is used as a module-level global by the prediction helpers below.
# NOTE(review): assumed to be the same tokenizer the loaded model was trained
# with — confirm against the training notebook.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def text_encode(text, tokenizer, max_len=100):
    """Tokenize every text individually and stack the results into model inputs.

    Args:
        text: Iterable of strings (for example a pandas Series). Each element
            is passed to ``tokenizer`` on its own.
        tokenizer: Callable invoked as ``tokenizer(x, return_tensors='tf',
            truncation=True, padding='max_length', max_length=max_len,
            add_special_tokens=True)``. It must return a mapping that has
            ``'input_ids'`` and ``'attention_mask'`` entries convertible to
            NumPy arrays.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A two-element list ``[input_ids, attention_mask]`` of NumPy arrays with
        one row per input text. The leading (batch) axis is always kept, so a
        single text yields shape ``(1, L)`` instead of ``(L,)``, and an empty
        input yields two arrays of shape ``(0, max_len)``.
    """
    encodings = [
        tokenizer(x, return_tensors='tf',
                  truncation=True,
                  padding='max_length',
                  max_length=max_len,
                  add_special_tokens=True)
        for x in text
    ]

    if not encodings:
        # Nothing to stack: return correctly shaped, empty integer batches.
        return [np.empty((0, max_len), dtype=np.int32),
                np.empty((0, max_len), dtype=np.int32)]

    # Flatten each per-text encoding to 1-D and stack the rows explicitly.
    # A bare np.squeeze would remove *every* length-1 axis, which also drops
    # the batch axis whenever exactly one text is encoded.
    input_ids = np.stack(
        [np.asarray(enc['input_ids']).reshape(-1) for enc in encodings])
    attention_mask = np.stack(
        [np.asarray(enc['attention_mask']).reshape(-1) for enc in encodings])

    return [input_ids, attention_mask]

In [20]:
def single_category_predict(text):
  """Predict the news category of a single piece of text.

  Uses the module-level ``tokenizer`` and ``model`` together with
  ``text_encode`` (sequence length 100).

  Args:
      text: The article text to classify.

  Returns:
      One of 'business', 'entertainment', 'politics', 'sport' or 'tech' —
      the label whose score is highest in the first row of the model output.

  Raises:
      ValueError: If the model output does not have one score per category.
  """
  # NOTE(review): the label order is assumed to match the class order the
  # model was trained with — confirm against the training notebook.
  column_values = ['business', 'entertainment', 'politics', 'sport', 'tech']

  text_series = pd.Series([text], name='Text')
  input_ids, attention_mask = text_encode(text_series, tokenizer, max_len=100)

  # Guarantee a leading batch axis of size 1, regardless of whether the
  # encoder handed back 1-D or 2-D arrays for this single text.
  input_ids = np.atleast_2d(input_ids)
  attention_mask = np.atleast_2d(attention_mask)

  scores = np.asarray(model.predict([input_ids, attention_mask]))
  if scores.ndim != 2 or scores.shape[1] != len(column_values):
    raise ValueError(
        'expected model output of shape (n, %d), got %r'
        % (len(column_values), scores.shape))

  # Only the class scores take part in the arg-max; no helper columns (such as
  # a reset index) can be picked by accident. Ties resolve to the first label.
  return column_values[int(np.argmax(scores[0]))]

In [8]:
# Sample 1: a news excerpt about a television show host, used below as input
# to single_category_predict.
text1 = "Veteran actor Amitabh Bachchan took to his personal blog and posted a couple of pictures as he wrapped up the shoot of Kaun Banega Crorepati season 14. Amitabh, who hoped to return the next season, recalled a moment from his film Deewar. He wrote, “The last day of the show and the greetings from them that work so hard to make KBC what it is .a farewell or a bye to be back hopefully next year again .. and the return gift from me .. a moment of DEEWAR .. and the emotions.” The first season of KBC aired on July 3, 2000 and it was Amitabh’s first time as a television host"

In [13]:
# Classify sample 1; the recorded output of this cell is 'entertainment'.
single_category_predict(text1)





'entertainment'

In [14]:
# Sample 2: a news excerpt about a smartphone launch, used below as input to
# single_category_predict.
text2 = 'Realme 10s, the next smartphone in the Realme 10 series, was launched in China on Friday. The Chinese manufacturer had teased a December 16 launch for its budget handset earlier this month. The Realme 10s is powered by a MediaTek Dimensity 810 SoC. The device features a 6.6-inch FHD+ IPS LCD display panel, with a 90Hz refresh rate. The Realme 10s will run Android-12 based Realme UI 3.0 out-of-the-box. The smartphone is also available in Streamer Blue and Crystal Black colour variants. The latest entrant in the Realme 10 series comes after the company had already launched the Realme 10 Pro Plus 5G, Realme 10 Pro 5G, Realme 10 5G, and the Realme 10 4G in various markets. Only the Realme 10 Pro series is available in India at the moment.'

In [15]:
# Classify sample 2; the recorded output of this cell is 'tech'.
single_category_predict(text2)



'tech'

In [16]:
# Sample 3: a news excerpt about English cricket, used below as input to
# single_category_predict.
text3 = 'In the wake of a contentious legal battle within English cricket, after Azeem Rafiq’s claims of institutional racism during his time as a Yorkshire cricketer which included allegations against the likes of Michael Vaughan, England’s test captain Ben Stokes says the inclusion of 18-year-old wrist spinner Rehan Ahmed in their final test in Pakistan could send a positive message to young British-Asian cricketers. “I have always felt cricket is a very inclusive sport,” he told The Guardian. “Rehan, he could be an unbelievable example to set for younger kids who want to come up. They may have maybe heard about what’s happened in cricket recently, (but) he can be seen as ‘we’ve got this 18-year-old, hopefully a potential superstar, why can’t I be that?’” Stokes said that the possibility of calling up Ahmed is only on the basis of him earning his place in the team, and speaks to England’s strength in depth in having a solid wrist-spinner in their ranks for South Asian conditions. “I don’t think it would be a case of giving caps away. We picked Rehan in the squad not just because of his talent and to integrate him, but because it would be a good opportunity to play him if we thought it was necessary,” he said.'

In [17]:
# Classify sample 3; the recorded output of this cell is 'sport'.
single_category_predict(text3)



'sport'

In [18]:
# Sample 4: a news excerpt about an Indian political party, used below as
# input to single_category_predict.
text4 = '"With the inauguration of the office in the national capital, the BRS has started its journey as national political party under KCR\'s leadership and will expand its footprint across the country," BRS MP Ranjith Reddy said. Rao had founded the Telangana Rashtra Samithi in April 2001 with a single-point agenda of creating a separate Telangana state with Hyderabad as its capital. His party stormed to power in 2014 and Rao became the first chief minister of Telangana. Twenty-one years after its formation, the TRS has officially transformed into the Bharat Rashtra Samithi (BRS). Rao has been nursing national ambitions since 2018 to provide an alternative to both the BJP and the Congress. He has been meeting several regional leaders and chief ministers, including his Bihar counterpart and JD(U) leader Nitish Kumar, Tamil Nadu Chief Minister and DMK supremo MK Stalin to forge a united front of opposition parties.'

In [19]:
# Classify sample 4; the recorded output of this cell is 'politics'.
single_category_predict(text4)



'politics'

In [24]:
def File_category_predict(filepath):
  """Predict a news category for every row of a CSV file.

  The CSV is read with its first column as the index and must contain a
  'Text' column. Uses the module-level ``tokenizer`` and ``model`` together
  with ``text_encode`` (sequence length 100).

  Args:
      filepath: Path of the CSV file to classify.

  Returns:
      A DataFrame with a fresh 0..n-1 index and three columns, in order: the
      former index column (named 'ArticleId' when the CSV's first column has
      that header), 'Text', and 'Category' holding the label with the highest
      score for the row.

  Raises:
      ValueError: If the model output does not have one row per text and one
          score per category.
  """
  # NOTE(review): the label order is assumed to match the class order the
  # model was trained with — confirm against the training notebook.
  column_values = ['business', 'entertainment', 'politics', 'sport', 'tech']

  df = pd.read_csv(filepath, header=0, index_col=0)

  # Turn the index back into a regular column next to the text; this is the
  # frame the predicted labels are attached to.
  result = df[['Text']].reset_index()
  if result.empty:
    # No rows to classify: skip the model call and return the empty table.
    result['Category'] = pd.Series(dtype=object)
    return result

  input_ids, attention_mask = text_encode(df['Text'], tokenizer, max_len=100)
  # Guarantee a leading batch axis even for a one-row file, regardless of
  # whether the encoder handed back 1-D or 2-D arrays.
  input_ids = np.atleast_2d(input_ids)
  attention_mask = np.atleast_2d(attention_mask)

  scores = np.asarray(model.predict([input_ids, attention_mask]))
  if scores.shape != (len(result), len(column_values)):
    raise ValueError(
        'expected model output of shape %r, got %r'
        % ((len(result), len(column_values)), scores.shape))

  # Row-wise arg-max over the class scores only; ties resolve to the first
  # label in column_values.
  result['Category'] = [column_values[i] for i in scores.argmax(axis=1)]
  return result

In [21]:
# CSV file to classify. The path is relative, so it is resolved against the
# current working directory set earlier with os.chdir.
file = 'data/BBC News Test.csv'

In [25]:
# Classify every row of the test CSV and display the resulting table
# (ArticleId, Text, Category).
File_category_predict(file)



Unnamed: 0,ArticleId,Text,Category
0,1018,qpr keeper day heads for preston queens park r...,sport
1,1319,software watching while you work software that...,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,sport
3,459,india s reliance family feud heats up the ongo...,business
4,1020,boro suffer morrison injury blow middlesbrough...,sport
...,...,...,...
730,1923,eu to probe alitalia state aid the european ...,business
731,373,u2 to play at grammy awards show irish rock ba...,entertainment
732,1704,sport betting rules in spotlight a group of mp...,politics
733,206,alfa romeos to get gm engines fiat is to sto...,business
