In [2]:
import pandas as pd
import numpy as np
import joblib
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/GEN_AI-Intern/Text-Generation/bbc_news_mixed.csv')

In [4]:
vocab_size = 10000
max_length = 200

In [5]:
data.head()

Unnamed: 0,text,label
0,Cairn shares slump on oil setback\n\nShares in...,business
1,Egypt to sell off state-owned bank\n\nThe Egyp...,business
2,Cairn shares up on new oil find\n\nShares in C...,business
3,Low-cost airlines hit Eurotunnel\n\nChannel Tu...,business
4,"Parmalat to return to stockmarket\n\nParmalat,...",business


In [6]:
data['label'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: label, dtype: int64

Tokenizing the text data

In [7]:
token = Tokenizer(num_words = 30000)

In [8]:
token.fit_on_texts(data['text'])

In [9]:
token.word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'a': 5,
 'in': 6,
 'for': 7,
 'is': 8,
 'that': 9,
 'on': 10,
 'said': 11,
 'it': 12,
 'was': 13,
 'be': 14,
 'he': 15,
 'with': 16,
 'as': 17,
 'has': 18,
 'have': 19,
 'at': 20,
 'by': 21,
 'will': 22,
 'but': 23,
 'are': 24,
 'from': 25,
 'not': 26,
 'i': 27,
 'his': 28,
 'mr': 29,
 'they': 30,
 'this': 31,
 'an': 32,
 'we': 33,
 'which': 34,
 'had': 35,
 'would': 36,
 'been': 37,
 'their': 38,
 'more': 39,
 'its': 40,
 'up': 41,
 'were': 42,
 'also': 43,
 'year': 44,
 'who': 45,
 'new': 46,
 'people': 47,
 'us': 48,
 'one': 49,
 'about': 50,
 'there': 51,
 'out': 52,
 'after': 53,
 'or': 54,
 'than': 55,
 'all': 56,
 'can': 57,
 'if': 58,
 'could': 59,
 'over': 60,
 'you': 61,
 'last': 62,
 'first': 63,
 'when': 64,
 'time': 65,
 'two': 66,
 'now': 67,
 'so': 68,
 'other': 69,
 'into': 70,
 'some': 71,
 'what': 72,
 'she': 73,
 'government': 74,
 'world': 75,
 'uk': 76,
 'years': 77,
 'no': 78,
 'them': 79,
 'against': 80,
 'best': 81,
 'o

In [10]:
seq = token.texts_to_sequences(data['text'])

In [11]:
seq

Output hidden; open in https://colab.research.google.com to view.

In [12]:
token.word_counts

OrderedDict([('cairn', 20),
             ('shares', 276),
             ('slump', 16),
             ('on', 7622),
             ('oil', 306),
             ('setback', 12),
             ('in', 17721),
             ('energy', 126),
             ('a', 18311),
             ('uk', 993),
             ('firm', 557),
             ('have', 4769),
             ('closed', 86),
             ('down', 640),
             ('18', 209),
             ('after', 1680),
             ('disappointing', 56),
             ('drilling', 7),
             ('update', 24),
             ('and', 18611),
             ('over', 1478),
             ('possible', 210),
             ('tax', 430),
             ('demands', 48),
             ('the', 52631),
             ('company', 619),
             ('said', 7254),
             ('tests', 77),
             ('had', 2579),
             ('shown', 122),
             ('no', 978),
             ('significant', 104),
             ('finds', 19),
             ('one', 1861),
             ('o

Apply padding

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_seq = pad_sequences(seq, maxlen=300, padding='post')

In [14]:
pad_seq = pad_seq.astype('float64')

In [15]:
pad_seq.shape

(2225, 300)

Data Preprocessing

In [16]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
y = encoder.fit_transform(data[['label']])

In [17]:
y = y.toarray()

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(pad_seq, y, test_size=0.2)

Creating the model

In [19]:
model = Sequential()
model.add(Embedding(input_dim = 30000, output_dim = 100, input_length = 300))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax', ))

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          3000000   
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 5)                 505       
                                                                 
Total params: 3080905 (11.75 MB)
Trainable params: 3080905 (11.75 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [22]:
model.fit(x_train, y_train, epochs = 1) # epochs = 100



<keras.src.callbacks.History at 0x7aa5b76426e0>

In [23]:
y_pred = model.predict(x_test)



In [24]:
print(y_pred)

[[0.25373295 0.26554435 0.08610506 0.35293648 0.04168121]
 [0.24387686 0.25649777 0.10258491 0.3408961  0.05614428]
 [0.24158008 0.25849825 0.09767674 0.35040292 0.051842  ]
 ...
 [0.24546018 0.26000807 0.10270435 0.33602262 0.05580484]
 [0.25373307 0.2655438  0.08610579 0.35293543 0.04168195]
 [0.25373486 0.26553264 0.0861211  0.35291424 0.04169711]]


In [25]:
model.save('tcwelstm.h5')

  saving_api.save_model(


Building the test pipeline

In [26]:
"""tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
padded = pad_sequences(sequences, mexlen = max_length, padding = 'post', trucating = 'post')"""

"tokenizer = Tokenizer(num_words = vocab_size)\ntokenizer.fit_on_texts(data['text'])\nsequences = tokenizer.texts_to_sequences(data['text'])\npadded = pad_sequences(sequences, mexlen = max_length, padding = 'post', trucating = 'post')"

In [27]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
import joblib

In [29]:
joblib.dump(token, 'token.pkl')

['token.pkl']

In [30]:
config={}

In [31]:
config['vocabl_size']=vocab_size

In [32]:
config['pad_length']=max_length

In [33]:
file=open('config.txt','w')

In [34]:
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [35]:
file.write(str(config))

41

Creating classificaiton_report to evaluate the model

In [36]:
y_pred

array([[0.25373295, 0.26554435, 0.08610506, 0.35293648, 0.04168121],
       [0.24387686, 0.25649777, 0.10258491, 0.3408961 , 0.05614428],
       [0.24158008, 0.25849825, 0.09767674, 0.35040292, 0.051842  ],
       ...,
       [0.24546018, 0.26000807, 0.10270435, 0.33602262, 0.05580484],
       [0.25373307, 0.2655438 , 0.08610579, 0.35293543, 0.04168195],
       [0.25373486, 0.26553264, 0.0861211 , 0.35291424, 0.04169711]],
      dtype=float32)

In [37]:
out_max = np.zeros_like(y_pred)
for i in range(y_pred.shape[0]):
  out = np.argmax(y_pred[i])
  out_max[i][out] = 1

out_max

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [39]:
pred_result = encoder.inverse_transform(out_max)

In [40]:
out_max1 = np.zeros_like(y_test)
for i in range(y_test.shape[0]):
  out = np.argmax(y_test[i])
  out_max1[i][out] = 1

out_max1

array([[0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

In [41]:
actual_result = encoder.inverse_transform(out_max1)

In [44]:
from sklearn.metrics import classification_report
print(classification_report(actual_result, pred_result))

               precision    recall  f1-score   support

     business       0.00      0.00      0.00       102
entertainment       0.00      0.00      0.00        77
     politics       0.42      0.29      0.34        90
        sport       0.25      1.00      0.39        94
         tech       0.00      0.00      0.00        82

     accuracy                           0.27       445
    macro avg       0.13      0.26      0.15       445
 weighted avg       0.14      0.27      0.15       445



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
