In [80]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm
import argparse
import os
import tensorflow as tf
import pandas as pd

In [81]:
df = pd.read_csv('newsCorpora.csv', sep ='\t', names = ['ID','Title','URL', 'Publisher', 'Category', 'Story', 'Hostname','Timestamp'])
df=df[['Title','Category']]
dict = {'b':'Business', 't':'Science', 'e':'Entertainment', 'm':'Health'}

In [82]:
def update_category(x):
    return dict[x]
df['Category'] = df['Category'].apply(lambda x: update_category(x))

In [83]:
df

Unnamed: 0,Title,Category
0,"Fed official says weak data caused by weather,...",Business
1,Fed's Charles Plosser sees high bar for change...,Business
2,US open: Stocks fall after Fed official hints ...,Business
3,"Fed risks falling 'behind the curve', Charles ...",Business
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,Business
...,...,...
422414,Surgeons to remove 4-year-old's rib to rebuild...,Health
422415,Boy to have surgery on esophagus after battery...,Health
422416,Child who swallowed battery to have reconstruc...,Health
422417,Phoenix boy undergoes surgery to repair throat...,Health


In [84]:
df = df.sample(frac=0.10,random_state=1)
df = df.reset_index(drop=True)

In [86]:
encode_dict = {}

def encode_category(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

In [87]:
df['Encode Category'] = df['Category'].apply(lambda x:encode_category(x))

In [89]:
df['Encode Category'].value_counts()

1    15275
0    11438
2    10963
3     4566
Name: Encode Category, dtype: int64

In [90]:
print(encode_dict)

{'Business': 0, 'Entertainment': 1, 'Science': 2, 'Health': 3}


In [91]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(df['Title'],df['Encode Category'],stratify=df['Encode Category'])

In [92]:
Y_train.value_counts()

1    11456
0     8578
2     8222
3     3425
Name: Encode Category, dtype: int64

In [93]:
Y_test.value_counts()

1    3819
0    2860
2    2741
3    1141
Name: Encode Category, dtype: int64

In [94]:
import tensorflow as tf
print(tf.__version__)

2.13.0


In [95]:
X_train[20]

'New York second in nation for chikungunya'

In [96]:
cnt_=[]
for i in X_train:
    cnt_.append(len(i.split()))
print(X_train[max(cnt_)],max(cnt_))    
    

New York second in nation for chikungunya 20


In [97]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define tokenizer (limiting vocab size if needed)
MAX_VOCAB_SIZE = 20000  # Adjust based on dataset size
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")

In [98]:
# Fit tokenizer on the text data
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [99]:
# Determine max sequence length
MAX_SEQ_LENGTH = max(len(seq) for seq in X_train_seq)  # or set a fixed value

In [100]:
# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')

In [101]:
from tensorflow.keras.utils import to_categorical

# One-hot encode labels
Y_train_cat = to_categorical(Y_train, num_classes=4)
Y_test_cat = to_categorical(Y_test, num_classes=4)


In [102]:
X_train_pad

array([[ 896, 1978,  381, ...,    0,    0,    0],
       [  47,  635,  837, ...,    0,    0,    0],
       [  72,   13,  606, ...,    0,    0,    0],
       ...,
       [ 648, 3226,  964, ...,    0,    0,    0],
       [  32,   41,  549, ...,    0,    0,    0],
       [ 164,  854, 1100, ...,    0,    0,    0]])

In [103]:
Y_train_cat

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [104]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_dim = 100  # Adjust based on complexity

model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=embedding_dim, input_length=MAX_SEQ_LENGTH),
    LSTM(64),
    Dense(4, activation='softmax')  # 4 output classes
])


In [105]:
model.compile(
    loss='categorical_crossentropy',  # Entropy-based loss function
    optimizer='adam',
    metrics=['accuracy']
)


In [106]:
# Train the model
history = model.fit(
    X_train_pad, Y_train_cat,  # Tokenized and padded input & one-hot labels
    validation_data=(X_test_pad, Y_test_cat),  # Validation set
    epochs=10,  # Number of iterations (adjustable)
    batch_size=32,  # Number of samples per batch
    verbose=1  # Display training progress
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [107]:
loss, accuracy = model.evaluate(X_test_pad, Y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.8942


In [108]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to preprocess and predict
def predict_class(sentence, tokenizer, model, max_length):
    # Tokenize the input sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    
    # Pad the sequence to match training input length
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    
    # Get prediction probabilities
    prediction = model.predict(padded_sequence)
    
    # Get class with highest probability
    predicted_class = np.argmax(prediction)
    
    return predicted_class, sequence


In [114]:
sentence = input("Type your sentence within word count 19")
predict_class(sentence,tokenizer,model,19)



(0, [[699, 1101, 36, 1]])

In [113]:
encode_dict.keys()

dict_keys(['Business', 'Entertainment', 'Science', 'Health'])

In [10]:
encode_dict = {'Business': 0, 'Entertainment': 1, 'Science': 2, 'Health': 3}
output = encode_dict['Business']
output

0