In [15]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
df=pd.read_csv("test.csv")
df['text']=df['Title'].astype(str)+" "+df['Description'].astype(str)
df=df[['Class Index','text']]
df.head()

Unnamed: 0,Class Index,text
0,3,Fears for T N pension after talks Unions repre...
1,4,The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP) ...
3,4,Prediction Unit Helps Forecast Wildfires (AP) ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP) AP...


In [16]:
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tensorflow.keras.preprocessing.text import text_to_word_sequence
stop_words=ENGLISH_STOP_WORDS
def prep(text):
    text=text.lower()
    text=re.sub(r'http\S+|www\S+','',text)
    text=re.sub(r'<.*?>','',text)
    text=re.sub(r'\d+','',text)
    text=text.translate(str.maketrans('','',string.punctuation))
    words=text_to_word_sequence(text)
    words=[w for w in words if w not in stop_words]
    return " ".join(words)
df['prep_text']=df['text'].apply(prep)
df.head()

Unnamed: 0,Class Index,text,prep_text
0,3,Fears for T N pension after talks Unions repre...,fears t n pension talks unions representing wo...
1,4,The Race is On: Second Private Team Sets Launc...,race second private team sets launch date huma...
2,4,Ky. Company Wins Grant to Study Peptides (AP) ...,ky company wins grant study peptides ap ap com...
3,4,Prediction Unit Helps Forecast Wildfires (AP) ...,prediction unit helps forecast wildfires ap ap...
4,4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,calif aims limit farmrelated smog ap ap southe...


In [23]:
le=LabelEncoder()
df['label']=le.fit_transform(df['Class Index'])
df[['Class Index','label']].head()
X=df['prep_text']
y=df['label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=11,stratify=y)

### **The code tokenizes the preprocessed text and pads the sequences to a uniform length for neural network input.**

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer(num_words=15000)
tokenizer.fit_on_texts(df['prep_text'])
X_train_seq= tokenizer.texts_to_sequences(X_train)
X_test_seq=tokenizer.texts_to_sequences(X_test)
X_train_pad=pad_sequences(X_train_seq,maxlen=150)
X_test_pad=pad_sequences(X_test_seq,maxlen=150)

### **The code creates and trains a GRU neural network model with an embedding layer for text classification.**


In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense,Dropout
model=Sequential()
model.add(Embedding(input_dim=15000,output_dim=64,input_length=100))
model.add(GRU(32))
model.add(Dropout(0.3))
model.add(Dense(4, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model.fit(X_train_pad,y_train,validation_split=0.1,epochs=5,batch_size=64)

Epoch 1/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step - accuracy: 0.4475 - loss: 1.3497 - val_accuracy: 0.5675 - val_loss: 1.2206
Epoch 2/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.6944 - loss: 0.8986 - val_accuracy: 0.7525 - val_loss: 0.7855
Epoch 3/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.8919 - loss: 0.3698 - val_accuracy: 0.8350 - val_loss: 0.4530
Epoch 4/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.9675 - loss: 0.1226 - val_accuracy: 0.8400 - val_loss: 0.4870
Epoch 5/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.9919 - loss: 0.0415 - val_accuracy: 0.8400 - val_loss: 0.5423


In [28]:
import numpy as np
test_text=[
    ["Serena Williams announces retirement after US Open final victory ceremony."],
    ["Diplomatic talks resume between North and South Korea at border village."],
    ["NBA playoffs: Golden State Warriors advance to finals after game seven win."],
    ["Tesla announces new factory in Texas creating over 10,000 manufacturing jobs."],
    ["New smartphone features foldable screen and advanced camera AI technology."],
    ["Bitcoin price volatility continues as cryptocurrency regulations tighten globally."]
    ]
for i,test in enumerate(test_text):
    seq=tokenizer.texts_to_sequences(test)
    padded=pad_sequences(seq,maxlen=150)
    pred=model.predict(padded)[0]
    pred_label=np.argmax(pred)
    print(f"\nModel {i+1} (LSTM)")
    print("Text:", test_text[i])
    print("Predicted news type label:",pred_label)
    print("Probabilities:",pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step

Model 1 (LSTM)
Text: ['Serena Williams announces retirement after US Open final victory ceremony.']
Predicted news type label: 2
Probabilities: [0.08759769 0.13308918 0.6177881  0.16152509]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Model 2 (LSTM)
Text: ['Diplomatic talks resume between North and South Korea at border village.']
Predicted news type label: 0
Probabilities: [9.9587601e-01 4.2323756e-04 2.8201432e-03 8.8059960e-04]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

Model 3 (LSTM)
Text: ['NBA playoffs: Golden State Warriors advance to finals after game seven win.']
Predicted news type label: 1
Probabilities: [0.02411603 0.9509692  0.0020324  0.0228824 ]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

Model 4 (LSTM)
Text: ['Tesla announces new factory in Texas creating over 10,000 manufacturing jobs.']
Predicted news type labe