<a href="https://colab.research.google.com/github/Ridzzz0Alam/Data_Science/blob/main/NLP_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training CSV from the current working directory (relative path).
df = pd.read_csv("train.csv")
# Quick sanity check of the (rows, columns) size before any processing.
print("Shape:", df.shape)
# Last expression of the cell: rich preview of the first rows.
df.head()


Shape: (120000, 3)


Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [3]:
# One document per row: title and description joined by a single space.
# Missing parts are replaced with "" so the concatenation never yields NaN.
df["text"] = (
    df["Title"].fillna("")
    + " "
    + df["Description"].fillna("")
)

# Shift the class index down by one so the labels start one lower
# (the preview below shows class index 3 becoming label 2).
df["label"] = df["Class Index"] - 1

print("Classes:", df["label"].nunique())
df[["text", "label"]].head()

Classes: 4


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [4]:
def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase the text, replace URL-like tokens (any run
    starting with ``http`` or ``www`` up to the next whitespace) with a space,
    replace every character that is not ``a``-``z`` or whitespace with a
    space, then collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : str or None
        Raw text. Any non-string value (``None``, the float NaN that marks a
        missing cell, ...) is treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing survives cleaning.
    """
    # Missing cells arrive as None/NaN, which have no ``.lower()``; map them
    # to the empty string so the function is safe on un-filled columns.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # URLs go first, while their punctuation is still intact.
    text = re.sub(r"http\S+|www\S+", " ", text)
    # Digits, punctuation and non-ASCII characters all become separators.
    text = re.sub(r"[^a-z\s]", " ", text)
    # The substitutions above leave runs of spaces behind; squeeze and trim.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run the cleaner element-wise over the combined text column.
df["text_clean"] = df["text"].map(clean_text)
df[["text_clean", "label"]].head()

Unnamed: 0,text_clean,label
0,wall st bears claw back into the black reuters...,2
1,carlyle looks toward commercial aerospace reut...,2
2,oil and economy cloud stocks outlook reuters r...,2
3,iraq halts oil exports from main southern pipe...,2
4,oil prices soar to all time record posing new ...,2


In [5]:
from sklearn.model_selection import train_test_split

X, y = df["text_clean"].values, df["label"].values

# First cut: hold out 20 % of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Second cut: halve the hold-out into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF, keeping terms seen in at least two documents and
# capping the vocabulary at 50,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=50000,
)

# Fit on the training split only; validation and test are merely transformed
# with the fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split) for split in (X_val, X_test)
)

print(X_train_tfidf.shape)

(96000, 50000)


In [7]:
from sklearn.decomposition import TruncatedSVD

# Project the sparse TF-IDF matrix onto 300 dense components.
svd = TruncatedSVD(n_components=300, random_state=42)

# As with the vectorizer, the projection is fitted on the training split only.
X_train_dense = svd.fit_transform(X_train_tfidf)
X_val_dense, X_test_dense = (
    svd.transform(split) for split in (X_val_tfidf, X_test_tfidf)
)

print(X_train_dense.shape)

(96000, 300)


In [8]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python's `random`, NumPy and TensorFlow in a single call.
# `tf.random.set_seed` alone seeds only TensorFlow, so any randomness Keras
# draws from the other two generators would differ from run to run.
tf.keras.utils.set_random_seed(42)

# Feed-forward classifier over the SVD features: two ReLU hidden layers, each
# followed by dropout, and a softmax output with one unit per distinct label
# found in the training split.
model = models.Sequential(
    [
        layers.Input(shape=(X_train_dense.shape[1],)),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(64, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(len(np.unique(y_train)), activation="softmax"),
    ]
)

# The sparse variant of the loss takes the integer labels as they are
# (no one-hot encoding step is needed).
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()


In [9]:
# Bind the returned History object: it carries the per-epoch training and
# validation metrics, and assigning it keeps the bare object repr (which
# embeds a run-specific memory address) out of the cell output.
history = model.fit(
    X_train_dense,
    y_train,
    validation_data=(X_val_dense, y_val),
    epochs=10,
    batch_size=256,
    verbose=1,
)

Epoch 1/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.6606 - loss: 0.9540 - val_accuracy: 0.8933 - val_loss: 0.3263
Epoch 2/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8816 - loss: 0.3589 - val_accuracy: 0.8949 - val_loss: 0.3080
Epoch 3/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8882 - loss: 0.3348 - val_accuracy: 0.8978 - val_loss: 0.2994
Epoch 4/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8925 - loss: 0.3190 - val_accuracy: 0.8993 - val_loss: 0.2935
Epoch 5/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8959 - loss: 0.3100 - val_accuracy: 0.9009 - val_loss: 0.2880
Epoch 6/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.8976 - loss: 0.3009 - val_accuracy: 0.9016 - val_loss: 0.2845
Epoch 7/10
[1m375/375[0

<keras.src.callbacks.history.History at 0x7dfd2aef4920>

In [10]:
# Score the held-out test split once; verbose=0 silences the progress bar.
loss, acc = model.evaluate(
    x=X_test_dense,
    y=y_test,
    verbose=0,
)
print("Test accuracy:", acc)

Test accuracy: 0.9043333530426025


In [11]:
from sklearn.metrics import classification_report

# The predicted class for each row is the index of its largest softmax output.
y_pred = model.predict(X_test_dense).argmax(axis=1)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))


[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.93      0.88      0.90      3000
           1       0.94      0.97      0.96      3000
           2       0.87      0.88      0.87      3000
           3       0.88      0.88      0.88      3000

    accuracy                           0.90     12000
   macro avg       0.90      0.90      0.90     12000
weighted avg       0.90      0.90      0.90     12000

