In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the document collection and pick out the two columns used below:
# the summary text (model input) and the topic label (prediction target).
data = pd.read_json("document_final.json")
descriptions, topics = data["summary"], data["topic"]

In [28]:
# Map the topic names to integer class ids. The fitted encoder is kept so
# that predicted ids can be converted back to topic names later on.
label_encoder = LabelEncoder()
topics_encoded = label_encoder.fit_transform(topics)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    descriptions, topics_encoded, **split_options
)

# TF-IDF features: the vocabulary (capped at 5000 terms, English stop words
# removed) is learned from the training text only, and the held-out text is
# then projected into that same feature space.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [29]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=500, random_state=42).fit(
    X_train_tfidf, y_train
)

# Predicted (integer-encoded) topics for the held-out split
y_pred = model.predict(X_test_tfidf)

In [30]:
# Per-topic precision / recall / F1 on the held-out split, with rows labelled
# by the original topic names rather than their integer ids.
report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)
print("Classification Report:", report, sep="\n")


Classification Report:
               precision    recall  f1-score   support

      economy       0.90      0.90      0.90      1192
    education       0.88      0.86      0.87      1227
entertainment       0.84      0.90      0.87      1255
  environment       0.92      0.88      0.90      1132
         food       0.90      0.88      0.89      1119
       health       0.94      0.92      0.93      1821
     politics       0.96      0.95      0.96      1290
       sports       0.95      0.96      0.96      1831
   technology       0.86      0.90      0.88      2145
       travel       0.95      0.94      0.94      2098

     accuracy                           0.91     15110
    macro avg       0.91      0.91      0.91     15110
 weighted avg       0.91      0.91      0.91     15110



In [37]:
# Classify a single ad-hoc query with the fitted pipeline.
#
# The query is vectorised with transform() on the already-fitted TF-IDF
# vectorizer (not fit_transform) so that it is expressed in the same feature
# space the model was trained on.
new_input = pd.Series(["university at buffalo"])
new_input_tfidf = tfidf.transform(new_input)

# Predict the integer-encoded topic for the query
predicted_topic = model.predict(new_input_tfidf)

# Decode the predicted id back to its topic name
predicted_topic_label = label_encoder.inverse_transform(predicted_topic)
print("Predicted Topic:", predicted_topic_label[0])


Predicted Topic: technology
