In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import numpy as np # Import numpy


df = pd.read_excel("/content/Corrected_27k_News_Categories.xlsx")

df['text'] = df['Story Excerpt'] + " " + df['Story Heading']
df = df[['text', 'Category']].dropna()
df['Category'] = df['Category'].apply(lambda x: [cat.strip() for cat in x.split(',')])
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['Category'])
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english',
    sublinear_tf=True
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = OneVsRestClassifier(LogisticRegression(max_iter=1000,class_weight="balanced"))
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)


def predict_categories(heading, excerpt, threshold=0.4):
    text = heading + ". " + excerpt
    vec = vectorizer.transform([text])
    proba = model.predict_proba(vec)[0]
    classes = mlb.classes_

    for i in range(len(classes)):
        if proba[i] >= threshold:
            print(f"{classes[i]}: {proba[i]:.2f}")

    return [classes[i] for i in range(len(classes)) if proba[i] >= threshold]


headline = "Imran’s sons meet Trump aide to kick off US campaign to free father"
excerpt = "Incarcerated PTI Founder Imran Khan’s sons met with United States President Donald Trump’s key aide Richard Grenell on Tuesday as they kicked off a campaign calling for their father’s release from prison."
categories = predict_categories(headline, excerpt,threshold=0.5)
print("Predicted Categories:", categories)



Politics: 0.94
Technology: 0.53
Predicted Categories: ['Politics', 'Technology']


In [13]:
import joblib

joblib.dump(model, "model1.pkl")
joblib.dump(vectorizer, "vectorizer1.pkl")
joblib.dump(mlb, "mlb1.pkl")


['mlb1.pkl']

In [14]:
from google.colab import files

files.download("model.pkl")
files.download("vectorizer.pkl")
files.download("mlb.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
from joblib import dump
dump(model, 'model.pkl')
dump(vectorizer, 'vectorizer.pkl')
dump(mlb, 'mlb.pkl')

from google.colab import files
files.download('model1.pkl')
files.download('vectorizer1.pkl')
files.download('mlb1.pkl')


FileNotFoundError: Cannot find file: model1.pkl