<a href="https://colab.research.google.com/github/Rajfekar/PythonML/blob/main/NewsCategoryPredict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK "punkt" package (imported tokenizer support).
nltk.download('punkt')

# Small hand-written corpus: one headline per news category.
data = [
    dict(title="Stock Market Hits Record High", category="business"),
    dict(title="Oscars 2024: Best Movies Announced", category="entertainment"),
    dict(title="COVID-19 Vaccine Shows 95% Effectiveness", category="health"),
    dict(title="NASA Launches New Space Telescope", category="science"),
    dict(title="Premier League: Manchester United Wins", category="sports"),
    dict(title="Tesla Unveils New AI-Driven Electric Car", category="technology"),
]

# Tabular form of the sample headlines (columns: title, category).
df = pd.DataFrame(data)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:

# Write the current frame to disk without the index column, then confirm.
df.to_csv(path_or_buf="news_data.csv", index=False)
print("✅ news_data.csv file saved!")


✅ news_data.csv file saved!


In [36]:

# Load the news dataset; malformed rows are skipped instead of raising.
df = (
    pd.read_csv("news.csv", encoding='latin-1', on_bad_lines='skip', sep=',')
    # Expose summary/subject under the column names the later cells use.
    .assign(
        title=lambda frame: frame['summary'],
        category=lambda frame: frame['subject'],
    )
    .drop(columns=['summary', 'subject'])
    # A row without text or without a label cannot be used for training.
    .dropna(subset=['title', 'category'])
)
df


Unnamed: 0,title,category
0,The news article discusses the milestone event...,Science and Technology
1,The Election Commission of India (ECI) has adj...,Polity and Governance
2,"\nAn acute drinking water crisis in Bengaluru,...",Environment and Ecology
3,"The State of the Climate report for 2023, publ...",Environment and Ecology
4,The gender pay gap is a persistent issue globa...,Economic and Social Development
...,...,...
2429,The Supreme Court of India has issued a ruling...,Polity and Governance
2430,The 45-day Maha Kumbh festival concluded on Fe...,Environment and Ecology
2431,"In January, around 120 SpaceX Starlink satelli...",Environment and Ecology
2432,The article discusses the concept of carbon in...,Environment and Ecology


In [37]:

# Derive the model input column: a lower-cased copy of the title text.
df['text'] = df.loc[:, 'title'].str.lower()




In [43]:
# Swap every newline for a space; the pattern is matched literally, not as a regex.
df['text'] = df['text'].str.replace(pat='\n', repl=' ', regex=False)

In [44]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    test_size=0.2,
    random_state=42,
)



In [45]:

# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Linear-kernel SVM; fit() hands back the fitted estimator itself.
model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predicted labels for the held-out split, used by the metrics further down.
y_pred = model.predict(X_test_tfidf)
def predict_category(news_title):
    """Predict the category label for a single piece of news text.

    The text is lower-cased, turned into TF-IDF features with the
    notebook-level ``vectorizer``, and classified by the notebook-level
    ``model``; both must already be fitted.

    Args:
        news_title: The news text to classify (a string).

    Returns:
        The first (and only) label returned by ``model.predict``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([news_title.lower()])
    return model.predict(features)[0]


In [46]:
# Display the held-out texts produced by the train/test split.
X_test

Unnamed: 0,text
438,kenyan president william ruto announced his r...
2121,recent discussions within the ai community sug...
611,the department of promotion of industry and in...
1104,the indian space research organisation (isro) ...
2111,china has made significant strides in its lith...
...,...
1561,the air quality index (aqi) in new delhi reach...
742,the survey conducted by kearney and amazon pa...
184,india's jan aushadhi scheme: nepal seeks part...
1394,the union cabinet of india approved the develo...


In [47]:
# Display the held-out category labels produced by the train/test split.
y_test

Unnamed: 0,category
438,Polity and Governance
2121,Science and Technology
611,Economic and Social Development
1104,Science and Technology
2111,Economic and Social Development
...,...
1561,Environment and Ecology
742,Economic and Social Development
184,Economic and Social Development
1394,Polity and Governance


In [50]:


# Try the helper on one held-out row, then report metrics for the whole test split.
new_title = X_test.loc[742]  # label-based lookup: the row whose index label is 742
predicted_category = predict_category(new_title)

print(f"🔹 Predicted Category: {predicted_category}")
print("\n🔹 Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))

🔹 Predicted Category: Economic and Social Development

🔹 Accuracy: 0.8583162217659137

🔹 Classification Report:
                                  precision    recall  f1-score   support

Economic and Social Development       0.80      0.90      0.84       147
        Environment and Ecology       0.85      0.83      0.84        96
          Polity and Governance       0.90      0.82      0.86       149
         Science and Technology       0.91      0.88      0.90        95

                       accuracy                           0.86       487
                      macro avg       0.87      0.86      0.86       487
                   weighted avg       0.86      0.86      0.86       487



In [26]:


# NOTE(review): this cell's execution count (26) is lower than that of the
# cells before it, and its saved output shows index labels such as 17484 —
# presumably left over from a run on a different dataset; re-run or remove.
X_test

Unnamed: 0,text
17484,"instagram, facebook users to get more choices ..."
16904,eu begins to hash out eu ai act details
7256,lokpal orders cbi probe against mahua moitra
4056,doctors' strike: health ministry unlikely to s...
7841,hasina-modi talks today: defence to connectivi...
...,...
9587,how horses galloped into human history: what a...
8249,delhi hc asks govt to pay shaurya chakra award...
9424,what is the new flirt variant of the covid vir...
3326,lok sabha election 2024 phase 3 voting live up...


In [27]:
# Summarise the label Series (entry count, dtype, memory usage).
# NOTE(review): the saved output reports 3923 entries — confirm it belongs to
# the current split before relying on it.
y_test.info()

<class 'pandas.core.series.Series'>
Index: 3923 entries, 17484 to 7978
Series name: category
Non-Null Count  Dtype 
--------------  ----- 
3923 non-null   object
dtypes: object(1)
memory usage: 61.3+ KB


In [28]:
# Display the held-out labels.
# NOTE(review): the saved output lists labels such as "sci-tech" and "india" —
# presumably from an earlier dataset; re-run to refresh.
y_test

Unnamed: 0,category
17484,sci-tech
16904,sci-tech
7256,india
4056,india
7841,india
...,...
9587,explained
8249,india
9424,explained
3326,india
