<a href="https://colab.research.google.com/github/Shanto1952/AI/blob/main/News_Detect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix


# --- Load -------------------------------------------------------------------
# NOTE(review): absolute Colab path; the CSV must be uploaded to /content
# before this cell is run.
df = pd.read_csv('/content/News_Dataset.csv', sep=',')

# --- Build the model input --------------------------------------------------
# Headline and body are joined into one document per row. A missing Title or
# Description would turn the whole concatenation into NaN, and CountVectorizer
# refuses NaN documents, so missing parts are treated as empty strings.
df['text'] = df['Title'].fillna('') + " " + df['Description'].fillna('')

# --- Encode the target ------------------------------------------------------
# Category name -> integer label, numbered in order of first appearance.
category_mapping = {category: idx for idx, category in enumerate(df['Category'].unique())}
df['category_label'] = df['Category'].map(category_mapping)

print("\nCategory Mapping:")
print(category_mapping)

print("\nProcessed Dataset Head:")
print(df[['Category', 'category_label', 'text']].head())

# --- Train / test split -----------------------------------------------------
# Stratified so each category keeps its proportion in both splits; the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category_label'], random_state=1, stratify=df['category_label']
)

print("\nDataset Split:")
print(f"Original dataset contains {df.shape[0]} rows")
print(f"Training dataset contains {X_train.shape[0]} rows")
print(f"Testing dataset contains {X_test.shape[0]} rows")

# --- Bag-of-words features --------------------------------------------------
count_vector = CountVectorizer(stop_words='english', max_features=5000)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it, so no test-set vocabulary leaks into training.
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

print("\nVectorized Data:")
print(f"Training data shape: {training_data.shape}")
print(f"Testing data shape: {testing_data.shape}")

# --- Model ------------------------------------------------------------------
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

predictions = naive_bayes.predict(testing_data)

# --- Evaluation -------------------------------------------------------------
print("\nModel Evaluation:")
print(f"Accuracy score: {accuracy_score(y_test, predictions):.2f}")
# Passing the labels explicitly keeps one row/column per known category, in
# category_mapping order, even if a category is absent from the test split.
print(
    "Confusion matrix:\n",
    confusion_matrix(y_test, predictions, labels=list(category_mapping.values())),
)

def classify_text(text):
    """Return the category name the trained model predicts for ``text``.

    The text is vectorized with the fitted ``count_vector``, classified by
    ``naive_bayes``, and the resulting integer label is translated back to its
    category name through ``category_mapping``.
    """
    features = count_vector.transform([text])
    predicted_label = naive_bayes.predict(features)[0]

    # category_mapping goes name -> label; walk it in reverse to get the name.
    names = list(category_mapping)
    labels = list(category_mapping.values())
    return names[labels.index(predicted_label)]

# Smoke-test the classifier on a handful of unseen headlines.
print("\nExample Predictions:")

news = [
    "Bangladesh Calls for Fair Climate Finance at COP29",
    "UN Report Highlights Bangladesh's Vulnerability to Rising Methane Emissions",
    "Argentina-Brazil are entering the field in a different match in World Cup qualifiers",
    "Failed HSC candidates lock Mymensingh Education Board gates with officials inside",
    "Global Climate Initiatives Must Address Bangladesh’s Flood Risks",
    "Bangladesh Football Makes Strides Toward International Success",
    "New Policies to Address Corruption Announced by the Government",
    "BD Sports Weekly: Highlights and Upcoming Matches",
    "Bangladesh Prepares for Regional Leadership Role in Climate Adaptation",
    "Debates Continue Over Bangladesh's Support for UN Peacekeeping Missions",
]

# One numbered line per headline, counting from 1.
for position, headline in enumerate(news, start=1):
    predicted_category = classify_text(headline)
    print(f"{position}: Category: {predicted_category}")




Category Mapping:
{'the game': 0, 'political': 1, 'education': 2}

Processed Dataset Head:
    Category  category_label  \
0   the game               0   
1   the game               0   
2  political               1   
3   the game               0   
4  political               1   

                                                text  
0  Pakistan's star all-rounder will score 22 yard...  
1  Sports village will be in Dhaka with 55 federa...  
2  People will not accept inefficiency in governm...  
3  Sohan wants to use the experience of global T2...  
4  'Bangladesh deprived of fair share of water of...  

Dataset Split:
Original dataset contains 89 rows
Training dataset contains 66 rows
Testing dataset contains 23 rows

Vectorized Data:
Training data shape: (66, 1374)
Testing data shape: (23, 1374)

Model Evaluation:
Accuracy score: 0.91
Confusion matrix:
 [[ 8  0  0]
 [ 2 11  0]
 [ 0  0  2]]

Example Predictions:
1: Category: political
2: Category: political
3: Category: the game
4