**Import necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Loading Data: Loads the dataset from the provided URL using Pandas.**

In [None]:
# Google Drive direct-download link for the news CSV; pandas fetches it over HTTP.
url = "https://drive.google.com/uc?id=15hV7eK6HX3kJCIW6uvgl32xyMmygzyEU"
# Raw frame; later cells add a 'clean_text' column to it.
data = pd.read_csv(url)

**Exploratory Data Analysis (EDA): Displays the first few rows of the dataset and the distribution of news articles across different categories.**


Display the first few rows of data.

In [None]:
# Bare last expression: the notebook renders the frame as a table instead of
# the wrapped plain-text dump that print() produces for wide columns.
data.head()

                                            Headline  \
0  Congress leader Baljinder Singh shot dead at h...   
1  17-year-old girl preparing for NEET dies by su...   
2  Hampers to welcome MPs in new Parliament tomor...   
3  Only 10% women lawmakers in RS, while only 14%...   
4  Ganesh temple decorated with notes, coins wort...   

                                             Content News Categories  \
0  Congress leader Baljinder Singh was shot dead ...    ['national']   
1  Another NEET aspirant died by suicide in Rajas...    ['national']   
2  In order to mark the first-ever working day of...    ['national']   
3  Congress President Mallikarjun Kharge, while s...    ['national']   
4  The Sri Sathya Ganapathi Temple in Bengaluru a...    ['national']   

         Date  
0  19-09-2023  
1  19-09-2023  
2  19-09-2023  
3  19-09-2023  
4  19-09-2023  


Display column names.

In [None]:
# Show the exact column labels that later cells index by name.
print(data.columns)

Index(['Headline', 'Content', 'News Categories', 'Date'], dtype='object')


Unique news categories.

In [None]:
# NOTE: as the output shows, each value is a *string* that looks like a Python
# list (e.g. "['world', 'national']"), not an actual list object.
data['News Categories'].unique()

array(["['national']", "['entertainment', 'national']",
       "['politics', 'national']", "['world', 'national']",
       "['national', 'technology']", "['business', 'national']",
       "['sports', 'national']",
       "['world', 'national', 'Health___Fitness']",
       "['national', 'Health___Fitness']", "['business', 'technology']",
       "['business']", "['business', 'startup']",
       "['automobile', 'business', 'technology']",
       "['business', 'fashion']", "['world', 'business']",
       "['world', 'business', 'technology']",
       "['automobile', 'business']",
       "['business', 'entertainment', 'national']",
       "['world', 'business', 'national']",
       "['business', 'science', 'technology']",
       "['cryptocurrency', 'business', 'technology']",
       "['automobile', 'business', 'national']", "['politics']",
       "['politics', 'sports', 'Asia_Cup_2023']",
       "['politics', 'entertainment']", "['sports']",
       "['sports', 'entertainment']", "['sports', 

Count occurrences of each news category.

In [None]:
# 'News Categories' holds each label list as its string form
# (e.g. "['world', 'national']"), so calling .explode() directly is a no-op and
# would count whole label combinations rather than individual categories.
# Pull the quoted labels out into real lists first, then explode and count.
# Assumes labels are single-quoted and contain no apostrophes, which matches
# the values seen in this dataset.
category_counts = (
    data['News Categories']
    .str.findall(r"'([^']*)'")
    .explode()
    .value_counts()
)

Print distribution of news articles by category.

In [None]:
# Header first, then one "<label>: <count>" line per entry of category_counts,
# in the order the Series already holds them.
print("\nDistribution of news articles across different categories:")
for label, n_articles in zip(category_counts.index, category_counts):
    print(f"{label}: {n_articles}")


Distribution of news articles across different categories:
['entertainment']: 11021
['miscellaneous']: 10545
['science']: 8901
['national']: 7552
['politics']: 7473
['entertainment', 'fashion']: 7042
['politics', 'national']: 6977
['business']: 6967
['sports']: 6939
['automobile', 'business']: 6913
['business', 'startup', 'technology']: 6724
['education', 'national']: 6095
['world']: 6019
['technology']: 5043
['startup']: 4650
['national', 'travel']: 4551
['Health___Fitness']: 4040
['education']: 3961
['business', 'technology']: 3710
['science', 'Health___Fitness']: 3574
['world', 'national']: 3384
['fashion']: 3187
['business', 'startup']: 3166
['sports', 'ODI_World_Cup_2023']: 2908
['travel']: 2905
['national', 'Health___Fitness']: 2369
['world', 'Israel-Hamas_War']: 2326
['startup', 'technology']: 2077
['automobile', 'technology']: 1852
['world', 'science']: 1360
['world', 'travel']: 1354
['business', 'national']: 1267
['world', 'Health___Fitness']: 1254
['sports', 'Asian_Games_202

**Data Preprocessing: Defines a preprocessing function to clean the text data (lowercasing, removing HTML tags, special characters, digits, and stopwords), and applies it to the 'Content' column.**

In [None]:
# Patterns are compiled once here instead of being re-parsed for every article.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# English stop-word set; filled on first use so the corpus file is read only
# once rather than once per row.
_STOP_WORDS = None


def _get_stop_words():
    """Return the NLTK English stop-word set, loading it only on first call."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one article into a space-separated string of content words.

    Steps: lowercase, strip HTML tags, drop every character that is not an
    ASCII letter or whitespace, tokenize with NLTK, and remove English
    stop words.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (e.g. the float NaN pandas
        uses for a missing cell, or None) is treated as an empty document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; '' when nothing is left.
    """
    # Missing cells arrive as NaN/None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)

    # Remove special characters, punctuation, and digits
    text = _NON_LETTER_RE.sub('', text)

    # Nothing but whitespace left: the tokenizer would yield no tokens anyway.
    if not text.strip():
        return ''

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and join the survivors back into text
    stop_words = _get_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

Apply the preprocessing function to the 'Content' column and store the result in a new 'clean_text' column.

In [None]:
# Run every article body through preprocess_text, element by element.
data['clean_text'] = data['Content'].map(preprocess_text)

**Splitting Data: Splits the preprocessed data into training and testing sets using 'train_test_split()'.**


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'],
    data['News Categories'],
    test_size=0.2,
    random_state=42,
)

**Feature Extraction: Converts the text data into TF-IDF features using 'TfidfVectorizer()'.**

In [None]:
# Cap the number of TF-IDF features at 5000.
vectorizer = TfidfVectorizer(max_features=5000) # Adjust max_features as needed
# Fit on the training split only, so no test-set statistics feed the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
# The test split is only transformed, reusing what was fitted on the training split.
X_test_tfidf = vectorizer.transform(X_test)

**Model Training: Initializes a Multinomial Naive Bayes classifier and trains it on the TF-IDF features**


In [None]:
# Multinomial Naive Bayes with default hyper-parameters.
classifier = MultinomialNB()
# Targets are the raw 'News Categories' strings, so each distinct label
# combination is treated as its own class.
classifier.fit(X_train_tfidf, y_train)


**Predictions on the testing set**

In [None]:
# Predict one label string per held-out article.
y_pred = classifier.predict(X_test_tfidf)

**Model Evaluation: Predicts the categories for the testing set, calculates accuracy, and displays a classification report showing precision, recall, and F1-score for each category.**


Compute and print accuracy.

In [None]:
# Share of test articles whose predicted label string equals the true one exactly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("\nAccuracy:", accuracy)


Accuracy: 0.7679635471433579


Print classification report.

In [None]:
print("\nClassification Report:")
# Some rare classes are never predicted, which leaves their precision undefined
# and makes scikit-learn emit an UndefinedMetricWarning. zero_division=0 reports
# those metrics as 0.0 (the same values as before) without the warning noise.
print(classification_report(y_test, y_pred, zero_division=0))


Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                                                                                                                             precision    recall  f1-score   support

                                                                                                                                                ['Asia_Cup_2023', 'sports']       0.00      0.00      0.00         8
                                                                                                                                        ['Coronavirus', 'Health___Fitness']       0.00      0.00      0.00        11
                                                                                                                            ['Coronavirus', 'national', 'Health___Fitness']       1.00      0.30      0.46        30
                                                                                                                                      ['Coronavirus