#### --------------------------------------------------------------------------------------------
### Description of the project:

##### As the name of the project is news category classification.

##### Here we predict the category of the news on the basis of news content.

#### ---------------------------------------------------------------------------------------------

In [1]:
#Now here we import the important libraries.

#Linear algebra.
import numpy as np

#Data preprocessing(read .csv and other format file).
import pandas as pd

#Data Visualization.
import matplotlib.pyplot as plt

#Statistic Visualization.
import seaborn as sns

#word embedding.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#Nlp task.
import nltk

#Regular expression.
import re

#Stopwords of nltk.
from nltk.corpus import stopwords

#Wordnetlemmatizer.
from nltk.stem import WordNetLemmatizer

#Punctuation marks of string.
from string import punctuation

#Standard scaler.
from sklearn.preprocessing import StandardScaler

#Multinomial naive bayes.
from sklearn.naive_bayes import MultinomialNB

#Model statistics.
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

#Data saving and loading.
import pickle

#Json.
import json

#Warning handler.
import warnings

#Ignore warnings.
warnings.filterwarnings("ignore")

In [2]:
#Now here we load the dataset.

df=pd.read_csv(r"english_news_dataset.csv")

In [3]:
#Now here we check the top records of dataset.

df.head()

Unnamed: 0,Headline,Content,News Categories,Date
0,Congress leader Baljinder Singh shot dead at h...,Congress leader Baljinder Singh was shot dead ...,['national'],19-09-2023
1,17-year-old girl preparing for NEET dies by su...,Another NEET aspirant died by suicide in Rajas...,['national'],19-09-2023
2,Hampers to welcome MPs in new Parliament tomor...,In order to mark the first-ever working day of...,['national'],19-09-2023
3,"Only 10% women lawmakers in RS, while only 14%...","Congress President Mallikarjun Kharge, while s...",['national'],19-09-2023
4,"Ganesh temple decorated with notes, coins wort...",The Sri Sathya Ganapathi Temple in Bengaluru a...,['national'],19-09-2023


In [4]:
#Now here we check the shape of dataset.

df.shape

(199706, 4)

In [5]:
#Now here we check the number of null values present in the dataset.

df.isnull().sum()

Headline           0
Content            0
News Categories    0
Date               0
dtype: int64

In [6]:
#Now here we check some values of Headline and content column.

print(df["Headline"][0])
print()
print(df["Content"][0])

Congress leader Baljinder Singh shot dead at home in Punjab's Moga

Congress leader Baljinder Singh was shot dead at his house in Punjab's Moga on Monday, a video of which has also surfaced online. According to a report by the Free Press Journal, Singh received a call from someone regarding the signing of documents. In the video, the accused can be seen shooting at Singh while he is walking towards him. 


In [7]:
#Now here we check some more values of Headline column.

for value in df["Headline"][:5]:
    print(value)

Congress leader Baljinder Singh shot dead at home in Punjab's Moga
17-year-old girl preparing for NEET dies by suicide in Rajasthan's Kota
Hampers to welcome MPs in new Parliament tomorrow, pics surface
Only 10% women lawmakers in RS, while only 14% in LS: INC's Kharge
Ganesh temple decorated with notes, coins worth ₹2.5 crore in Bengaluru


In [8]:
#Now here we check the News Categories columns corresponding to above Headlines.

for value in df["News Categories"][:5]:
    print(value)

['national']
['national']
['national']
['national']
['national']


In [9]:
#Now here we check the column names of dataset.

df.columns

Index(['Headline', 'Content', 'News Categories', 'Date'], dtype='object')

In [10]:
#Here we remove the unuseful or less useful columns.

df.drop(["Content","Date"],axis=1,inplace=True)

In [11]:
#Now here we again check the top records of dataset.

df.head()

Unnamed: 0,Headline,News Categories
0,Congress leader Baljinder Singh shot dead at h...,['national']
1,17-year-old girl preparing for NEET dies by su...,['national']
2,Hampers to welcome MPs in new Parliament tomor...,['national']
3,"Only 10% women lawmakers in RS, while only 14%...",['national']
4,"Ganesh temple decorated with notes, coins wort...",['national']


In [12]:
#Now here we deals with Headline column.

#Here we create the object of tfidf vectorizer.
tfidf=TfidfVectorizer(stop_words="english")

#Here we convert or do the word embedding of Headline column.
x=tfidf.fit_transform(df["Headline"])

In [13]:
#Now here we check the shape of x.

x.shape

(199706, 26189)

In [14]:
#Now here we make the dependent column.

y=df["News Categories"]

In [15]:
#Now here we split the dataset into train and test data.

#Here we import the train test split library.
from sklearn.model_selection import train_test_split

#Here we split the dataset in the ratio of 80-20%.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [16]:
#Now here we check the shape of x_train/test and y_train/test.

print(f"The shape of x_train is: {x_train.shape}.")

print(f"The shape of x_test is: {x_test.shape}.")

print(f"The shape of y_train is: {y_train.shape}.")

print(f"The shape of y_test is: {y_test.shape}.")

The shape of x_train is: (159764, 26189).
The shape of x_test is: (39942, 26189).
The shape of y_train is: (159764,).
The shape of y_test is: (39942,).


In [17]:
#now here we scaled the data.

#Here we create the object of standard scaler.
# sc=StandardScaler()

#Here we scaled the x_train data.
# x_train_sc=sc.fit_transform(x_train)

#Here we scaled the x_test data.
# x_test_sc=sc.transform(x_test)

In [18]:
#Now here we apply multinomial naive bayes classifier.

#Here we create the object of multinomial naive bayes classifier.
mnc=MultinomialNB()

#Here we train the mnc model.
mnc.fit(x_train,y_train)

#Here we predict the values using mnc.
pred_mnc=mnc.predict(x_test)

#Here we find the accuracy of mnc model.
acc_mnc=accuracy_score(y_test,pred_mnc)
print(f"The accuracy of mnc model is: {acc_mnc}.")

#Here we find the bias of mnc model.
bias_mnc=mnc.score(x_train,y_train)
print(f"The bias of mnc model is: {bias_mnc}.")

#Here we find the variance of mnc model.
variance_mnc=mnc.score(x_test,y_test)
print(f"The variance of mnc model is: {variance_mnc}.")

The accuracy of mnc model is: 0.7636072304842021.
The bias of mnc model is: 0.7899964948298741.
The variance of mnc model is: 0.7636072304842021.


In [19]:
#Now here we apply logistic regression.

# #Here we create the object of logistic regression.
# from sklearn.linear_model import LogisticRegression
# lor=LogisticRegression()

# #Here we train the lor model.
# lor.fit(x_train,y_train)

#Here we predict the values using lor.
# pred_lor=lor.predict(x_test)

#Here we find the accuracy of lor model.
# acc_lor=accuracy_score(y_test,pred_lor)
# print(f"The accuracy of lor model is: {acc_lor}.")

#Here we find the bias of lor model.
# bias_lor=lor.score(x_train,y_train)
# print(f"The bias of lor model is: {bias_lor}.")

#Here we find the variance of lor model.
# variance_lor=lor.score(x_test,y_test)
# print(f"The variance of lor model is: {variance_lor}.")

In [20]:
#Now here we test one of the input.

df.columns

Index(['Headline', 'News Categories'], dtype='object')

In [21]:
#Here we take one of the input.

print(df["Headline"][14])
print()
print(df["News Categories"][14])

Zareen Khan's lawyer issues statement on arrest warrant, says 'she was victim of fraud'

['entertainment', 'national']


In [22]:
#Here we take one input text.

input_text="Zareen Khan's lawyer issues statement on arrest warrant, says 'she was victim of fraud'"

#Now here we do the embedding.
embed_text=tfidf.transform([input_text])

#Now here we do the prediction.
prediction=mnc.predict(embed_text)[0]

print(f"The prediction is: {prediction}.")

The prediction is: ['politics', 'national'].


In [23]:
#Now here we save the model and vectorizer.

pickle.dump(mnc,open("model.pkl","wb"))

pickle.dump(tfidf,open("vectorizer.pkl","wb"))