## Importing Libraries and reading dataset

In [1]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("news10.csv")
print(df.head())

  category                                           headline
0  FINANCE  U.S. Launches Auto Import Probe, China Vows To...
1  FINANCE  Starbucks Says Anyone Can Now Sit In Its Cafes...
2  FINANCE  Seattle Passes Controversial New Tax On City's...
3  FINANCE  Uber Ends Forced Arbitration In Individual Cas...
4  FINANCE  Chili's Hit By Data Breach, Credit And Debit C...


In [38]:
df['category'].value_counts()


EDUCATION     4760
FINANCE       4254
SPORTS        4167
Healthcare    3756
POLITICS      2905
category         1
Name: category, dtype: int64

In [7]:
data=df['headline']
target=df['category']

In [8]:
len(data)

19843

## Train test split for splitting data

In [9]:
from sklearn.model_selection import train_test_split
#for splitting our data into train and test split
xtrain,xtest,ytrain,ytest=train_test_split(data,target,test_size=0.2,random_state=42)
# train  80 % test 20 % 

In [10]:
xtrain.shape


(15874,)

In [11]:
len(ytrain)

15874

## converting words into Tfidf vectors

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Convert a collection of raw documents to a matrix of TF-IDF features.
#TfidfVectorizer is a bag of words approach.
vectorizer =TfidfVectorizer(stop_words='english',ngram_range=(1,2))
#stop_words to remove less meaningingful words in english language
#n_gram for outputting one word and two word 
xtrain = vectorizer.fit_transform(xtrain.values.astype('U'))
xtest = vectorizer.transform(xtest.values.astype('U'))

#Term Frequency(TF) and Inverse Document Frequency(IDF). The term frequency indicates the frequency of each of the words present in the document or dataset. 
#The second part is — inverse document frequency. IDF actually tells us how important the word is to the document.

In [13]:
xtrain.shape

(15874, 84664)

In [14]:
xtest.shape

(3969, 84664)

In [15]:
ytrain.shape

(15874,)

In [48]:
ytrain

13453        SPORTS
12561        SPORTS
11335     EDUCATION
6074      EDUCATION
7199     Healthcare
            ...    
11284     EDUCATION
11964     EDUCATION
5390      EDUCATION
860         FINANCE
15795        SPORTS
Name: category, Length: 15874, dtype: object

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
#The multinomial Naive Bayes(mnb) classifier is suitable for classification with discrete features.

In [17]:
mnb.fit(xtrain,ytrain)
prediction = mnb.predict(xtest)
print(prediction)

['SPORTS' 'EDUCATION' 'FINANCE' ... 'EDUCATION' 'Healthcare' 'Healthcare']


## checking accuracy

In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, prediction)
#A confusion matrix is a table that is often used to describe the performance 
#of a classification model on a set of test data for which the true values are known.


array([[764,  95,  36,   6,  45,   0],
       [141, 607,  49,   9,  28,   0],
       [145,  61, 519,   8,  30,   0],
       [142,  86,  16, 290,  40,   0],
       [ 52,  14,  11,   4, 770,   0],
       [  1,   0,   0,   0,   0,   0]], dtype=int64)

In [19]:
from sklearn.metrics import accuracy_score
acc=accuracy_score(ytest,prediction)
print(acc)

0.7432602670697909


In [23]:
new4=['rcb won the match']
new4=vectorizer.transform(new4)
p=mnb.predict(new4.todense())
print(p)

['SPORTS']
