In [1]:
# 1. import required libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.metrics import accuracy_score,classification_report


In [2]:
# 3. Load the 20 newsgroups training subset from its CSV file.

train_csv = "nlp_train.csv"
train_df = pd.read_csv(train_csv)

# Preview the first rows.
train_df.head()

	

Unnamed: 0,text,target,category
0,I was wondering if anyone out there could enli...,7,rec.autos
1,A fair number of brave souls who upgraded thei...,4,comp.sys.mac.hardware
2,"well folks, my mac plus finally gave up the gh...",4,comp.sys.mac.hardware
3,\nDo you have Weitek's address/phone number? ...,1,comp.graphics
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,sci.space


In [3]:
# 4. Load the 20 newsgroups test subset from its CSV file.

test_csv = "nlp_test.csv"
test_df = pd.read_csv(test_csv)

# Preview the first rows.
test_df.head()


Unnamed: 0,text,target,category
0,I am a little confused on all of the models of...,7,rec.autos
1,I'm not familiar at all with the format of the...,5,comp.windows.x
2,"\nIn a word, yes.\n",0,alt.atheism
3,\nThey were attacking the Iraqis to drive them...,17,talk.politics.mideast
4,\nI've just spent two solid months arguing tha...,19,talk.religion.misc


In [4]:
# 5. Print the distinct category labels found in each split.

train_labels = train_df["category"].unique()
test_labels = test_df["category"].unique()

print("Target labels train df: ", train_labels)
print("\n\nTarget labels test df: ", test_labels)

Target labels train df:  ['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Target labels test df:  ['rec.autos' 'comp.windows.x' 'alt.atheism' 'talk.politics.mideast'
 'talk.religion.misc' 'sci.med' 'soc.religion.christian' 'comp.graphics'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'comp.sys.mac.hardware'
 'misc.forsale' 'talk.politics.guns' 'sci.space'
 'comp.sys.ibm.pc.hardware' 'sci.crypt' 'rec.sport.baseball'
 'rec.sport.hockey' 'talk.politics.misc' 'sci.electronics']


In [5]:
# 6. Prepare subsets restricted to the categories alt.atheism, comp.graphics and sci.space.

cats = ["alt.atheism", "comp.graphics", "sci.space"]

# .copy() makes each subset an independent DataFrame, so later cleaning steps
# that modify it do not raise SettingWithCopyWarning.
train_sub = train_df[train_df["category"].isin(cats)].copy()
test_sub = test_df[test_df["category"].isin(cats)].copy()

# Preview both subsets; the cell output is a tuple of the two previews.
train_sub.head() , test_sub.head()


(                                                 text  target       category
 3   \nDo you have Weitek's address/phone number?  ...       1  comp.graphics
 4   From article <C5owCB.n3p@world.std.com>, by to...      14      sci.space
 13  \n   {Description of "External Tank" option fo...      14      sci.space
 15  \nDon't be so sure.  Look what happened to Jap...       0    alt.atheism
 16  \nI certainly do use it whenever I have to do ...       1  comp.graphics,
                                                  text  target       category
 2                                 \nIn a word, yes.\n       0    alt.atheism
 9   :  \n: well, i have lots of experience with sc...       1  comp.graphics
 14  \nProbably because it IS rape.\n\n\nSo nothing...       0    alt.atheism
 17  Hello,\ni'm interested in those devices too.\n...       1  comp.graphics
 20  This is an invitation to send articles to the ...       1  comp.graphics)

In [6]:
# 7, 8. From this cell on, train_df / test_df refer to the three-category subsets.

train_df, test_df = train_sub, test_sub

In [7]:
# Count missing values per column in the training subset.
train_df.isna().sum()

text        33
target       0
category     0
dtype: int64

In [8]:
# Drop the rows whose text is missing.
# The result is reassigned rather than using inplace=True: an in-place drop on a
# frame produced by filtering another frame raises SettingWithCopyWarning.
train_df = train_df.dropna(subset=["text"])
train_df.head()

Unnamed: 0,text,target,category
3,\nDo you have Weitek's address/phone number? ...,1,comp.graphics
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,sci.space
13,"\n {Description of ""External Tank"" option fo...",14,sci.space
15,\nDon't be so sure. Look what happened to Jap...,0,alt.atheism
16,\nI certainly do use it whenever I have to do ...,1,comp.graphics


In [9]:
# Re-check missing values per column after the drop.
train_df.isna().sum()

text        0
target      0
category    0
dtype: int64

In [10]:
# 9. Print the category labels present in the filtered training set.

new_train_labels = train_df["category"].unique()
print("Target labels new train df: ", new_train_labels)

Target labels new train df:  ['comp.graphics' 'sci.space' 'alt.atheism']


In [11]:
# 10. Print the text of the 5th training article (positional index 4).
fifth_article = train_df["text"].iloc[4]
print("5th article : \n" , fifth_article)

5th article : 
 
I certainly do use it whenever I have to do TIFF, and it usually works
very well.  That's not my point.  I'm >philosophically< opposed to it
because of its complexity.

This complexity has led to some programs' poor TIFF writers making
some very bizarre files, other programs' inability to load TIFF
images (though they'll save them, of course), and a general
inability to interchange images between different environments
despite the fact they all think they understand TIFF.

As the saying goes, "It's not me I'm worried about- it's all the
abuse of TIFF over the years, and I chalk it all up to the immense (and
unnecessary) complexity of the format.

In the words of the TIFF 5.0 spec, Appendix G, page G-1 (capitalized
emphasis mine):

"The only problem with this sort of success is that TIFF was designed
to be powerful and flexible, at the expense of simplicity.  It takes a
fair amount of effort to handle all the options currently defined in
this specification (PROBABLY NO 

In [12]:
# Same lookup done purely by position: row 4, column 0.
article_text = train_df.iloc[4, 0]
print("5th article : " , article_text)


5th article :  
I certainly do use it whenever I have to do TIFF, and it usually works
very well.  That's not my point.  I'm >philosophically< opposed to it
because of its complexity.

This complexity has led to some programs' poor TIFF writers making
some very bizarre files, other programs' inability to load TIFF
images (though they'll save them, of course), and a general
inability to interchange images between different environments
despite the fact they all think they understand TIFF.

As the saying goes, "It's not me I'm worried about- it's all the
abuse of TIFF over the years, and I chalk it all up to the immense (and
unnecessary) complexity of the format.

In the words of the TIFF 5.0 spec, Appendix G, page G-1 (capitalized
emphasis mine):

"The only problem with this sort of success is that TIFF was designed
to be powerful and flexible, at the expense of simplicity.  It takes a
fair amount of effort to handle all the options currently defined in
this specification (PROBABLY NO A

In [13]:
# Print the value at row 4, column 0 with no label in front of it.
row_pos, col_pos = 4, 0
print(train_df.iloc[row_pos, col_pos])


I certainly do use it whenever I have to do TIFF, and it usually works
very well.  That's not my point.  I'm >philosophically< opposed to it
because of its complexity.

This complexity has led to some programs' poor TIFF writers making
some very bizarre files, other programs' inability to load TIFF
images (though they'll save them, of course), and a general
inability to interchange images between different environments
despite the fact they all think they understand TIFF.

As the saying goes, "It's not me I'm worried about- it's all the
abuse of TIFF over the years, and I chalk it all up to the immense (and
unnecessary) complexity of the format.

In the words of the TIFF 5.0 spec, Appendix G, page G-1 (capitalized
emphasis mine):

"The only problem with this sort of success is that TIFF was designed
to be powerful and flexible, at the expense of simplicity.  It takes a
fair amount of effort to handle all the options currently defined in
this specification (PROBABLY NO APPLICATION DOES

In [14]:
# (rows, columns) of the training subset at this point in the notebook.
train_df.shape

(1624, 3)

In [15]:
# (rows, columns) of the test subset at this point in the notebook.
test_df.shape

(1102, 3)

In [16]:
# 13. Convert the training text into token-count features with CountVectorizer.

# Labels are the category names.
y_train = train_df["category"]

# Fit the vocabulary on the training text and build the sparse count matrix.
cv = CountVectorizer()
x_train = cv.fit_transform(train_df["text"])

print(x_train)


  (0, 7674)	1
  (0, 23500)	1
  (0, 10608)	1
  (0, 22974)	1
  (0, 2502)	1
  (0, 16244)	1
  (0, 15161)	1
  (0, 13123)	1
  (0, 21402)	1
  (0, 9999)	1
  (0, 19757)	1
  (0, 11628)	1
  (0, 2293)	1
  (0, 21239)	1
  (0, 5265)	1
  (1, 23500)	1
  (1, 10608)	1
  (1, 9677)	1
  (1, 3405)	1
  (1, 4743)	1
  (1, 14672)	1
  (1, 23255)	2
  (1, 20196)	2
  (1, 5666)	2
  (1, 4729)	1
  :	:
  (1623, 23041)	1
  (1623, 15073)	1
  (1623, 16526)	1
  (1623, 13850)	1
  (1623, 15656)	1
  (1623, 2928)	1
  (1623, 13800)	1
  (1623, 9062)	1
  (1623, 7581)	1
  (1623, 6308)	1
  (1623, 16535)	2
  (1623, 23281)	1
  (1623, 18147)	1
  (1623, 14793)	1
  (1623, 17432)	1
  (1623, 19956)	2
  (1623, 8791)	1
  (1623, 10539)	1
  (1623, 23311)	1
  (1623, 17170)	1
  (1623, 22107)	1
  (1623, 19914)	1
  (1623, 20247)	1
  (1623, 11127)	1
  (1623, 8496)	1


In [17]:
# Fit a Bernoulli naive Bayes classifier on the count features.
model = BernoulliNB()
model.fit(X=x_train, y=y_train)

In [18]:
# Count missing values per column in the test subset.
test_df.isna().sum()

text        22
target       0
category     0
dtype: int64

In [19]:
# Drop the test rows whose text is missing, mirroring the training-set cleanup.
# The result is reassigned rather than using inplace=True: an in-place drop on a
# frame produced by filtering another frame raises SettingWithCopyWarning.
test_df = test_df.dropna(subset=["text"])

In [20]:

# Re-check missing values per column after the drop.
test_df.isna().sum()

text        0
target      0
category    0
dtype: int64

In [24]:
# 15. Convert the test text into count features with the CountVectorizer
# that was fitted on the training text (transform only, no refit).

y_test = test_df["category"]
x_test = cv.transform(test_df["text"])


In [25]:
# 16. Predict target labels for the test set and show the first five.

y_pred = model.predict(x_test)

first_five = y_pred[:5]
print(first_five)


['comp.graphics' 'comp.graphics' 'alt.atheism' 'comp.graphics' 'sci.space']


In [26]:
# 17. Compute the accuracy score on the test set.

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy : {accuracy:.2f}")


Test set accuracy : 0.71


In [27]:
# 18. Use TfidfVectorizer instead of CountVectorizer.

# Labels for both splits.
y_tr = train_df["category"]
y_te = test_df["category"]

# Fit on the training text, then apply the fitted vectorizer to the test text.
tfv = TfidfVectorizer()
x_tr = tfv.fit_transform(train_df["text"])
x_te = tfv.transform(test_df["text"])


In [28]:
# 18 b) Fit a multinomial naive Bayes classifier on the TF-IDF features.

multi = MultinomialNB()
multi.fit(X=x_tr, y=y_tr)



In [29]:
# 18 c) Predict labels for the TF-IDF test features.

y_predict = multi.predict(X=x_te)

# Show the first five predictions as the cell output.
y_predict[:5]



array(['alt.atheism', 'comp.graphics', 'alt.atheism', 'comp.graphics',
       'sci.space'], dtype='<U13')

In [30]:
# 19. Compute the test accuracy of the multinomial naive Bayes model.

ac = accuracy_score(y_true=y_te, y_pred=y_predict)

print(f"Accuaracy of multinomial model : {ac:.2f}")


Accuaracy of multinomial model : 0.87
