# Task 1 : Data selection

## Task 1.1 : Display all 20 categories present in the input dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups dataset with scikit-learn's default arguments.
newsgroup_data = fetch_20newsgroups()
# Last expression of the cell: display the list of category names.
newsgroup_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Task 1.2 : Select the input data based on two categories 
## a) 'rec.motorcycles'
## b) 'sci.electronics'

In [2]:
# Restrict the corpus to the two newsgroups studied in Tasks 2 and 3.
categories = [
    'rec.motorcycles',
    'sci.electronics',
]
train = fetch_20newsgroups(categories=categories, subset='train')

In [3]:
# Held-out split for the same categories selected for `train`.
test = fetch_20newsgroups(subset='test',categories=categories)

# Task 2 : Training and Testing 

## Task 2.1 : Convert text data into vector form

In [4]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Make sure the NLTK stop-word corpus is present before it is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')
tfidf_options = dict(
    sublinear_tf=True,   # sublinear tf scaling: replace tf with 1 + log(tf)
    max_df=0.5,          # ignore terms that appear in more than this fraction of documents
    min_df=1,            # ignore terms whose document frequency is below this count
    stop_words=english_stop_words,
    use_idf=True,        # enable inverse-document-frequency reweighting
    smooth_idf=True,     # smooth idf weights by adding one to document frequencies
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and idf weights from the training text and vectorize it in one step.
X_train = vectorizer.fit_transform(train.data)
print("X_train n_samples: %d, n_features: %d" % X_train.shape)


X_train n_samples: 1189, n_features: 19592


[nltk_data] Downloading package stopwords to C:\Users\ABIKJITH
[nltk_data]     REDDY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Display the stored TF-IDF weights of the training matrix.
X_train.data

array([0.04065118, 0.05379061, 0.06787162, ..., 0.10105695, 0.06094588,
       0.08078682])

In [7]:
# Vectorize the test documents with the vectorizer fitted on the training data
# (transform only, so both matrices share the same feature columns).
X_test = vectorizer.transform(test.data)
n_test_samples, n_test_features = X_test.shape
print("X_test n_samples: %d, n_features: %d" % (n_test_samples, n_test_features))

X_test n_samples: 791, n_features: 19592


In [8]:
# Display the summary repr of the vectorized test matrix.
X_test

<791x19592 sparse matrix of type '<class 'numpy.float64'>'
	with 64986 stored elements in Compressed Sparse Row format>

In [9]:
# Load the NLTK English stop-word list that was passed to the vectorizer.
stop_words = stopwords.words('english')
# Peek at the first nine entries.
stop_words[:9]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you']

## Task 2.2 : Apply logistic regression 

In [10]:
# Fit a logistic-regression classifier (scikit-learn default settings)
# on the TF-IDF training matrix and its labels.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf = clf.fit(X_train, train.target)

## Task 2.3 : Print the first 30  samples of predicted and actual output 

In [11]:
# Predicted labels for the vectorized test documents (logistic regression).
pred = clf.predict(X_test)

In [12]:
# Display the shape of the prediction array.
pred.shape

(791,)

In [13]:
# First 30 predicted labels.
pred[0:30]

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0], dtype=int64)

In [14]:
import pandas as pd

# Side-by-side table of the true test labels and the logistic-regression predictions.
df = pd.DataFrame(
    {
        "Actual": test.target,
        "Predicted": pred,
    }
)

In [15]:
# First 30 rows of actual vs. predicted labels (logistic regression).
df.head(30)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,1,1
7,1,1
8,0,0
9,0,0


## Task 2.4 : Find accuracy, precision, recall, and F1 score 

In [16]:
# Import the `metrics` submodule explicitly: a bare `import sklearn` does not
# load it, so `sklearn.metrics.*` would only resolve if some other import had
# already pulled it in. This form still binds the name `sklearn` for later cells.
import sklearn.metrics

# Logistic-regression scores on the test split (true labels vs. `pred`).
# Accuracy is left unrounded; the other scores are rounded to two decimals.
acc = sklearn.metrics.accuracy_score(test.target, pred)
# precision = ability of the classifier not to label a negative sample as positive
prec = round(sklearn.metrics.precision_score(test.target, pred), 2)
# recall = ability of the classifier to find all the positive samples
rec = round(sklearn.metrics.recall_score(test.target, pred), 2)
# f1 = harmonic mean of precision and recall
f1 = round(sklearn.metrics.f1_score(test.target, pred), 2)
print('accuracy =',acc, '\nprecision =', prec, '\nrecall =', rec, '\nf1 =',f1)

accuracy = 0.9860935524652339 
precision = 0.98 
recall = 0.99 
f1 = 0.99


# Task 3 : Implementation of  Naive Bayes Algorithm

## Task 3.2 : Apply Naive Bayes algorithm 

In [17]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial Naive Bayes on the same TF-IDF training matrix.
# From here on `clf` refers to this model instead of the logistic regression.
clf = MultinomialNB()
clf = clf.fit(X_train, train.target)


## Task 3.3 : Print the first 30 samples of predicted and actual data 

In [18]:
# Predict test-set labels with the Naive Bayes model currently held in `clf`.
y_pred = clf.predict(X_test)

In [19]:
import pandas as pd

# Side-by-side table of the true test labels and the Naive Bayes predictions.
df1 = pd.DataFrame(
    {
        "Actual": test.target,
        "Predicted": y_pred,
    }
)

In [20]:
# First 30 rows of actual vs. predicted labels (Naive Bayes).
df1.head(30)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,1,1
7,1,1
8,0,0
9,0,0


In [21]:
# Naive Bayes scores on the test split (true labels vs. `y_pred`).
# Accuracy is left unrounded; the other scores are rounded to two decimals.
acc_n = sklearn.metrics.accuracy_score(y_true=test.target, y_pred=y_pred)
# precision: how rarely a negative sample is labelled positive
prec_n = round(sklearn.metrics.precision_score(y_true=test.target, y_pred=y_pred), 2)
# recall: how many of the positive samples are found
rec_n = round(sklearn.metrics.recall_score(y_true=test.target, y_pred=y_pred), 2)
f1_n = round(sklearn.metrics.f1_score(y_true=test.target, y_pred=y_pred), 2)
print(
    'accuracy =', acc_n,
    '\nprecision =', prec_n,
    '\nrecall =', rec_n,
    '\nf1 =', f1_n,
)

accuracy = 0.9797724399494311 
precision = 0.99 
recall = 0.97 
f1 = 0.98


## Task 3.4 : Compare the results of logistic regression and Naive Bayes for text classification

In [22]:
# Comparison table: one row per model, one column per metric.
dff = pd.DataFrame(
    {
        "Accuracy": [acc, acc_n],
        "Precision": [prec, prec_n],
        "recall": [rec, rec_n],
        "F1-Score": [f1, f1_n],
    },
    index=["Logistic Regression", "Naive Bayes"],
)

In [23]:
# Display the model comparison table.
dff

Unnamed: 0,Accuracy,Precision,recall,F1-Score
Logistic Regression,0.986094,0.98,0.99,0.99
Naive Bayes,0.979772,0.99,0.97,0.98


In [24]:
# NOTE(review): in the comparison table above, Logistic Regression is ahead on
# accuracy, recall and F1; Naive Bayes leads only on precision (0.99 vs 0.98).
# The conclusion printed below is based on precision alone.
print("Even though both  have the approximately same results. With regards to precision score we can say Naive Bayes is a better classifier")

Even though both  have the approximately same results. With regards to precision score we can say Naive Bayes is a better classifier


# Task 4: Applying classification on different categories

In [25]:
# Show the available category names again before choosing the Task 4 pair.
newsgroup_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Task 4.1 Select the input based on following two categories only
## a) 'rec.sport.baseball'
## b) 'comp.sys.mac.hardware'

In [26]:
# Task 4 pair of newsgroups; `categories` and `train` from the earlier tasks are overwritten.
categories = [
    'rec.sport.baseball',
    'comp.sys.mac.hardware',
]
train = fetch_20newsgroups(categories=categories, subset='train')

In [27]:
# Held-out split for the Task 4 categories; replaces the earlier `test`.
test = fetch_20newsgroups(subset='test',categories=categories)

## Task 4.2 : Consider and perform steps of  tasks 2 and 3 for above categories  

In [28]:
# Make sure the NLTK stop-word corpus is present before it is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Same TF-IDF configuration as in Task 2, built as a fresh vectorizer for the new categories.
english_stop_words = stopwords.words('english')
tfidf_options = dict(
    sublinear_tf=True,   # sublinear tf scaling: replace tf with 1 + log(tf)
    max_df=0.5,          # ignore terms that appear in more than this fraction of documents
    min_df=1,            # ignore terms whose document frequency is below this count
    stop_words=english_stop_words,
    use_idf=True,        # enable inverse-document-frequency reweighting
    smooth_idf=True,     # smooth idf weights by adding one to document frequencies
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and idf weights from the Task 4 training text and vectorize it.
X_train = vectorizer.fit_transform(train.data)
print("X_train n_samples: %d, n_features: %d" % X_train.shape)

X_train n_samples: 1175, n_features: 17300


[nltk_data] Downloading package stopwords to C:\Users\ABIKJITH
[nltk_data]     REDDY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Vectorize the Task 4 test documents with the vectorizer fitted on the training data
# (transform only, so both matrices share the same feature columns).
X_test = vectorizer.transform(test.data)
n_test_samples, n_test_features = X_test.shape
print("X_test n_samples: %d, n_features: %d" % (n_test_samples, n_test_features))

X_test n_samples: 782, n_features: 17300


In [30]:
# Fit a logistic-regression classifier (scikit-learn default settings)
# on the Task 4 training matrix; `clf` is rebound to this new model.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf = clf.fit(X_train, train.target)

In [31]:
# Logistic-regression predictions for the Task 4 test documents.
pred = clf.predict(X_test)

In [32]:
# Side-by-side table of true labels and logistic-regression predictions (Task 4).
df4 = pd.DataFrame({"Actual":test.target,"Predicted":pred})

In [33]:
# First 30 rows of actual vs. predicted labels.
df4.head(30)

Unnamed: 0,Actual,Predicted
0,0,0
1,1,1
2,1,1
3,0,0
4,1,1
5,1,1
6,0,0
7,1,1
8,0,0
9,1,1


In [34]:
# Logistic-regression scores for Task 4 (true labels vs. `pred`).
# Accuracy is left unrounded; the other scores are rounded to two decimals.
acc1 = sklearn.metrics.accuracy_score(y_true=test.target, y_pred=pred)
# precision: how rarely a negative sample is labelled positive
prec1 = round(sklearn.metrics.precision_score(y_true=test.target, y_pred=pred), 2)
# recall: how many of the positive samples are found
rec1 = round(sklearn.metrics.recall_score(y_true=test.target, y_pred=pred), 2)
f11 = round(sklearn.metrics.f1_score(y_true=test.target, y_pred=pred), 2)
print(
    'accuracy =', acc1,
    '\nprecision =', prec1,
    '\nrecall =', rec1,
    '\nf1 =', f11,
)

accuracy = 0.9910485933503836 
precision = 0.99 
recall = 0.99 
f1 = 0.99


In [35]:
# Naive Bayes: fit a multinomial model on the Task 4 training matrix.
# `clf` is rebound, so the logistic-regression model is no longer referenced by it.

clf = MultinomialNB().fit(X_train, train.target)

In [36]:
# Naive Bayes predictions for the Task 4 test documents.
y_pred = clf.predict(X_test)

In [37]:
# Side-by-side table of true labels and Naive Bayes predictions (Task 4).
df5 = pd.DataFrame({"Actual":test.target,"Predicted":y_pred})

In [38]:
# First 30 rows of actual vs. predicted labels.
df5.head(30)

Unnamed: 0,Actual,Predicted
0,0,0
1,1,1
2,1,1
3,0,0
4,1,1
5,1,1
6,0,0
7,1,1
8,0,0
9,1,1


In [39]:
# Naive Bayes scores for Task 4 (true labels vs. `y_pred`).
# Accuracy is left unrounded; the other scores are rounded to two decimals.
acc_n1 = sklearn.metrics.accuracy_score(y_true=test.target, y_pred=y_pred)
# precision: how rarely a negative sample is labelled positive
prec_n1 = round(sklearn.metrics.precision_score(y_true=test.target, y_pred=y_pred), 2)
# recall: how many of the positive samples are found
rec_n1 = round(sklearn.metrics.recall_score(y_true=test.target, y_pred=y_pred), 2)
f1_n1 = round(sklearn.metrics.f1_score(y_true=test.target, y_pred=y_pred), 2)
print(
    'accuracy =', acc_n1,
    '\nprecision =', prec_n1,
    '\nrecall =', rec_n1,
    '\nf1 =', f1_n1,
)

accuracy = 0.9872122762148338 
precision = 0.98 
recall = 0.99 
f1 = 0.99


In [40]:
# Task 4 comparison table: one row per model, one column per metric.
dff1 = pd.DataFrame(
    {
        "Accuracy": [acc1, acc_n1],
        "Precision": [prec1, prec_n1],
        "recall": [rec1, rec_n1],
        "F1-Score": [f11, f1_n1],
    },
    index=["Logistic Regression", "Naive Bayes"],
)

In [41]:
# Display the Task 4 comparison table.
dff1

Unnamed: 0,Accuracy,Precision,recall,F1-Score
Logistic Regression,0.991049,0.99,0.99,0.99
Naive Bayes,0.987212,0.98,0.99,0.99


In [42]:
# In the table above Logistic Regression leads on accuracy and precision;
# recall and F1 are equal for both models at two decimals.
print("In this also we can see that both the models have approximately same performance. But depending upon the precision score Logistic Regression is the best classifier")

In this also we can see that both the models have approximately same performance. But depending upon the precision score Logistic Regression is the best classifier


# Task 5 : Naive Bayes algorithm 

In [43]:
# Unseen example sentences to classify with the Task 4 model.
new = [
    'What are the different parts of a computer?',
    'Playing baseball is good for health',
    'In which games you are intrested?',
    'The team might not win if there is rain',
]

In [44]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Re-fit multinomial Naive Bayes on the current (Task 4) training matrix so that
# `clf` is the model used for the new sentences.
clf = MultinomialNB()
clf = clf.fit(X_train, train.target)

# Test-set predictions with the re-fitted model.
y_pred = clf.predict(X_test)

In [45]:
# Vectorize the new sentences with the already-fitted vectorizer (transform only, no re-fit).
new_tfidf = vectorizer.transform(new)

In [46]:
# Predicted label index for each new sentence; overwrites the earlier `pred`.
pred = clf.predict(new_tfidf)

In [47]:
# Print each sentence next to the name of the newsgroup predicted for it.
for sentence, label_index in zip(new, pred):
    predicted_group = train.target_names[label_index]
    print(sentence, "----", predicted_group)

What are the different parts of a computer? ---- comp.sys.mac.hardware
Playing baseball is good for health ---- rec.sport.baseball
In which games you are intrested? ---- rec.sport.baseball
The team might not win if there is rain ---- rec.sport.baseball


# Task 6 : Classification on all input categories

In [48]:
# Show the full list of category names used for the all-category run.
newsgroup_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [49]:
# Use every category. The names are copied from the dataset loaded at the top of
# the notebook instead of retyping all twenty by hand, so the list cannot drift
# from the dataset's own `target_names`.
categories = list(newsgroup_data.target_names)
train = fetch_20newsgroups(subset='train',categories=categories)

In [50]:
# Held-out split for the same category list; replaces the earlier `test`.
test = fetch_20newsgroups(subset='test',categories=categories)

In [51]:
# Re-fit the existing `vectorizer` object on the all-category training text.
# This replaces the vocabulary it learned earlier, so matrices produced before
# this cell no longer share its feature columns.
X_train = vectorizer.fit_transform(train.data)
n_train_samples, n_train_features = X_train.shape
print("X_train n_samples: %d, n_features: %d" % (n_train_samples, n_train_features))

X_train n_samples: 11314, n_features: 129958


In [52]:
# Vectorize the all-category test documents with the re-fitted vectorizer
# (transform only, so both matrices share the same feature columns).
X_test = vectorizer.transform(test.data)
n_test_samples, n_test_features = X_test.shape
print("X_test n_samples: %d, n_features: %d" % (n_test_samples, n_test_features))

X_test n_samples: 7532, n_features: 129958


In [53]:
# Fit logistic regression (default settings) on the all-category training matrix.
clf = LogisticRegression().fit(X_train,train.target)

In [54]:
# Logistic-regression predictions for the all-category test documents.
pred = clf.predict(X_test)

In [55]:
# Side-by-side table of true labels and logistic-regression predictions (all categories).
df6 = pd.DataFrame({"Actual":test.target,"Predicted":pred})

In [56]:
# First 30 rows of actual vs. predicted labels.
df6.head(30)

Unnamed: 0,Actual,Predicted
0,7,7
1,5,1
2,0,0
3,17,18
4,19,0
5,13,13
6,15,15
7,15,2
8,5,5
9,1,1


In [57]:
# Logistic-regression scores over all categories (true labels vs. `pred`).
# Precision, recall and F1 are macro-averaged and rounded to two decimals;
# accuracy is left unrounded.
acc_a = sklearn.metrics.accuracy_score(y_true=test.target, y_pred=pred)
# precision: how rarely a negative sample is labelled positive
prec_a = round(sklearn.metrics.precision_score(y_true=test.target, y_pred=pred, average='macro'), 2)
# recall: how many of the positive samples are found
rec_a = round(sklearn.metrics.recall_score(y_true=test.target, y_pred=pred, average='macro'), 2)
f1_a = round(sklearn.metrics.f1_score(y_true=test.target, y_pred=pred, average='macro'), 2)
print(
    'accuracy =', acc_a,
    '\nprecision =', prec_a,
    '\nrecall =', rec_a,
    '\nf1 =', f1_a,
)

accuracy = 0.8454593733404142 
precision = 0.85 
recall = 0.84 
f1 = 0.84


In [58]:
# Naive Bayes: fit a multinomial model on the all-category training matrix.
# `clf` is rebound, so the logistic-regression model is no longer referenced by it.

clf = MultinomialNB().fit(X_train, train.target)

In [59]:
# Naive Bayes predictions for the all-category test documents.
y_pred = clf.predict(X_test)

In [60]:
# Side-by-side table of true labels and Naive Bayes predictions (all categories).
df7 = pd.DataFrame({"Actual":test.target,"Predicted":y_pred})

In [61]:
# First 30 rows of actual vs. predicted labels.
df7.head(30)

Unnamed: 0,Actual,Predicted
0,7,7
1,5,11
2,0,0
3,17,17
4,19,0
5,13,13
6,15,15
7,15,2
8,5,5
9,1,1


In [62]:
# Naive Bayes scores over all categories (true labels vs. `y_pred`).
# Precision, recall and F1 are macro-averaged and rounded to two decimals;
# accuracy is left unrounded.
acc_an = sklearn.metrics.accuracy_score(y_true=test.target, y_pred=y_pred)
# precision: how rarely a negative sample is labelled positive
prec_an = round(sklearn.metrics.precision_score(y_true=test.target, y_pred=y_pred, average='macro'), 2)
# recall: how many of the positive samples are found
rec_an = round(sklearn.metrics.recall_score(y_true=test.target, y_pred=y_pred, average='macro'), 2)
f1_an = round(sklearn.metrics.f1_score(y_true=test.target, y_pred=y_pred, average='macro'), 2)
print(
    'accuracy =', acc_an,
    '\nprecision =', prec_an,
    '\nrecall =', rec_an,
    '\nf1 =', f1_an,
)

accuracy = 0.813595326606479 
precision = 0.84 
recall = 0.8 
f1 = 0.79


In [63]:
# All-category comparison table: one row per model, one column per metric.
dffa = pd.DataFrame(
    {
        "Accuracy": [acc_a, acc_an],
        "Precision": [prec_a, prec_an],
        "recall": [rec_a, rec_an],
        "F1-Score": [f1_a, f1_an],
    },
    index=["Logistic Regression", "Naive Bayes"],
)

In [64]:
# Display the all-category comparison table.
dffa

Unnamed: 0,Accuracy,Precision,recall,F1-Score
Logistic Regression,0.845459,0.85,0.84,0.84
Naive Bayes,0.813595,0.84,0.8,0.79


In [65]:
# Logistic Regression scores higher than Naive Bayes on every metric in the table above.
print("For above we can see logistic regression is better")

For above we can see logistic regression is better
