* 20 NEWSGROUP DATASET

### Algorithms Used:
* Naive Bayes 
* Linear SVC 
* Logistic Regression

### EXPLORING THE ABOVE CLASSIFIERS

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')
test  = fetch_20newsgroups(subset='test')

In [2]:
print(f"Number of Training Samples: {len(train['data'])}")
print(f"Number of Testing Samples: {len(test['data'])}")
print(40*'-')

for idx, label in enumerate(train['target_names']):
    print(f'Class {idx:2d} = {label}')

Number of Training Samples: 11314
Number of Testing Samples: 7532
----------------------------------------
Class  0 = alt.atheism
Class  1 = comp.graphics
Class  2 = comp.os.ms-windows.misc
Class  3 = comp.sys.ibm.pc.hardware
Class  4 = comp.sys.mac.hardware
Class  5 = comp.windows.x
Class  6 = misc.forsale
Class  7 = rec.autos
Class  8 = rec.motorcycles
Class  9 = rec.sport.baseball
Class 10 = rec.sport.hockey
Class 11 = sci.crypt
Class 12 = sci.electronics
Class 13 = sci.med
Class 14 = sci.space
Class 15 = soc.religion.christian
Class 16 = talk.politics.guns
Class 17 = talk.politics.mideast
Class 18 = talk.politics.misc
Class 19 = talk.religion.misc


In [3]:
# Classify text with Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create DTM
cv = CountVectorizer(stop_words='english')
train_dtm = cv.fit_transform(train['data'])
test_dtm = cv.transform(test['data'])

# Create Classifier
nb = MultinomialNB()
nb = nb.fit(train_dtm, train['target'])

# Predict and display score
predicted = nb.predict(test_dtm)
scr = 100.0 * nb.score(test_dtm, test['target'])
print(f'NB prediction accuracy = {scr:5.1f}%')

NB prediction accuracy =  80.2%


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create DTM
tf_cv = TfidfVectorizer(stop_words='english')
train_dtm_tf = tf_cv.fit_transform(train['data'])
test_dtm_tf = tf_cv.transform(test['data'])

nb = MultinomialNB()
nb = nb.fit(train_dtm_tf, train['target'])

predicted = nb.predict(test_dtm_tf)
scr = 100.0 * nb.score(test_dtm_tf, test['target'])
print(f'NB (TF-IDF with Stop Words) prediction accuracy = {scr:5.1f}%')

NB (TF-IDF with Stop Words) prediction accuracy =  81.7%


In [6]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C=1000,max_iter=2000)

svc = svc.fit(train_dtm_tf, train['target'])
predicted = svc.predict(test_dtm_tf)

scr = 100.0 * svc.score(test_dtm_tf, test['target'])
print(f'SVC (TF-IDF with Stop Words) prediction accuracy = {scr:5.1f}%')

SVC (TF-IDF with Stop Words) prediction accuracy =  83.1%


In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr = lr.fit(train_dtm_tf,train['target'])
predicted = lr.predict(test_dtm_tf)

score = 100 * lr.score(test_dtm_tf,test['target'])
print(f'LR (TF-IDF with Stop Words) prediction accuracy = {score:5.1f}%')

LR (TF-IDF with Stop Words) prediction accuracy =  83.2%


In [10]:
import numpy as np

In [11]:
#Printing top 5 words
all_words = np.array(tf_cv.get_feature_names())

for idx, target in enumerate(train['target_names']):
    top_word_index = np.argsort(nb.coef_[idx])[-5:]
    tn_lst = [word for word in all_words[top_word_index]]
    tn_lst.reverse()

    print(f'\n{target}:')
    print(tn_lst)


alt.atheism:
['keith', 'edu', 'god', 'caltech', 'atheists']

comp.graphics:
['graphics', 'edu', 'image', '3d', 'files']

comp.os.ms-windows.misc:
['windows', 'edu', 'file', 'dos', 'files']

comp.sys.ibm.pc.hardware:
['scsi', 'drive', 'ide', 'card', 'edu']

comp.sys.mac.hardware:
['mac', 'apple', 'edu', 'drive', 'quadra']

comp.windows.x:
['window', 'motif', 'mit', 'server', 'com']

misc.forsale:
['sale', 'edu', '00', 'offer', 'shipping']

rec.autos:
['car', 'com', 'cars', 'edu', 'engine']

rec.motorcycles:
['bike', 'com', 'dod', 'edu', 'ride']

rec.sport.baseball:
['edu', 'baseball', 'year', 'team', 'game']

rec.sport.hockey:
['hockey', 'team', 'game', 'ca', 'edu']

sci.crypt:
['key', 'clipper', 'encryption', 'chip', 'com']

sci.electronics:
['edu', 'com', 'use', 'lines', 'subject']

sci.med:
['pitt', 'edu', 'geb', 'banks', 'gordon']

sci.space:
['space', 'nasa', 'edu', 'henry', 'moon']

soc.religion.christian:
['god', 'jesus', 'christians', 'church', 'edu']

talk.politics.guns:
['gun

## ADDING NEW STOP WORDS AND THEN PREDICTING 

In [16]:
#get current stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#add com, edu and re to stop words
stop_words.extend(['com', 'edu', 're'])

# Create DTM, use custom defined stop words
tf_cv = TfidfVectorizer(stop_words=stop_words)
train_dtm_tf = tf_cv.fit_transform(train['data'])
test_dtm_tf = tf_cv.transform(test['data'])

nb = MultinomialNB()
nb = nb.fit(train_dtm_tf, train['target'])

predicted = nb.predict(test_dtm_tf)
scr = 100.0 * nb.score(test_dtm_tf, test['target'])
print(f'NB (TF-IDF with Stop Words) prediction accuracy = {scr:5.1f}%')

NB (TF-IDF with Stop Words) prediction accuracy =  81.1%


In [17]:
# Display top 5 important words
all_words = np.array(tf_cv.get_feature_names())

for idx, target in enumerate(train['target_names']):
    top_word_index = np.argsort(nb.coef_[idx])[-5:]
    tn_lst = [word for word in all_words[top_word_index]]
    tn_lst.reverse()

    print(f'\n{target}:')
    print(tn_lst)


alt.atheism:
['keith', 'god', 'caltech', 'atheists', 'livesey']

comp.graphics:
['graphics', 'image', '3d', 'files', 'lines']

comp.os.ms-windows.misc:
['windows', 'file', 'dos', 'files', 'driver']

comp.sys.ibm.pc.hardware:
['scsi', 'drive', 'ide', 'card', 'bus']

comp.sys.mac.hardware:
['mac', 'apple', 'drive', 'quadra', 'se']

comp.windows.x:
['window', 'motif', 'mit', 'server', 'widget']

misc.forsale:
['sale', '00', 'offer', 'shipping', 'new']

rec.autos:
['car', 'cars', 'engine', 'article', 'would']

rec.motorcycles:
['bike', 'dod', 'ride', 'bikes', 'motorcycle']

rec.sport.baseball:
['baseball', 'year', 'team', 'game', 'players']

rec.sport.hockey:
['hockey', 'team', 'game', 'ca', 'nhl']

sci.crypt:
['key', 'clipper', 'encryption', 'chip', 'keys']

sci.electronics:
['use', 'lines', 'one', 'subject', 'power']

sci.med:
['pitt', 'geb', 'banks', 'gordon', 'msg']

sci.space:
['space', 'nasa', 'henry', 'moon', 'alaska']

soc.religion.christian:
['god', 'jesus', 'christians', 'church

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000)

lr = lr.fit(train_dtm_tf, train['target'])
predicted = lr.predict(test_dtm_tf)

scr = 100.0 * lr.score(test_dtm_tf, test['target'])
print(f'LR (TF-IDF with Stop Words) prediction accuracy = {scr:5.1f}%')

LR (TF-IDF with Stop Words) prediction accuracy =  84.9%


In [19]:
for idx, target in enumerate(train['target_names']):
    top_word_index = np.argsort(lr.coef_[idx])[-5:]
    tn_lst = [word for word in all_words[top_word_index]]
    tn_lst.reverse()

    print(f'\n{target}:')
    print(tn_lst)


alt.atheism:
['atheism', 'keith', 'god', 'atheists', 'bible']

comp.graphics:
['graphics', '3d', 'image', 'images', '3do']

comp.os.ms-windows.misc:
['windows', 'cica', 'file', '13', 'driver']

comp.sys.ibm.pc.hardware:
['scsi', 'gateway', 'monitor', 'ide', 'pc']

comp.sys.mac.hardware:
['mac', 'apple', 'quadra', 'powerbook', 'duo']

comp.windows.x:
['motif', 'window', 'server', 'widget', 'x11r5']

misc.forsale:
['sale', 'shipping', 'wanted', 'distribution', 'forsale']

rec.autos:
['car', 'cars', 'engine', 'toyota', 'dealer']

rec.motorcycles:
['bike', 'dod', 'bikes', 'motorcycle', 'ride']

rec.sport.baseball:
['baseball', 'phillies', 'team', 'year', 'players']

rec.sport.hockey:
['hockey', 'nhl', 'team', 'game', 'season']

sci.crypt:
['clipper', 'key', 'encryption', 'chip', 'crypto']

sci.electronics:
['circuit', 'electronics', 'mhz', 'power', 'radar']

sci.med:
['doctor', 'msg', 'geb', 'disease', 'banks']

sci.space:
['space', 'orbit', 'moon', 'launch', 'dc']

soc.religion.christian

### N-GRAMS (For phrases)

In [20]:
my_text = 'This course introduces many concepts in data science.'

# Tokenize sentance
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,3))

# Analyze sentance
tk_func = cv.build_analyzer()

# Display n-grams
print(tk_func(my_text))

['this', 'course', 'introduces', 'many', 'concepts', 'in', 'data', 'science', 'this course', 'course introduces', 'introduces many', 'many concepts', 'concepts in', 'in data', 'data science', 'this course introduces', 'course introduces many', 'introduces many concepts', 'many concepts in', 'concepts in data', 'in data science']


In [21]:
# Tokenize sentence
cv = cv.fit([my_text])

# Sort tokens
import operator
my_voc = sorted(cv.vocabulary_.items(), key=operator.itemgetter(1))

# Display token mapping
print('Token mapping:')
print(40*'-')

for tokens, rank in my_voc:
    print(rank, tokens)

# Display new sentence
print(40*'-')
out_list = ['This course is data science!']

# Transform new sentence to original sentence DTM
xsm = cv.transform(out_list)
print(out_list)

# Display count vector indices for new sentance tokens
print(40*'-')
print(xsm.todense())

Token mapping:
----------------------------------------
0 concepts
1 concepts in
2 concepts in data
3 course
4 course introduces
5 course introduces many
6 data
7 data science
8 in
9 in data
10 in data science
11 introduces
12 introduces many
13 introduces many concepts
14 many
15 many concepts
16 many concepts in
17 science
18 this
19 this course
20 this course introduces
----------------------------------------
['This course is data science!']
----------------------------------------
[[0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0]]


### Classification using N-GRAMS

In [22]:
# Create DTM
tf_cv = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
train_dtm_tf = tf_cv.fit_transform(train['data'])
test_dtm_tf = tf_cv.transform(test['data'])

nb = MultinomialNB()
nb = nb.fit(train_dtm_tf, train['target'])

predicted = nb.predict(test_dtm_tf)
scr = 100.0 * nb.score(test_dtm_tf, test['target'])
print(f'NB (TF-IDF with ngram_range 1-2) prediction accuracy = {scr:5.1f}%')

NB (TF-IDF with ngram_range 1-2) prediction accuracy =  80.7%


## SENTIMENT ANALYSIS

In [25]:
import random
from sklearn.model_selection import train_test_split
from nltk.corpus import movie_reviews
#load movie reviews, each review is a list of words
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

print ("Number of Reviews:", len(documents))
#shuffle reviews to mix negative and positive reviews
#set random seed for reproducibility
random.seed(23)
random.shuffle(documents)

#list to store all review text
text_data = []
#label
label = []
for i in range(len(documents)):
    #join list of words to create a review and add to text_data
    text_data.append(' '.join(documents[i][0]))
    #map neg to 0, pos to 1 and add to label
    label.append(0 if documents[i][1]=='neg' else 1)

print("Number of Negative Reviews:", label.count(0))
print("Number of Positive Reviews:", label.count(1))    

#split to train and text
mvr_train, mvr_test, y_train, y_test = train_test_split(text_data, label, test_size=0.25, random_state=23)

#print one example review in the training text set
print("Sample Review:")
print('Label:', y_train[0])
print(mvr_train[0])

Number of Reviews: 2000
Number of Negative Reviews: 1000
Number of Positive Reviews: 1000
Sample Review:
Label: 0
not a great twelve months for either of the principals from this movie . earlier this year , nora ephron wrote and produced one of the year ' s least likeable " comedies " called hanging up , featuring a bunch of annoying women ( ironically , lisa kudrow played one in that film as well ) who barely have time to care about anyone but themselves . ick . . . real sweet stuff . but her little unsuccessful project was nothing compared to what john travolta went through earlier this summer , with a film entitled battlefield earth . i seemed to be the only person on this planet who somehow appreciated the film , even if it was on a " cheese " factor , as everybody , and i mean everybody else , pretty much classified the movie as one of the worst disasters of all time . yipes . . . another beauty . so what happens when you put these two people in the same room and come out with a m

In [26]:
# Naive Bayes pipeline to classify
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

tf_cv = TfidfVectorizer(stop_words='english')
train_dtm_tf = tf_cv.fit_transform(mvr_train)
test_dtm_tf = tf_cv.transform(mvr_test)

# Fit model, predict, and display results
nb = MultinomialNB()
nb = nb.fit(train_dtm_tf, y_train)
y_pred = nb.predict(test_dtm_tf)
scr = 100.0 * nb.score(test_dtm_tf, y_test)
print(f'NB (TF-IDF with stop words) prediction accuracy = {scr:5.1f}%')
print(metrics.classification_report(y_test, y_pred, target_names = ['Negative', 'Positive']))

NB (TF-IDF with stop words) prediction accuracy =  78.6%
              precision    recall  f1-score   support

    Negative       0.79      0.80      0.79       256
    Positive       0.79      0.77      0.78       244

    accuracy                           0.79       500
   macro avg       0.79      0.79      0.79       500
weighted avg       0.79      0.79      0.79       500



In [31]:
all_words = np.array(tf_cv.get_feature_names())

top_word_index = np.argsort(nb.coef_[0])[-20:]
tn_lst = [word for word in all_words[top_word_index]]
tn_lst.reverse()

print(f'\nTop 20 Words of Positive Reviews:')
print(tn_lst)


Top 20 Words of Positive Reviews:
['film', 'movie', 'like', 'story', 'life', 'good', 'just', 'time', 'character', 'characters', 'films', 'great', 'best', 'does', 'way', 'love', 'really', 'people', 'man', 'little']


In [32]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1000)

lr = lr.fit(train_dtm_tf, y_train)
predicted = lr.predict(test_dtm_tf)

scr = 100.0 * lr.score(test_dtm_tf, y_test)
print(f'LR (TF-IDF with Stop Words) prediction accuracy = {scr:5.1f}%')

LR (TF-IDF with Stop Words) prediction accuracy =  82.2%


In [33]:
top_word_index = np.argsort(lr.coef_[0])[-20:]
tn_lst = [word for word in all_words[top_word_index]]
tn_lst.reverse()

print(f'\nTop 20 Words of Positive Reviews:')
print(tn_lst)


Top 20 Words of Positive Reviews:
['great', 'fun', 'life', 'hilarious', 'memorable', 'overall', 'quite', 'different', 'good', 'terrific', 'especially', 'trek', 'works', 'seen', 'performances', 'perfect', 'perfectly', 'comic', 'town', 'gives']


In [34]:
#reverse label value so that negative reviews have label 1
y_train_reverse = [0 if y==1 else 1 for y in y_train]
lr = lr.fit(train_dtm_tf, y_train_reverse)

top_word_index = np.argsort(lr.coef_[0])[-20:]
tn_lst = [word for word in all_words[top_word_index]]
tn_lst.reverse()

print(f'\nTop 20 words of Negative Reviews:')
print(tn_lst)


Top 20 words of Negative Reviews:
['bad', 'worst', 'plot', 'supposed', 'unfortunately', 'harry', 'boring', 'script', 'stupid', 'reason', 'poor', 'awful', 'waste', 'cheap', 'attempt', 'jakob', 'dull', 'lame', 'looks', 'better']


## STEMMING

In [35]:
import string
import nltk
from nltk.stem.porter import PorterStemmer

# Define function to tokenize text and apply stemmer
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]

    stemmer = PorterStemmer()
    stems = map(stemmer.stem, tokens)
    return stems

# use custom tokenize when creating vectorizer
tf_cv = TfidfVectorizer(tokenizer=tokenize)
train_dtm_tf = tf_cv.fit_transform(mvr_train)
test_dtm_tf = tf_cv.transform(mvr_test)

lr = LogisticRegression(C=1000)

lr = lr.fit(train_dtm_tf, y_train)
predicted = lr.predict(test_dtm_tf)

scr = 100.0 * lr.score(test_dtm_tf, y_test)
print(f'LR (TF-IDF with Stemming) prediction accuracy = {scr:5.1f}%')

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Palla Anuraag Sharma/nltk_data'
    - 'C:\\Anaconda3\\nltk_data'
    - 'C:\\Anaconda3\\share\\nltk_data'
    - 'C:\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Palla Anuraag Sharma\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************
