Step 1: Import the Required Libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
import nltk

# The notebook needs more than the POS tagger: word_tokenize relies on 'punkt',
# the stop-word filter on the 'stopwords' corpus, and WordNetLemmatizer on
# 'wordnet'. Fetch them up front so a fresh environment does not fail later
# with a LookupError. quiet=True keeps the cell output focused on the last call.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

# Tagger model used by nltk.pos_tag in the lemmatization step.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sharm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Step 2: Set random seed

In [3]:
# Seed NumPy's global random number generator with a fixed value.
SEED = 500
np.random.seed(SEED)

Step 3: Read Dataset

In [4]:
####### function for reading dataset 
import re , datetime
import sklearn.datasets as skd
import pandas as pd
import nltk
# import StanfordTokenizer() method from nltk 
#from nltk.tokenize.stanford import StanfordTokenizer 

def read_datasets(data_dir='C:\\Users\\sharm\\Documents\\Iswc2020Matcher\\News20\\data',
                  categories=None):
    """Load the 20 Newsgroups train and test splits for the chosen categories.

    Parameters
    ----------
    data_dir : str, optional
        Folder that contains the ``20news-bydate-train`` and
        ``20news-bydate-test`` sub-folders. Defaults to the location the
        notebook was originally written against; pass your own path to run
        it on another machine.
    categories : list of str, optional
        Newsgroup folder names to load. When omitted, the four groups used
        throughout this notebook are loaded.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Each frame has a ``data`` column (document text decoded as
        ISO-8859-1) and a ``target`` column (integer label assigned by
        ``load_files``).
    """
    import os  # local import keeps this cell self-contained

    if categories is None:
        categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

    train_bunch = skd.load_files(os.path.join(data_dir, '20news-bydate-train'),
                                 categories=categories, encoding='ISO-8859-1')
    test_bunch = skd.load_files(os.path.join(data_dir, '20news-bydate-test'),
                                categories=categories, encoding='ISO-8859-1')

    train = pd.DataFrame({'data': train_bunch.data, 'target': train_bunch.target})
    test = pd.DataFrame({'data': test_bunch.data, 'target': test_bunch.target})
    return train, test

In [5]:
# Load the train and test splits as DataFrames with 'data' and 'target' columns.
train,test = read_datasets()

Step 4: Preprocessing 

    Remove Blank rows in Data, if any 
    Change all the text to lower case  
    Word Tokenization   
    Remove Stop words
    Remove Non-alpha text
    Word Lemmatization

In [6]:
# Step - a : Remove blank rows if any.
# dropna must be applied to the DataFrame itself: calling it in place on the
# column Series does not remove rows from the frame. The index is reset so row
# labels stay 0..n-1 for any later positional/label-based access.
train = train.dropna(subset=['data']).reset_index(drop=True)
test = test.dropna(subset=['data']).reset_index(drop=True)

# Step - b : Change all the text to lower case, so that e.g. 'dog' and 'DOG'
#            are treated as the same token.
# Step - c : Tokenization : each document is broken into a list of word tokens.
train['data'] = [word_tokenize(entry.lower()) for entry in train['data']]
test['data'] = [word_tokenize(entry.lower()) for entry in test['data']]


In [7]:
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb,
# adjective or adverb. Any tag not listed below falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant setup: build the stop-word set and the lemmatizer once.
# Previously the stop-word list was rebuilt and scanned linearly for every token.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

train_text_final = []
for entry in train['data']:
    # pos_tag yields (word, tag) pairs; the first letter of the tag selects the
    # WordNet part of speech through tag_map.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # Stored as the string form of the list, as the vectorizer step expects text.
    train_text_final.append(str(Final_words))

# Assign the whole column at once instead of growing it row by row with .loc.
train['text_final'] = train_text_final

In [8]:
# Preview the first rows of the training frame, including the new 'text_final' column.
train.head()

Unnamed: 0,data,target,text_final
0,"[from, :, dpc47852, @, uxa.cso.uiuc.edu, (, da...",2,"['daniel', 'paul', 'checkman', 'subject', 'msg..."
1,"[from, :, yoo, @, engr.ucf.edu, (, hoi, yoo, )...",1,"['yoo', 'hoi', 'yoo', 'subject', 'look', 'usa'..."
2,"[from, :, fernandeza, @, merrimack.edu, subjec...",3,"['fernandeza', 'subject', 'arrogance', 'christ..."
3,"[from, :, mcelwre, @, cnsvax.uwec.edu, subject...",2,"['mcelwre', 'subject', 'natural', 'remedy', 'o..."
4,"[from, :, mathew, <, mathew, @, mantis.co.uk, ...",0,"['mathew', 'mathew', 'subject', 'inimitable', ..."


In [9]:
# Step - d (test split) : Remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb,
# adjective or adverb. Any tag not listed below falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant setup: build the stop-word set and the lemmatizer once.
# Previously the stop-word list was rebuilt and scanned linearly for every token.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

test_text_final = []
for entry in test['data']:
    # pos_tag yields (word, tag) pairs; the first letter of the tag selects the
    # WordNet part of speech through tag_map.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # Stored as the string form of the list, as the vectorizer step expects text.
    test_text_final.append(str(Final_words))

# Assign the whole column at once instead of growing it row by row with .loc.
test['text_final'] = test_text_final

In [10]:
# Preview the first rows of the test frame, including the new 'text_final' column.
test.head()

Unnamed: 0,data,target,text_final
0,"[from, :, koberg, @, spot.colorado.edu, (, all...",3,"['koberg', 'allen', 'koberg', 'subject', 'bibl..."
1,"[from, :, bobbe, @, vice.ico.tek.com, (, rober...",0,"['bobbe', 'robert', 'beauchaine', 'subject', '..."
2,"[from, :, joe, @, erix.ericsson.se, (, joe, ar...",3,"['joe', 'joe', 'armstrong', 'subject', 'angel'..."
3,"[from, :, sdl, @, linus.mitre.org, (, steven, ...",2,"['sdl', 'steven', 'litvintchouk', 'subject', '..."
4,"[from, :, kxgst1+, @, pitt.edu, (, kenneth, gi...",2,"['kenneth', 'gilbert', 'subject', 'pregnency',..."


Step 5: Prepare Train and Test Data sets

In [11]:
# Features are the processed text; labels are the newsgroup targets.
Train_X, Train_Y = train['text_final'], train['target']
Test_X, Test_Y = test['text_final'], test['target']

Step 6: Encoding

In [12]:
# Learn the label -> integer mapping from the training labels only.
Encoder = LabelEncoder()
Train_Yq = Encoder.fit_transform(Train_Y)
# Reuse the fitted mapping for the test labels. Refitting on the test split
# could assign different integers to the same class; transform() instead raises
# ValueError if the test split contains a label never seen in training.
Test_Yq = Encoder.transform(Test_Y)

Step 7: Word Vectorization

In [13]:
# Learn a vocabulary capped at 5000 features from the training text only,
# then project both splits into that same TF-IDF space.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(train['text_final'])
Train_X_Tfidf, Test_X_Tfidf = (
    Tfidf_vect.transform(split) for split in (Train_X, Test_X)
)

In [14]:
#print(Tfidf_vect.vocabulary_)

In [15]:
#print(Train_X_Tfidf)
#(0, 4978) 0.013641666546980962
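# Each printed entry has the form "(row, col)  score", where:
#1. row   : row number (document index) in 'Train_X_Tfidf'
#2. col   : unique integer index of the word in the TF-IDF vocabulary
#3. score : TF-IDF weight calculated by the vectorizer for that word in that document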

Step 8: Use the ML Algorithms to Predict the outcome

In [16]:
# Classifier - Algorithm - Naive Bayes
# Train a multinomial Naive Bayes model on the TF-IDF training matrix.
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# Predict labels for the test documents.
predictions_NB = Naive.predict(Test_X_Tfidf)

# Report accuracy as a percentage.
nb_accuracy_pct = accuracy_score(predictions_NB, Test_Y) * 100
print("Naive Bayes Accuracy Score -> ", nb_accuracy_pct)

Naive Bayes Accuracy Score ->  88.41544607190413


In [17]:
# Classifier - Algorithm - SVM
# Configure a support vector classifier with a linear kernel.
svc_params = dict(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM = svm.SVC(**svc_params)

# Train on the TF-IDF training matrix.
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict labels for the test documents.
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Report accuracy as a percentage.
svm_accuracy_pct = accuracy_score(predictions_SVM, Test_Y) * 100
print("SVM Accuracy Score -> ", svm_accuracy_pct)

SVM Accuracy Score ->  89.61384820239681


## Word2vec embeddings

In [21]:
from gensim.models import Word2Vec

# Tiny two-sentence toy corpus, already tokenized.
sentences = [
    ["cat", "say", "meow"],
    ["dog", "say", "woof"],
]

# Train a Word2Vec model on the toy corpus with min_count=1.
model = Word2Vec(sentences, min_count=1)

In [23]:
# Look up the learned embedding for the token 'cat' through the model's `wv` attribute.
vector = model.wv['cat']

In [24]:
# Display the embedding retrieved for 'cat'.
vector

array([ 2.6889052e-03, -3.2475619e-03,  4.4311364e-03,  4.7342293e-03,
        2.3851776e-03,  5.2145694e-04, -3.2955538e-03, -3.9260350e-03,
        1.0875356e-03, -1.6195885e-03, -1.6356009e-03, -3.9639357e-03,
        3.2271182e-06,  1.2043347e-03,  3.7724508e-03, -1.8608301e-04,
       -9.0082060e-04,  3.5956078e-03, -2.4978491e-03, -4.2991722e-03,
        5.2830845e-04, -2.4395101e-03, -4.4112029e-03,  9.1889451e-05,
        2.7890888e-03, -7.4143399e-04, -2.2253366e-03,  5.4945255e-04,
        4.2316802e-03,  1.1988498e-03, -2.7603793e-03,  3.6878963e-03,
       -4.2051692e-03, -3.0902606e-03,  1.2108495e-03,  4.3116491e-03,
        3.1942038e-03,  5.2693096e-04, -2.5778159e-03,  1.8131731e-03,
       -2.3540966e-03, -1.0483249e-03, -1.6874263e-04,  1.6546038e-03,
       -2.2740050e-03, -4.2057489e-03,  3.4335261e-04,  3.4583043e-03,
       -2.7334466e-04, -1.3224296e-03, -7.7152258e-04, -4.4255477e-04,
       -1.4621363e-03,  7.7722181e-04, -4.9434323e-03,  4.8588105e-03,
      