In [1]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [5]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
np.random.seed(500)

In [49]:
# train = pd.read_csv(r"train.csv",encoding='latin-1')
file_path = '/content/drive/My Drive/Colab Notebooks/Multi_Text_Classification_Model/train.csv'
train = pd.read_csv(file_path,encoding='latin-1')

In [14]:
train

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [15]:
# Dropping all the rows with empty entries in 'text'
train['text'].dropna(inplace=True)

In [16]:
# Converting all entries in 'text' to lowercase in case they are not
train['text'] = [entry.lower() for entry in train['text']]

In [17]:
# Tokenizing the sentences
train['text']= [word_tokenize(entry) for entry in train['text']]

In [18]:
train.head

In [22]:
# Tag-mapping

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [23]:
print(tag_map)
word_Lemmatized = WordNetLemmatizer()
train['text']

defaultdict(<function <lambda> at 0x7b971661e9e0>, {'J': 'a', 'V': 'v', 'R': 'r'})


0       [tv, future, in, the, hands, of, viewers, with...
1       [worldcom, boss, left, books, alone, former, w...
2       [tigers, wary, of, farrell, gamble, leicester,...
3       [yeading, face, newcastle, in, fa, cup, premie...
4       [ocean, s, twelve, raids, box, office, ocean, ...
                              ...                        
2220    [cars, pull, down, us, retail, figures, us, re...
2221    [kilroy, unveils, immigration, policy, ex-chat...
2222    [rem, announce, new, glasgow, concert, us, ban...
2223    [how, political, squabbles, snowball, it, s, b...
2224    [souness, delight, at, euro, progress, boss, g...
Name: text, Length: 2225, dtype: object

In [24]:
for index, entry in enumerate(train['text']):
    Final_words = []
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,pos = tag_map[tag[0]])
            Final_words.append(word_Final)
    train.loc[index,'final'] = str(Final_words)

In [25]:
train

Unnamed: 0,category,text,final
0,tech,"[tv, future, in, the, hands, of, viewers, with...","['tv', 'future', 'hand', 'viewer', 'home', 'th..."
1,business,"[worldcom, boss, left, books, alone, former, w...","['worldcom', 'bos', 'leave', 'book', 'alone', ..."
2,sport,"[tigers, wary, of, farrell, gamble, leicester,...","['tiger', 'wary', 'farrell', 'gamble', 'leices..."
3,sport,"[yeading, face, newcastle, in, fa, cup, premie...","['yeading', 'face', 'newcastle', 'fa', 'cup', ..."
4,entertainment,"[ocean, s, twelve, raids, box, office, ocean, ...","['ocean', 'twelve', 'raid', 'box', 'office', '..."
...,...,...,...
2220,business,"[cars, pull, down, us, retail, figures, us, re...","['car', 'pull', 'u', 'retail', 'figure', 'u', ..."
2221,politics,"[kilroy, unveils, immigration, policy, ex-chat...","['kilroy', 'unveils', 'immigration', 'policy',..."
2222,entertainment,"[rem, announce, new, glasgow, concert, us, ban...","['rem', 'announce', 'new', 'glasgow', 'concert..."
2223,politics,"[how, political, squabbles, snowball, it, s, b...","['political', 'squabble', 'snowball', 'become'..."


In [26]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(train['final'],train['category'],test_size=0.3)

### Encoding using TFIDF

In [27]:
# Encoder converts string-type labels into object-type
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [28]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(train['final'])

In [29]:
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

### Encoding using CountVectorizer

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
vectorizer = CountVectorizer(max_features=5000)

vectorizer.fit(train['final'])

In [32]:
# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)



In [33]:
# Encoding the Document
Train_X_CV = vectorizer.transform(Train_X)
Test_X_CV = vectorizer.transform(Test_X)


In [34]:
print("Encoded Documents are:")
print(Train_X_CV.toarray())
print(Test_X_CV.toarray())

Encoded Documents are:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Naive Bayes(TFIDF)

In [35]:
# Implementing the Multinomial Naive Bayes model
NB = naive_bayes.MultinomialNB()
NB.fit(Train_X_Tfidf,Train_Y)

In [36]:
# Predicting the labels on validation dataset
pred_NB = NB.predict(Test_X_Tfidf)

In [37]:
# Using accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(pred_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  97.30538922155688


## Naive Bayes(CountVectorizer)

In [38]:
# Implementing the Multinomial Naive Bayes model
NB = naive_bayes.MultinomialNB()
NB.fit(Train_X_CV,Train_Y)

In [39]:
# Predicting the labels on validation dataset
pred_NB = NB.predict(Test_X_Tfidf)

In [40]:
# Using accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(pred_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  97.75449101796407


## SVM(TFIDF)

In [41]:
#Implementing SVM and setting the parameters
SVM = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')

In [42]:
SVM.fit(Train_X_Tfidf, Train_Y)

In [43]:
pred_SVM = SVM.predict(Test_X_Tfidf)

In [44]:
print("SVM Accuracy Score -> ",accuracy_score(pred_SVM, Test_Y)*100)

SVM Accuracy Score ->  98.35329341317365


## SVM(CountVectorizer)

In [45]:
#Implementing SVM and setting the parameters
SVM = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')

In [46]:
SVM.fit(Train_X_CV, Train_Y)

In [47]:
pred_SVM = SVM.predict(Test_X_CV)

In [48]:
print("SVM Accuracy Score -> ",accuracy_score(pred_SVM, Test_Y)*100)

SVM Accuracy Score ->  97.0059880239521
