In [16]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import  CountVectorizer,TfidfTransformer,TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import svm
import time 
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report
from sklearn.naive_bayes import GaussianNB,MultinomialNB

In [2]:
train_dataframe = pd.read_csv('../Data/training.csv')
test_dataframe = pd.read_csv('../Data/testing.csv')

### Steps followed
Firstly, we will clean our data then we need to convert our cleaned text data into TFIDF vectors and then we will use those TFIDF vector as input to classifier. We will first find optimal parameters for TFIDF and then using those paramaters, we will find best hyperparameters for our model. 

In [3]:
train_dataframe=train_dataframe[['Category','Article']]
test_dataframe=test_dataframe[['Category','Article']]

In [4]:
train_dataframe.head(4)

Unnamed: 0,Category,Article
0,Entertainment,"A dash of stand up comedy, lots of improvisat..."
1,Entertainment,Actor Tamannaah Bhatia is excited to have sig...
2,Entertainment,They say you never forget your first. Tri-city...
3,Entertainment,It was in the year 1968 when the Beatles were...


### Cleaning Text

In [5]:
lemmatizer=WordNetLemmatizer() #For words Lemmatization
stop_words=set(stopwords.words('english'))

In [7]:
def TokenizeText(text):
    ''' 
     Tokenizes text by removing various stopwords and lemmatizing them
    '''
    text=str(text)
    text=text.lower()
    text=re.sub('[^A-Za-z0-9\s]+', '', text)
    word_list=word_tokenize(text)
    word_list_final=[]
    for word in word_list:
        if word not in stop_words:
            word_list_final.append(lemmatizer.lemmatize(word))
    return " ".join(word_list_final)

In [8]:
%%time
train_dataframe.Article = train_dataframe.Article.apply(TokenizeText)

CPU times: user 6.7 s, sys: 60.3 ms, total: 6.76 s
Wall time: 6.89 s


In [9]:
test_dataframe.Article = test_dataframe.Article.apply(TokenizeText)

In [10]:
train_dataframe.head(2)

Unnamed: 0,Category,Article
0,Entertainment,dash stand comedy lot improvisation ton audien...
1,Entertainment,actor tamannaah bhatia excited signed acclaime...


## Finding optimal TFIDF paramters

In [11]:
temp_tf= TfidfVectorizer()
temp_tf.fit(train_dataframe.Article)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [12]:
len(temp_tf.vocabulary_)

37107

In [17]:

pipe = Pipeline([
...     ('vect', TfidfVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', MultinomialNB()),
... ])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (1000, 10000, 20000, 40000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
}

In [18]:
grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)

In [19]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe.steps])
print("parameters: {}".format(parameters))
t0 = time.time()
grid_search.fit(train_dataframe.Article,train_dataframe.Category)

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (1000, 10000, 20000, 40000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False)}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.7min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                     

In [20]:
grid_search.best_score_

0.8069686411149826

In [21]:
grid_search.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.75,
                                 max_features=40000, min_df=1,
                                 ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=Fa

### Now, we have found the best parameters for the TFIDF. We will now  find best parameeters for Naivebayes

In [24]:
tfidfvectorizer=grid_search.best_estimator_[0]

In [58]:
## we will combine train and test data to obtain merged Articles. Then we would fit above found TFIDf on given articles. 

In [25]:
tfidfvectorizer.fit(pd.concat([train_dataframe,test_dataframe])['Article'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.75,
                max_features=40000, min_df=1, ngram_range=(1, 2), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [26]:
x_train=tfidfvectorizer.transform(train_dataframe['Article'].values)
x_test=tfidfvectorizer.transform(test_dataframe['Article'].values)
y_train=train_dataframe['Category'].values
y_test=test_dataframe['Category'].values

In [27]:
x_train.shape,x_test.shape

((1435, 40000), (354, 40000))

In [31]:
%%time
parameters = {'alpha':[0.1,0.5,1,1.5,2,5],'fit_prior':[True,False]}
bayes=MultinomialNB()
clf = GridSearchCV(bayes, parameters,verbose=1,n_jobs=-1,scoring='accuracy')
clf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits
CPU times: user 75.5 ms, sys: 7.43 ms, total: 82.9 ms
Wall time: 890 ms


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0.1, 0.5, 1, 1.5, 2, 5],
                         'fit_prior': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [32]:
clf.best_score_

0.8104529616724738

In [33]:
print ("Best paramters for naive bayes is {}".format(str(clf.best_estimator_)))

Best paramters for naive bayes is MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)


In [35]:
bayesian_classifier=clf.best_estimator_

In [36]:
%%time

### fitting final
bayesian_classifier.fit(x_train,y_train)

CPU times: user 34.2 ms, sys: 0 ns, total: 34.2 ms
Wall time: 73.4 ms


MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [39]:
predicted=bayesian_classifier.predict(x_test)

### Metrics

In [40]:
##Accuracy 
print ("Accuracy of Multinomial Naive bayes is {}".format(str(accuracy_score(y_test,predicted))))

Accuracy of Multinomial Naive bayes is 0.807909604519774


In [41]:
### confusion matrix
pd.crosstab(y_test, predicted, rownames=['Actual'], colnames=['Predicted'])

Predicted,Business & Economy,Education & Career,Entertainment,Food & Health,International,Others,Politics & Governance,Science & Technology,Sports
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Business & Economy,36,0,0,1,0,1,1,1,0
Education & Career,0,37,2,0,0,0,0,1,0
Entertainment,0,1,37,0,0,1,0,1,0
Food & Health,0,2,1,32,0,0,0,5,0
International,1,1,3,0,32,3,0,0,0
Others,1,4,5,1,3,17,6,3,0
Politics & Governance,2,1,0,0,1,3,27,0,0
Science & Technology,0,5,4,3,0,0,0,28,0
Sports,0,0,0,0,0,0,0,0,40


In [42]:
report = classification_report(y_test, predicted, output_dict=True)

In [43]:
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
Business & Economy,0.9,0.9,0.9,40.0
Education & Career,0.72549,0.925,0.813187,40.0
Entertainment,0.711538,0.925,0.804348,40.0
Food & Health,0.864865,0.8,0.831169,40.0
International,0.888889,0.8,0.842105,40.0
Others,0.68,0.425,0.523077,40.0
Politics & Governance,0.794118,0.794118,0.794118,34.0
Science & Technology,0.717949,0.7,0.708861,40.0
Sports,1.0,1.0,1.0,40.0
accuracy,0.80791,0.80791,0.80791,0.80791
