In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import  CountVectorizer,TfidfTransformer,TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import svm
import time 
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report

In [2]:
train_dataframe = pd.read_csv('../Data/training.csv')
test_dataframe = pd.read_csv('../Data/testing.csv')

### Steps followed
Firstly, we will clean our data then we need to convert our cleaned text data into TFIDF vectors and then we will use those TFIDF vector as input to classifier. We will first find optimal parameters for TFIDF and then using those paramaters, we will find best hyperparameters for our model. 

In [3]:
train_dataframe=train_dataframe[['Category','Article']]
test_dataframe=test_dataframe[['Category','Article']]

In [4]:
train_dataframe.head(4)

Unnamed: 0,Category,Article
0,Entertainment,"A dash of stand up comedy, lots of improvisat..."
1,Entertainment,Actor Tamannaah Bhatia is excited to have sig...
2,Entertainment,They say you never forget your first. Tri-city...
3,Entertainment,It was in the year 1968 when the Beatles were...


### Cleaning Text

In [5]:
lemmatizer=WordNetLemmatizer() #For words Lemmatization
stop_words=set(stopwords.words('english'))

In [21]:
def TokenizeText(text):
    ''' 
     Tokenizes text by removing various stopwords and lemmatizing them
    '''
    text=str(text)
    text=text.lower()
    text=re.sub('[^A-Za-z0-9\s]+', '', text)
    word_list=word_tokenize(text)
    word_list_final=[]
    for word in word_list:
        if word not in stop_words:
            word_list_final.append(lemmatizer.lemmatize(word))
    return " ".join(word_list_final)

In [22]:
%%time
train_dataframe.Article = train_dataframe.Article.apply(TokenizeText)

CPU times: user 4.36 s, sys: 1.91 ms, total: 4.36 s
Wall time: 4.43 s


In [23]:
test_dataframe.Article = test_dataframe.Article.apply(TokenizeText)

In [24]:
train_dataframe.head(2)

Unnamed: 0,Category,Article
0,Entertainment,dash stand comedy lot improvisation ton audien...
1,Entertainment,actor tamannaah bhatia excited signed acclaime...


## Finding optimal TFIDF paramters

In [34]:
temp_tf= TfidfVectorizer()
temp_tf.fit(train_dataframe.Article)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [36]:
len(temp_tf.vocabulary_)

37107

In [37]:

pipe = Pipeline([
...     ('vect', TfidfVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', svm.SVC()),
... ])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (1000, 10000, 20000, 40000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
}

In [50]:
grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)

In [52]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe.steps])
print("parameters: {}".format(parameters))
t0 = time.time()
grid_search.fit(train_dataframe.Article,train_dataframe.Category)

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (1000, 10000, 20000, 40000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False)}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  6.0min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                     

In [53]:
grid_search.best_score_

0.6376306620209059

In [54]:
grid_search.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=40000,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
     

### Now, we have found the best parameters for the TFIDF. We will now  find best parameeters for SVM

In [57]:
tfidfvectorizer=grid_search.best_estimator_[0]

In [58]:
## we will combine train and test data to obtain merged Articles. Then we would fit above found TFIDf on given articles. 

In [62]:
tfidfvectorizer.fit(pd.concat([train_dataframe,test_dataframe])['Article'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=40000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [67]:
x_train=tfidfvectorizer.transform(train_dataframe['Article'].values)
x_test=tfidfvectorizer.transform(test_dataframe['Article'].values)
y_train=train_dataframe['Category'].values
y_test=test_dataframe['Category'].values

In [69]:
x_train.shape,x_test.shape

((1435, 40000), (354, 40000))

In [72]:
%%time
parameters = {'kernel':('linear', 'rbf'), 'C':[1,3,5,10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters,verbose=1,n_jobs=-1,scoring='accuracy')
clf.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   46.7s finished


CPU times: user 4.58 s, sys: 40.6 ms, total: 4.62 s
Wall time: 51.2 s


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1, 3, 5, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [73]:
clf.best_score_

0.8271777003484321

In [77]:
print ("Best paramters for SVM is {}".format(str(clf.best_estimator_)))

Best paramters for SVM is SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [75]:
svm_classifier=clf.best_estimator_

In [78]:
%%time

### fitting final
svm_classifier.fit(x_train,y_train)

CPU times: user 4.53 s, sys: 8.11 ms, total: 4.54 s
Wall time: 4.6 s


SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [94]:
predicted=svm_classifier.predict(x_test)

### Metrics

In [101]:
##Accuracy 
print ("Accuracy of svm is {}".format(str(accuracy_score(y_test,predicted))))

Accuracy of svm is 0.8135593220338984


In [102]:
### confusion matrix
pd.crosstab(y_test, predicted, rownames=['Actual'], colnames=['Predicted'])

Predicted,Business & Economy,Education & Career,Entertainment,Food & Health,International,Others,Politics & Governance,Science & Technology,Sports
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Business & Economy,34,0,0,1,0,1,1,3,0
Education & Career,0,34,2,0,0,1,0,3,0
Entertainment,0,1,36,0,0,2,0,1,0
Food & Health,1,1,0,28,0,4,0,5,1
International,2,0,2,0,33,2,0,1,0
Others,1,0,2,0,3,25,4,5,0
Politics & Governance,1,0,0,0,2,6,25,0,0
Science & Technology,0,3,1,2,1,0,0,33,0
Sports,0,0,0,0,0,0,0,0,40


In [96]:
report = classification_report(y_test, predicted, output_dict=True)

In [99]:
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
Business & Economy,0.871795,0.85,0.860759,40.0
Education & Career,0.871795,0.85,0.860759,40.0
Entertainment,0.837209,0.9,0.86747,40.0
Food & Health,0.903226,0.7,0.788732,40.0
International,0.846154,0.825,0.835443,40.0
Others,0.609756,0.625,0.617284,40.0
Politics & Governance,0.833333,0.735294,0.78125,34.0
Science & Technology,0.647059,0.825,0.725275,40.0
Sports,0.97561,1.0,0.987654,40.0
accuracy,0.813559,0.813559,0.813559,0.813559
