In [1]:
import re
import time

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import  CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook

  return f(*args, **kwds)


In [2]:
# Load the training and testing splits from the shared Data directory.
train_dataframe, test_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/training.csv', '../Data/testing.csv')
)

### Steps followed
First, we clean the text data. Next, we convert the cleaned text into TF-IDF vectors, which serve as the input to the classifier. Finally, we search for the best hyperparameters for the model.

In [3]:
# Keep only the label ('Category') and text ('Article') columns.
# .copy() makes each frame an independent object, so assigning a cleaned
# 'Article' column to it later cannot trigger pandas' SettingWithCopyWarning.
train_dataframe = train_dataframe[['Category', 'Article']].copy()
test_dataframe = test_dataframe[['Category', 'Article']].copy()

In [4]:
# Preview the first four raw training rows.
train_dataframe.head(4)

Unnamed: 0,Category,Article
0,Entertainment,"A dash of stand up comedy, lots of improvisat..."
1,Entertainment,Actor Tamannaah Bhatia is excited to have sig...
2,Entertainment,They say you never forget your first. Tri-city...
3,Entertainment,It was in the year 1968 when the Beatles were...


### Cleaning Text

In [5]:
# Shared NLTK resources used by the text-cleaning function.
stop_words = set(stopwords.words('english'))  # kept as a set for membership tests
lemmatizer = WordNetLemmatizer()  # used to lemmatize each retained token

In [7]:
def TokenizeText(text):
    '''
    Clean one article and return it as a single space-separated string.

    The input is cast to ``str``, lower-cased, stripped of every character
    that is not an ASCII letter, digit or whitespace, split with
    ``word_tokenize``, filtered against the module-level ``stop_words`` set,
    and each surviving token is passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text : object
        Raw article text. ``None`` and float NaN (a missing cell) yield an
        empty string instead of the literal tokens "none" / "nan".

    Returns
    -------
    str
        The lemmatized, stop-word-free tokens joined by single spaces.
    '''
    # Missing values would otherwise be stringified and pollute the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

In [8]:
%%time
train_dataframe.Article = train_dataframe.Article.apply(TokenizeText)

CPU times: user 6.71 s, sys: 68.7 ms, total: 6.78 s
Wall time: 7.18 s


In [9]:
# Apply the same cleaning to the test articles.
test_dataframe['Article'] = test_dataframe['Article'].apply(TokenizeText)

In [10]:
# Spot-check the cleaned text of the first two training rows.
train_dataframe.head(2)

Unnamed: 0,Category,Article
0,Entertainment,dash stand comedy lot improvisation ton audien...
1,Entertainment,actor tamannaah bhatia excited signed acclaime...


## Finding optimal TFIDF parameters

In [11]:
# Fit a default-configured vectorizer on the cleaned training articles;
# it is only used to inspect the size of the unigram vocabulary.
temp_tf = TfidfVectorizer()
temp_tf.fit(train_dataframe['Article'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [12]:
# Number of distinct terms learned by the default vectorizer.
len(temp_tf.vocabulary_)

37107

### We will now find the best parameters for Random Forest

In [16]:
# TF-IDF configuration used for the classifier features.
tfidfvectorizer = TfidfVectorizer(
    # --- input handling ---
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    # --- vocabulary construction ---
    ngram_range=(1, 2),
    max_df=0.5,
    min_df=1,
    max_features=40000,
    vocabulary=None,
    binary=False,
    # --- term weighting ---
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [58]:
## We combine the train and test data to obtain the merged articles, then fit the TF-IDF vectorizer configured above on those articles.

In [17]:
# Fit the vectorizer on the 'Article' column of train and test concatenated together.
# NOTE(review): this lets test-set vocabulary and document frequencies influence the
# features the model is trained on; consider fitting on train_dataframe only — confirm intent.
tfidfvectorizer.fit(pd.concat([train_dataframe,test_dataframe])['Article'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=40000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [18]:
# Labels: the 'Category' column of each split.
y_train = train_dataframe['Category'].values
y_test = test_dataframe['Category'].values

# Features: TF-IDF matrices produced by the fitted vectorizer.
x_train = tfidfvectorizer.transform(train_dataframe['Article'].values)
x_test = tfidfvectorizer.transform(test_dataframe['Article'].values)

In [19]:
# Check the shapes of the train and test feature matrices.
x_train.shape,x_test.shape

((1435, 40000), (354, 40000))

In [20]:
%%time
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

rfc=RandomForestClassifier(random_state=42)

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, verbose=1,n_jobs=-1,scoring='accuracy')
CV_rfc.fit(x_train, y_train)


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  2.8min finished


CPU times: user 1.46 s, sys: 84 ms, total: 1.54 s
Wall time: 2min 46s


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid

In [21]:
# Best mean cross-validated score (scoring='accuracy') found by the grid search.
CV_rfc.best_score_

0.7588850174216027

In [22]:
# Show the full configuration of the winning estimator.
print(
    "Best paramters for Random Forest is {}".format(
        str(CV_rfc.best_estimator_)
    )
)

Best paramters for Random Forest is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [23]:
# Use the best estimator found by the grid search as the final classifier.
rf_classifier=CV_rfc.best_estimator_

In [24]:
%%time

### Fit the selected classifier on the full training set.
# NOTE(review): if GridSearchCV ran with its default refit behaviour, best_estimator_
# is already fitted on x_train/y_train and this call is redundant — confirm.
rf_classifier.fit(x_train,y_train)

CPU times: user 820 ms, sys: 4.26 ms, total: 824 ms
Wall time: 847 ms


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [25]:
# Predict a category for every test article.
predicted=rf_classifier.predict(x_test)

### Metrics

In [26]:
## Accuracy of the predictions against the test labels
print(
    "Accuracy of random forest is {}".format(
        str(accuracy_score(y_test, predicted))
    )
)

Accuracy of random forest is 0.768361581920904


In [27]:
### Confusion matrix: rows are the actual categories, columns the predicted ones.
pd.crosstab(y_test, predicted, rownames=['Actual'], colnames=['Predicted'])

Predicted,Business & Economy,Education & Career,Entertainment,Food & Health,International,Others,Politics & Governance,Science & Technology,Sports
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Business & Economy,39,0,0,1,0,0,0,0,0
Education & Career,1,35,2,0,0,0,0,2,0
Entertainment,2,2,36,0,0,0,0,0,0
Food & Health,1,2,0,31,0,0,0,5,1
International,4,1,1,0,31,2,0,0,1
Others,8,2,1,4,1,14,7,1,2
Politics & Governance,5,0,0,0,4,1,24,0,0
Science & Technology,4,7,5,2,0,0,0,22,0
Sports,0,0,0,0,0,0,0,0,40


In [28]:
# Per-class metrics, requested as a dict (output_dict=True) so they can be tabulated.
report = classification_report(y_test, predicted, output_dict=True)

In [29]:
# Transpose so each row is a class / summary entry and each column a metric.
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
Business & Economy,0.609375,0.975,0.75,40.0
Education & Career,0.714286,0.875,0.786517,40.0
Entertainment,0.8,0.9,0.847059,40.0
Food & Health,0.815789,0.775,0.794872,40.0
International,0.861111,0.775,0.815789,40.0
Others,0.823529,0.35,0.491228,40.0
Politics & Governance,0.774194,0.705882,0.738462,34.0
Science & Technology,0.733333,0.55,0.628571,40.0
Sports,0.909091,1.0,0.952381,40.0
accuracy,0.768362,0.768362,0.768362,0.768362
