In [1]:
import pandas as pd
import nltk
import re

# Loading the dataset

In [2]:
# Read the tab-separated training split (label first, document text second)
# and discard rows with missing values in a single expression.
df_train = pd.read_csv(
    './datasets/r8-train-all-terms.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
).dropna()
df_train.head()

Unnamed: 0,category,text
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [3]:
# Separate the label column from the document text.
y_train = df_train['category']
text_train = df_train.drop(columns=['category'])  # remains a DataFrame, not a Series

text_train, y_train

(                                                   text
 0     champion products ch approves stock split cham...
 1     computer terminal systems cpml completes sale ...
 2     cobanco inc cbco year net shr cts vs dlrs net ...
 3     am international inc am nd qtr jan oper shr lo...
 4     brown forman inc bfd th qtr net shr one dlr vs...
 ...                                                 ...
 5480  kelly oil and gas partners kly year dec shr ct...
 5481  japan seeks to strengthen paris currency accor...
 5482  tcw convertible securities cvt sets dividend t...
 5483  south korean won fixed at month high the bank ...
 5484  australian unions launch new south wales strik...
 
 [5485 rows x 1 columns],
 0           earn
 1            acq
 2           earn
 3           earn
 4           earn
           ...   
 5480        earn
 5481    money-fx
 5482        earn
 5483    money-fx
 5484        ship
 Name: category, Length: 5485, dtype: object)

In [4]:
# Number of training documents per category label.
y_train.value_counts()

earn        2840
acq         1596
crude        253
trade        251
money-fx     206
interest     190
ship         108
grain         41
Name: category, dtype: int64

In [5]:
# Read the tab-separated test split with the same two-column layout
# (label first, document text second).
df_test = pd.read_csv(
    'datasets/r8-test-all-terms.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
)
y_test = df_test['category']
text_test = df_test.drop(columns=['category'])  # remains a DataFrame, not a Series

y_test.value_counts()

earn        1083
acq          696
crude        121
money-fx      87
interest      81
trade         75
ship          36
grain         10
Name: category, dtype: int64

# Pre-processing the data

In [6]:
# Tokenizing, stop-words removal and lemmatization
from nltk.stem import WordNetLemmatizer

def text_preprocess(articles: pd.DataFrame) -> list:
    """Clean, tokenize and lemmatize every document in ``articles['text']``.

    Each document goes through these steps: every character outside
    ``a-z``/``A-Z`` is replaced by a space, the text is lower-cased and
    tokenized with ``nltk.word_tokenize``, English stop words are removed,
    and the remaining tokens are lemmatized and joined with single spaces.

    Args:
        articles: Frame with a ``'text'`` column of strings. Rows are read in
            positional order, so the index does not have to be ``0..n-1``.

    Returns:
        A list holding one cleaned string per row, in row order.
    """
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once up front: it does not change between
    # tokens, and a set gives constant-time membership tests.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    corpus = []
    # Iterate over the column values instead of looking rows up by label
    # (``articles['text'][i]``), which raises KeyError when the index has
    # gaps, e.g. right after ``dropna``.
    for raw_text in articles['text']:
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)
        tokens = nltk.word_tokenize(letters_only.lower())
        lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        corpus.append(' '.join(lemmas))
    return corpus

In [7]:
# Build a re-indexed copy of the training text; reset_index() returns a new
# frame and keeps the previous index labels in an 'index' column.
articles_train = text_train.reset_index()
articles_train

Unnamed: 0,index,text
0,0,champion products ch approves stock split cham...
1,1,computer terminal systems cpml completes sale ...
2,2,cobanco inc cbco year net shr cts vs dlrs net ...
3,3,am international inc am nd qtr jan oper shr lo...
4,4,brown forman inc bfd th qtr net shr one dlr vs...
...,...,...
5480,5480,kelly oil and gas partners kly year dec shr ct...
5481,5481,japan seeks to strengthen paris currency accor...
5482,5482,tcw convertible securities cvt sets dividend t...
5483,5483,south korean won fixed at month high the bank ...


In [8]:
# Build a re-indexed copy of the test text; reset_index() returns a new
# frame and keeps the previous index labels in an 'index' column.
articles_test = text_test.reset_index()
articles_test

Unnamed: 0,index,text
0,0,asian exporters fear damage from u s japan rif...
1,1,china daily says vermin eat pct grain stocks a...
2,2,australian foreign ship ban ends but nsw ports...
3,3,sumitomo bank aims at quick recovery from merg...
4,4,amatil proposes two for five bonus share issue...
...,...,...
2184,2184,balladur insists on maintenance of louvre acco...
2185,2185,philippine trade gap widens in january august ...
2186,2186,iran soviet union to swap crude refined produc...
2187,2187,n z s chase corp makes offer for entregrowth c...


In [9]:
# Clean every training document; X_train is a list of strings, one per row.
X_train = text_preprocess(articles_train)
# Show the first cleaned document as a sanity check.
X_train[0]

'champion product ch approves stock split champion product inc said board director approved two one stock split common share shareholder record april company also said board voted recommend shareholder annual meeting april increase authorized capital stock five mln mln share reuter'

In [10]:
# Clean every test document with the same preprocessing as the training set.
X_test = text_preprocess(articles_test)
# Show the first cleaned document as a sanity check.
X_test[0]



# TW-IDF Model

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from pprint import pprint

from gowpy.feature_extraction.gow import TwidfVectorizer
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score, f1_score

In [12]:
from sklearn.metrics import matthews_corrcoef, make_scorer
# Wrap the Matthews correlation coefficient as a scorer object so it can be
# passed as `scoring=` to the hyperparameter searches.
scorer_mcc = make_scorer(matthews_corrcoef)

### Hyperparameter tuning and cross-validation score

In [13]:
# Two-step model: TW-IDF vectorizer followed by a support vector classifier.
pipeline = Pipeline(steps=[
    ('gow', TwidfVectorizer()),
    ('svc', SVC()),
])

# Hyperparameter search space; keys follow the '<step name>__<parameter>' convention.
parameters = {
    # --- 'gow' step (TwidfVectorizer) ---
    'gow__directed': [True, False],
    'gow__window_size': [2, 4, 8, 16],
    'gow__b': [0.0, 0.003],
    'gow__term_weighting': ['degree', 'pagerank'],
    'gow__min_df': [0, 5, 10],
    'gow__max_df': [0.8, 0.9, 1.0],
    # --- 'svc' step (SVC) ---
    'svc__C': [1, 10, 100],
    'svc__kernel': ['linear', 'rbf'],
    'svc__shrinking': [True, False],
    # NOTE(review): the searches score via predict(); toggling `probability`
    # presumably only changes fit cost, not the score -- confirm before keeping it.
    'svc__probability': [True, False],
    'svc__class_weight': ['balanced'],
}

In [61]:
# Successive-halving random search over `parameters`, scored with MCC under
# 10-fold cross-validation. `random_state` is fixed so that both the sampled
# candidates and the per-iteration data subsampling are reproducible.
random_search = HalvingRandomSearchCV(
    pipeline,
    parameters,
    cv=10,
    scoring=scorer_mcc,
    n_jobs=-1,
    verbose=10,
    min_resources='smallest',
    factor=3,
    random_state=42,
)

print("Performing random search...")
print("pipeline:", [name for name, _ in pipeline.steps])
pprint(parameters)

random_search.fit(X_train, y_train)

# Report the winning configuration, restricted to the parameters that were searched.
print("Best score: %0.3f" % random_search.best_score_)
print("Best parameters set:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing random search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [2, 3, 4, 6, 8],
 'svc__C': [0.1, 1, 10, 100, 1000],
 'svc__class_weight': [None, 'balanced'],
 'svc__degree': [0, 1, 2, 3, 4, 5],
 'svc__gamma': ['scale', 'auto'],
 'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
 'svc__probability': [True, False],
 'svc__shrinking': [True, False]}
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 160
max_resources_: 5485
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 34
n_resources: 160
Fitting 10 folds for each of 34 candidates, totalling 340 fits
----------
iter: 1
n_candidates: 12
n_resources: 480
Fitting 10 folds for each of 12 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 4
n_resources: 1440
Fitting 10 folds for each of 4 candida

In [71]:
# Successive-halving exhaustive search over `parameters`, scored with MCC
# under 3-fold cross-validation. `random_state` is fixed so that the data
# subsampling done at each halving iteration is reproducible.
grid_search = HalvingGridSearchCV(
    pipeline,
    parameters,
    cv=3,
    scoring=scorer_mcc,
    n_jobs=-1,
    verbose=10,
    random_state=42,
)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
pprint(parameters)

grid_search.fit(X_train, y_train)

# Report the winning configuration, restricted to the parameters that were searched.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [2, 3, 4, 6],
 'svc__C': [1, 10, 100],
 'svc__class_weight': ['balanced'],
 'svc__kernel': ['linear', 'rbf'],
 'svc__probability': [True, False],
 'svc__shrinking': [True, False]}
n_iterations: 5
n_required_iterations: 9
n_possible_iterations: 5
min_resources_: 48
max_resources_: 5485
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6912
n_resources: 48
Fitting 3 folds for each of 6912 candidates, totalling 20736 fits
----------
iter: 1
n_candidates: 2304
n_resources: 144
Fitting 3 folds for each of 2304 candidates, totalling 6912 fits
----------
iter: 2
n_candidates: 768
n_resources: 432
Fitting 3 folds for each of 768 candidates, totalling 2304 fits
----------
iter: 3
n_candidates: 256
n_resources: 1296
Fitting 3 folds for ea

### Fitting the TW-IDF model and evaluating on test data

In [14]:
# Final model: hyperparameters are written out by hand here rather than read
# from a search object.
pipeline_gow = Pipeline([
    (
        'gow',
        TwidfVectorizer(
            b=0.003,
            directed=False,
            max_df=1.0,
            min_df=5,
            term_weighting='degree',
            window_size=4,
        ),
    ),
    (
        'svc',
        SVC(
            C=10,
            gamma='scale',
            kernel='linear',
            class_weight='balanced',
            probability=True,
            shrinking=False,
        ),
    ),
])

# Train on the preprocessed training corpus; the fitted pipeline is the cell's displayed value.
pipeline_gow.fit(X_train, y_train)

Pipeline(steps=[('gow',
                 TwidfVectorizer(b=0.003, directed=False, min_df=5,
                                 tokenizer=<function default_tokenizer at 0x000001E517C0ACA0>)),
                ('svc',
                 SVC(C=10, class_weight='balanced', kernel='linear',
                     probability=True, shrinking=False))])

In [15]:
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline_gow.predict(X_test)

# Per-class precision / recall / F1 table first, then the headline numbers.
print(classification_report(y_test, y_pred))
summary_metrics = (
    ('\nAccuracy: ', accuracy_score(y_test, y_pred)),
    ('MCC: ', matthews_corrcoef(y_test, y_pred)),
    ('f1-micro: ', f1_score(y_test, y_pred, average='micro')),
)
for metric_label, metric_value in summary_metrics:
    print(metric_label, metric_value)

              precision    recall  f1-score   support

         acq       0.97      0.98      0.97       696
       crude       0.96      0.94      0.95       121
        earn       0.99      0.99      0.99      1083
       grain       1.00      0.90      0.95        10
    interest       0.92      0.85      0.88        81
    money-fx       0.88      0.83      0.85        87
        ship       0.94      0.83      0.88        36
       trade       0.93      0.99      0.95        75

    accuracy                           0.97      2189
   macro avg       0.95      0.91      0.93      2189
weighted avg       0.97      0.97      0.97      2189


Accuracy:  0.9725902238465053
MCC:  0.9575397160143421
f1-micro:  0.9725902238465053
