# Bag Of Words

IMPORTING LIBRARIES

In [11]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , roc_auc_score , confusion_matrix 

* 0 -> business

* 1 -> entertainment

* 2 -> politics

* 3 -> sport

* 4 -> tech


In [2]:
df = pd.read_csv('/Users/sarthaksharna/Text_Classification/data/cleaned_bbc_text')

df.head()

Unnamed: 0,category,text
0,4,tv future hand viewer home theatre system plas...
1,0,worldcom bos leave book alone former worldcom ...
2,3,tiger wary farrell gamble leicester say rush m...
3,3,yeading face newcastle fa cup premiership side...
4,1,ocean twelve raid box office ocean twelve crim...


TRAIN TEST SPLIT

In [3]:
X = df['text']

y = df['category']

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

In [4]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((1694,), (424,), (1694,), (424,))

TRAINING MODELS IN A PIPELINE

In [5]:
models = {
    'MultinomialNB' : MultinomialNB(),
    'LogisticRegression' : LogisticRegression(),
    'SVC' : SVC(),
    'RandomForestClassifier' : RandomForestClassifier()
}

In [8]:
for model_name , clf in models.items() :
    pipe = Pipeline(
        [
            ('bow' , CountVectorizer(ngram_range = (1,2))),
            ('classifier' , clf)
        ]
    )


    pipe.fit(X_train , y_train)

    y_pred_train = pipe.predict(X_train)
    y_pred_test = pipe.predict(X_test)

    print(f'{model_name} \n')

    print(f'Train Accuracy : {accuracy_score(y_train , y_pred_train)}')
    print(f'Train Precision : {precision_score(y_train , y_pred_train , average = "macro")}')
    print(f'Train Recall : {recall_score(y_train , y_pred_train , average = "macro")}')
    print(f'Train F1-Score : {f1_score(y_train , y_pred_train , average = "macro")}')

    
    print('\n')

    print(f'Test Accuracy : {accuracy_score(y_test , y_pred_test)}')
    print(f'Test Precision : {precision_score(y_test , y_pred_test , average = "macro")}')
    print(f'Test Recall : {recall_score(y_test , y_pred_test , average = "macro")}')
    print(f'Test F1-Score : {f1_score(y_test , y_pred_test , average = "macro")}' , '\n')

    print(f'Confusion Matrix : \n {confusion_matrix(y_test , y_pred_test)}')

    print('\n')
    print('=' * 50)
    print('\n')


        
    

MultinomialNB 

Train Accuracy : 0.999409681227863
Train Precision : 0.9993865030674847
Train Recall : 0.9995073891625615
Train F1-Score : 0.999446171213404


Test Accuracy : 0.9764150943396226
Test Precision : 0.9760236916310256
Test Recall : 0.9772500758755054
Test F1-Score : 0.9762445470882426 

Confusion Matrix : 
 [[92  0  4  0  1]
 [ 1 77  2  0  1]
 [ 1  0 77  0  0]
 [ 0  0  0 94  0]
 [ 0  0  0  0 74]]




LogisticRegression 

Train Accuracy : 1.0
Train Precision : 1.0
Train Recall : 1.0
Train F1-Score : 1.0


Test Accuracy : 0.9693396226415094
Test Precision : 0.9693971428925761
Test Recall : 0.9687269876273314
Test F1-Score : 0.9689767654384388 

Confusion Matrix : 
 [[93  0  4  0  0]
 [ 1 79  0  1  0]
 [ 2  2 72  1  1]
 [ 0  0  0 94  0]
 [ 0  0  0  1 73]]




SVC 

Train Accuracy : 0.9988193624557261
Train Precision : 0.9988944193061841
Train Recall : 0.9988920045471769
Train F1-Score : 0.9988926074870716


Test Accuracy : 0.9551886792452831
Test Precision : 0.9546125273327052

HYPERPARAMETER TUNING THE MODELS

In [None]:
pipelines = {
    'SVC': Pipeline([
        ('bow', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', SVC())
    ]),

    'LogisticRegression': Pipeline([
        ('bow', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', LogisticRegression())

    ]),

    'RandomForestClassifier' : Pipeline([
        ('bow' , CountVectorizer(ngram_range = (1,2))),
        ('classifier' , RandomForestClassifier())
    ])


}



# HYPERPARAMETERS

param_grids = {
    'SVC': {
        'classifier__C' :  [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__kernel' : ['linear', 'rbf']
    },

    'LogisticRegression' : {
        'classifier__C' : [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty' : ['l1' , 'l2' , 'elasticnet']

    },

    'RandomForestClassifier' : {
        'classifier__n_estimators' : [100 , 150 , 200],
        'classifier__max_depth' : [None , 10 , 20 , 30 , 40]
    }
}


# Storing results 

results = []

for name in pipelines.keys() :
    print(f'Tuning {name} .... !! ')

    randomcv = RandomizedSearchCV(
        estimator = pipelines[name],
        param_distributions = param_grids[name] , 
        n_iter = 10 ,
        cv = 5 , 
        scoring = 'accuracy' ,
        n_jobs = -1
    )

    randomcv.fit(X_train , y_train)

    y_pred_tuned = randomcv.predict(X_test)

    results.append(
        {
        'Model': name,
        'Best Params': randomcv.best_params_,
        'Best CV Score': randomcv.best_score_,
        'Test Accuracy': accuracy_score(y_test, y_pred_tuned),
        'Test Precision': precision_score(y_test, y_pred_tuned, average='weighted'),
        'Test Recall': recall_score(y_test, y_pred_tuned, average='weighted'),
        'Test F1 Score': f1_score(y_test, y_pred_tuned, average='weighted')
        }
    )

    print(f" Best params for {name}: {randomcv.best_params_}")
    
    print(f" Test Accuracy: {accuracy_score(y_test, y_pred_tuned):.4f}")

    print('\n')
    print("-" * 60)

STORING THE TUNED RESULTS IN A DATAFRAME

In [13]:
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Model,Best Params,Best CV Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,SVC,"{'classifier__kernel': 'linear', 'classifier__...",0.963994,0.974057,0.973907,0.974057,0.973928
1,LogisticRegression,"{'classifier__penalty': 'l2', 'classifier__C':...",0.966942,0.974057,0.973959,0.974057,0.973994
2,RandomForestClassifier,"{'classifier__n_estimators': 150, 'classifier_...",0.945107,0.957547,0.959931,0.957547,0.957647


THEREFORE BEST ACCURACY USING BOW :

--> MULTINOMIAL NAIVE BAYES