# TF-IDF

IMPORTING LIBRARIES

In [10]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV , train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , confusion_matrix

In [2]:
df = pd.read_csv('/Users/sarthaksharna/Text_Classification/data/cleaned_bbc_text')

df.head()

Unnamed: 0,category,text
0,4,tv future hand viewer home theatre system plas...
1,0,worldcom bos leave book alone former worldcom ...
2,3,tiger wary farrell gamble leicester say rush m...
3,3,yeading face newcastle fa cup premiership side...
4,1,ocean twelve raid box office ocean twelve crim...


TRAIN - TEST SPLIT

In [3]:
X = df['text']

y = df['category']

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

X_train.shape , X_test.shape , y_train.shape , y_test.shape

((1694,), (424,), (1694,), (424,))

TRAINING MODELS IN PIPELINE

In [4]:
models = {

    'Multinomial Naive Bayes' : MultinomialNB() , 
    'Logistic Regression' : LogisticRegression() ,
    'Support Vector Machine' : SVC() ,
    'Random Forest' : RandomForestClassifier()
    
}

In [9]:
for model_name , clf in models.items() :

    pipe = Pipeline(
        [
            ('tfidf' , TfidfVectorizer(ngram_range = (1 , 2))) , 
            ('classifier' , clf)
        ]
    )

    pipe.fit(X_train , y_train)

    y_pred_train = pipe.predict(X_train)
    y_pred_test = pipe.predict(X_test)

    print(f'Model : {model_name} \n ')

    print(f'Train Accuracy : {accuracy_score(y_train , y_pred_train)}')
    print(f'Train Precision : {precision_score(y_train , y_pred_train , average = "macro")}')
    print(f'Train Recall : {recall_score(y_train , y_pred_train , average = "macro")}')
    print(f'Train F1-Score : {f1_score(y_train , y_pred_train , average = "macro")}')

    print('\n')

    
    print(f'Test Accuracy : {accuracy_score(y_test , y_pred_test)}')
    print(f'Test Precision : {precision_score(y_test , y_pred_test , average = "macro")}')
    print(f'Test Recall : {recall_score(y_test , y_pred_test , average = "macro")}')
    print(f'Test F1-Score : {f1_score(y_test , y_pred_test , average = "macro")}' , '\n')

    print(f'Confusion Matrix : \n {confusion_matrix(y_test , y_pred_test)}')

    print('\n')

    print('=='*50)

    print('\n')

Model : Multinomial Naive Bayes 
 
Train Accuracy : 0.9952774498229043
Train Precision : 0.9956289139061308
Train Recall : 0.9946420251554065
Train F1-Score : 0.9951202681062995


Test Accuracy : 0.9575471698113207
Test Precision : 0.960733851162894
Test Recall : 0.9543552804721189
Test F1-Score : 0.9566028620157347 

Confusion Matrix : 
 [[95  0  2  0  0]
 [ 1 76  3  1  0]
 [ 4  0 74  0  0]
 [ 0  0  0 94  0]
 [ 1  0  2  4 67]]




Model : Logistic Regression 
 
Train Accuracy : 0.9982290436835891
Train Precision : 0.9979463052054076
Train Recall : 0.9981457358904606
Train F1-Score : 0.9980451171527548


Test Accuracy : 0.964622641509434
Test Precision : 0.9662161423914
Test Recall : 0.9630092688511933
Test F1-Score : 0.9643583268913443 

Confusion Matrix : 
 [[94  0  3  0  0]
 [ 1 77  2  1  0]
 [ 4  0 73  0  1]
 [ 0  0  0 94  0]
 [ 1  0  0  2 71]]




Model : Support Vector Machine 
 
Train Accuracy : 1.0
Train Precision : 1.0
Train Recall : 1.0
Train F1-Score : 1.0


Test Accuracy : 

PIPELINE FOR HYPERPARAMETER TUNING 

In [11]:
pipelines = {

    'SVC': Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('classifier', SVC())
    ]),

    'RandomForestClassifier' : Pipeline([
        ('tfidf' , TfidfVectorizer(ngram_range = (1 , 2))) ,
        ('classifier' , RandomForestClassifier())
    ]),

    'LogisticRegression' : Pipeline([
        ('tfidf' , TfidfVectorizer(ngram_range = (1 , 2))) ,
        ('classifier' , LogisticRegression())
    ])

}


param_grid = {
    'SVC': {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf']
    },

    'RandomForestClassifier': {
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    },

    'LogisticRegression': {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2']
    }
}


In [13]:
results = []


for name in pipelines.keys() :
    print(f'Tuning {name} .... !! ')

    randomcv = RandomizedSearchCV(
        estimator = pipelines[name],
        param_distributions = param_grid[name] , 
        n_iter = 10 ,
        cv = 5 , 
        scoring = 'accuracy' ,
        n_jobs = -1
    )

    randomcv.fit(X_train , y_train)

    y_pred_tuned = randomcv.predict(X_test)

    results.append(
        {
        'Model': name,
        'Best Params': randomcv.best_params_,
        'Best CV Score': randomcv.best_score_,
        'Test Accuracy': accuracy_score(y_test, y_pred_tuned),
        'Test Precision': precision_score(y_test, y_pred_tuned, average='weighted'),
        'Test Recall': recall_score(y_test, y_pred_tuned, average='weighted'),
        'Test F1 Score': f1_score(y_test, y_pred_tuned, average='weighted')
        }
    )

    print(f" Best params for {name}: {randomcv.best_params_}")
    
    print(f" Test Accuracy: {accuracy_score(y_test, y_pred_tuned):.4f}")

    print('\n')
    print("-" * 60)


Tuning SVC .... !! 
 Best params for SVC: {'classifier__kernel': 'linear', 'classifier__C': 10}
 Test Accuracy: 0.9835


------------------------------------------------------------
Tuning RandomForestClassifier .... !! 
 Best params for RandomForestClassifier: {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 2, 'classifier__max_depth': None}
 Test Accuracy: 0.9623


------------------------------------------------------------
Tuning LogisticRegression .... !! 


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Library

 Best params for LogisticRegression: {'classifier__penalty': 'l2', 'classifier__C': 100}
 Test Accuracy: 0.9764


------------------------------------------------------------


RESULTS AS A DATAFRAME

In [14]:
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Model,Best Params,Best CV Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,SVC,"{'classifier__kernel': 'linear', 'classifier__...",0.976984,0.983491,0.983522,0.983491,0.983491
1,RandomForestClassifier,"{'classifier__n_estimators': 1000, 'classifier...",0.944518,0.962264,0.964292,0.962264,0.962247
2,LogisticRegression,"{'classifier__penalty': 'l2', 'classifier__C':...",0.976988,0.976415,0.976482,0.976415,0.976435


THEREFORE BEST MODEL :

--> SVC

<!-- -- -->

FINAL PIPELINE WITH BEST MODEL

In [17]:
# Build final pipeline

final_pipeline = Pipeline([
    
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),

    ('classifier', SVC(C = 10 , kernel = 'linear'))
       
    
])

# Fit on training data
final_pipeline.fit(X_train, y_train)

In [19]:
import joblib
joblib.dump(final_pipeline, 'tfidf_svc_bbc_classifier.pkl')

['tfidf_svc_bbc_classifier.pkl']