In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.externals import joblib
import os  
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer



In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import nltk 
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [185]:
!pip install cleantext

Collecting cleantext
  Downloading cleantext-1.1.3-py3-none-any.whl (3.7 kB)
Installing collected packages: cleantext
Successfully installed cleantext-1.1.3


# Fetching Data

In [100]:
# Initialize data
data = pd.read_csv('/content/drive/MyDrive/DocClass/new_extracted_Data.csv', encoding = 'latin-1')
data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,label
0,0,0.0,âeraax frat\n\nâINCOME TAX DEPARTMENTâ\n...,PAN
1,1,1.0,\n\nstaat | SAT HIN UHI\n\nââINCOMETAK DE...,PAN
2,2,2.0,ae -\n\nyh Â«idee et ly Howes\n\nCe ee ee\n\n:...,PAN
3,3,3.0,\n\nâ\n,PAN
4,4,4.0,| INCONETAX DEPARTMENT Â«A! GOVT. OF INDIAâ)...,PAN
...,...,...,...,...
811,381,,Transport Department Government of NCT of Delh...,DL
812,382,,\n\nnada)\na/oa01)\n\n \n\nPIRSA\n\n \n\n...,DL
813,383,,4 ladia Driving Licence | swidoadas\nz (9nK/20...,DL
814,384,,\n\n \n\nIssued On: 00872016\n\npets\n\nRTAHY...,DL


In [101]:
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [102]:
import string
# import cleantext

def clean_text(text):
    text = text.lower()
    # text = re.sub('\[.*?\]', '', text)
    # text = re.sub('https?://\S+|www\.\S+', '', text)
    # text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # text = re.sub('\n', '', text)
    text = re.sub("[^a-zA-Z0-9]+", " ", text)
    
    # sent = "Io andiamo to the beach with my amico."
    # text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
    text = text.split()
    text = [ps.stem(s) for s in text if s not in sw] # list comprehension
    text = " ".join(text)
    
    return text

In [103]:
# data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'],inplace=True)
data['text']=data['text'].apply(clean_text)
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,label
0,0,0.0,eraax frat incom tax depart sangaraj patilfren...,PAN
1,1,1.0,staat sat hin uhi incometak departn govt india...,PAN
2,2,2.0,ae yh ide et ly howesc ee ee seiabna awigastro...,PAN
3,3,3.0,,PAN
4,4,4.0,inconetax depart govt india vt r soniva wy br ...,PAN


In [104]:
(data['text'] == '').value_counts()

False    743
True      73
Name: text, dtype: int64

In [105]:
index_names = data[data['text'] == ''].index
data.drop(index_names, inplace = True)

In [106]:
(data['text'] == '').value_counts()

False    743
Name: text, dtype: int64

In [107]:
data['text'].isna().value_counts()

False    743
Name: text, dtype: int64

In [108]:
df = data.copy()

# Utility

In [109]:
def get_transformed_train_test_split(data, column_transformer, target_col = "label"):
  y = data.loc[:, target_col].values
  xtrain_, xtest_, ytrain, ytest = train_test_split(data['text'], y,random_state=42, test_size=0.2,stratify=y )
  xtrain = xtrain_.array.to_numpy().reshape(-1,1)
  xtest = xtest_.array.to_numpy().reshape(-1,1)
  xtrain = column_transformer.fit_transform(xtrain)
  xtest = column_transformer.transform(xtest)
  return xtrain_, xtest_, xtrain, xtest, ytrain, ytest

In [110]:
def get_classifier(classifier_instance,xtrain, ytrain, param_dict={}):
  classifier = classifier_instance(**param_dict)
  classifier.fit(xtrain, ytrain)
  return classifier

def evaluate_classifier(classifier, x, y):
  pred = classifier.predict(x)
  print('*'*20, 'Confusion Matrix', '*'*20)
  print(metrics.confusion_matrix(y, pred))
  print('*'*20, 'Classification Report', '*'*20)
  print(metrics.classification_report(y, pred))
  return pred

def evaluate_classifier_on_train_test(classifier, xtrain, ytrain, xtest, ytest):
  print('Evaluation on Train Data','-'*10, '>')
  preds_train = evaluate_classifier(classifier, xtrain, ytrain)
  print('Evaluation on Test Data','-'*10, '>')
  preds_test = evaluate_classifier(classifier, xtest, ytest)
  return preds_train, preds_test

def get_best_classifier(classifier_instance, hypertuner, xtrain, ytrain, classifier_param_dict={}, distributions={}, cv= 6, verbose=True):
  classifier = classifier_instance(**classifier_param_dict)
  tuned_classifier = hypertuner(classifier, distributions,cv=cv,n_jobs=-1,verbose=verbose)
  tuned_classifier.fit(xtrain, ytrain)
  return tuned_classifier

In [111]:
def predict(classifier, x):
  pred = classifier.predict(x)
  return pred

In [112]:
def save_model_and_predictions(dir_name, save_model=True, info_file=None, save_test_pred = True, save_train_pred=True, classifier=None,
                               x_train=None, x_test=None, y_train=None, y_test=None, y_pred_train=None, y_pred_test=None):
  try:
    os.makedirs(dir_name, exist_ok = True)
    print("Directory created successfully")
  except OSError as error:
    print("Directory already exists")
  if info_file:
    file = open(os.path.join(dir_name,"info.txt"), "w+") 
    file.write(info_file) 
    file.close() 
  if save_model:
    # Save the model as a pickle in a file
    joblib.dump(classifier, os.path.join(dir_name,'model.pkl'))
  if save_test_pred:
    df_test = pd.DataFrame({'text': x_test, 'original': y_test, 'predicted': y_pred_test})
    df_test.to_csv(os.path.join(dir_name, 'test.csv'))
  if save_train_pred:
    df_train = pd.DataFrame({'text': x_train, 'original': y_train, 'predicted': y_pred_train})
    df_train.to_csv(os.path.join(dir_name, 'train.csv'))

In [113]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
lb.fit(df['label'])
df['label'] = lb.transform(df['label'])
# save the instance of encoded labels
from sklearn.externals import joblib
 
# Save the model as a pickle in a file
joblib.dump(lb, './class_label_encoder.pkl')

['./class_label_encoder.pkl']

#Bert Embeddings

In [102]:
!pip install flair

Collecting flair
  Downloading flair-0.8.0.post1-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 5.3 MB/s 
Collecting sqlitedict>=1.6.0
  Downloading sqlitedict-1.7.0.tar.gz (28 kB)
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting torch<=1.7.1,>=1.5.0
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K
[?25hTraceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/cli/base_command.py", line 180, in _main
    status = self.run(options, args)
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/cli/req_command.py", line 199, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/commands/install.py", line 319, in run
    reqs, check_supported_wheels=not options.target_dir
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 128, in resolve
    requirement

In [34]:
import torch
import numpy as np
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin

In [35]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModel
from more_itertools import chunked

In [36]:
class FlairTransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=None, layers=None):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.model_kw_args = {'batch_size': batch_size, 'layers': layers}
        self.model_kw_args = {k: v for k, v in self.model_kw_args.items()
                              if v is not None}
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        model = TransformerDocumentEmbeddings(
                self.model_name, fine_tune=False,
                **self.model_kw_args)

        sentences = [Sentence(text) for text in X]
        embedded = model.embed(sentences)
        embedded = [e.get_embedding().reshape(1, -1) for e in embedded]
        return np.array(torch.cat(embedded).cpu())

In [37]:
class TransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=1, layer=-1):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.layer = layer
        self.batch_size = batch_size
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name)

        res = []
        for batch in chunked(X, self.batch_size):
            encoded_input = tokenizer.batch_encode_plus(
                batch, return_tensors='pt', padding=True, truncation=True)
            output = model(**encoded_input)
            embed = output.last_hidden_state[:,self.layer].detach().numpy()
            res.append(embed)

        return np.concatenate(res)


# Column Transformer

In [114]:
# column_trans = ColumnTransformer([
#     ('embedding', FlairTransformerEmbedding(), 'text') #column name if column given
# ])
# bert_trans = ColumnTransformer([
#     ('embedding', FlairTransformerEmbedding(), 0) #column name if column given
# ])
array_trans = ColumnTransformer(transformers=[
    ('tfidf', TfidfVectorizer(), 0)
])

In [115]:
filename = 'tfidf.sav'
joblib.dump(array_trans, filename)

['tfidf.sav']

In [116]:
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,label
0,0,0.0,eraax frat incom tax depart sangaraj patilfren...,4
1,1,1.0,staat sat hin uhi incometak departn govt india...,4
2,2,2.0,ae yh ide et ly howesc ee ee seiabna awigastro...,4
4,4,4.0,inconetax depart govt india vt r soniva wy br ...,4
5,5,5.0,74 peo et er jaan eefo ae ein ral oy lpf wae c...,4
...,...,...,...,...
810,380,,48 06 2013 17 06 2033 nt 006 2013 nitinkumar p...,0
811,381,,transport depart govern nct delhilic drive veh...,0
812,382,,nada oa01 pirsa od ke prelbk oo gx origin verifi,0
813,383,,4 ladia drive licenc swidoadasz 9nk 2010 2 vot...,0


# Train-test vectorized split

In [117]:
xtrain_tfidf_orig, xtest_tfidf_orig, xtrain_tfidf, xtest_tfidf, ytrain_tfidf, ytest_tfidf = get_transformed_train_test_split(data, array_trans)

In [22]:
xtrain_orig, xtest_orig,xtrain, xtest, ytrain, ytest = get_transformed_train_test_split(data, bert_trans)



Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: ignored

# Simple Logistic (87% accuracy)
 * date ->0.83
 * no offer ->0.89
 * offer ->0.96
 * price ->0.77
 * relevant info ->0.66

In [118]:
lr_classifier = get_classifier(LogisticRegression, xtrain_tfidf, ytrain_tfidf, {'random_state':13, 'verbose':1})
pred_train, pred_test = evaluate_classifier_on_train_test(lr_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  1   2   0  16   0   0   0   0   0]
 [  0 160   0   0   0   0   0   0   0]
 [  0   3   0   9   0   0   0   0   0]
 [  0   8   0 151   0   0   0   0   0]
 [  0   2   0   0  52   0   0   0   0]
 [  0   2   0   0   0  44   0   0   0]
 [  0   6   0   5   0   0  11   0   0]
 [  0   0   0   0   0   0   0  92   0]
 [  0   2   0  10   0   0   0   0  18]]
******************** Classification Report ********************
                  precision    recall  f1-score   support

              DL       1.00      0.05      0.10        19
        Invoices       0.86      1.00      0.93       160
       Member_Id       0.00      0.00      0.00        12
          Others       0.79      0.95      0.86       159
             PAN       1.00      0.96      0.98        54
      aadharcard       1.00      0.96      0.98        46
 cancelledcheque       1.00      0.50      0.67        22
dischargesummary      

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s finished
  _warn_prf(average, modifier, msg_start, len(result))


### Bert

In [None]:
xtrain_orig, xtest_orig,xtrain, xtest, ytrain, ytest
lr_classifier_bert = get_classifier(LogisticRegression, xtrain, ytrain, {'random_state':13, 'verbose':1})
pred_train, pred_test = evaluate_classifier_on_train_test(lr_classifier_bert, xtrain, ytrain, xtest, ytest)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  46    0    0    0    0]
 [   0 1205    4    0   16]
 [   0    3  531    0    0]
 [   0    0    0   73    0]
 [   1   48    0    0  296]]
******************** Classification Report ********************
               precision    recall  f1-score   support

         date       0.98      1.00      0.99        46
     no offer       0.96      0.98      0.97      1225
        offer       0.99      0.99      0.99       534
        price       1.00      1.00      1.00        73
relevant info       0.95      0.86      0.90       345

     accuracy                           0.97      2223
    macro avg       0.98      0.97      0.97      2223
 weighted avg       0.97      0.97      0.97      2223

Evaluation on Test Data ---------- >
******************** Confusion Matrix ********************
[[  4   1   0   0   0]
 [  1 124   3   0   9]
 [  1   1  58   0   0]
 [  0   2   1   5   0]
 [  1  17   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s finished


## Parameter Tuning

In [119]:
param_dict = {'C':20, 'solver':'liblinear', 'max_iter':400, 'class_weight':'balanced', 'penalty':'l1','verbose':2}
lr_classifier = get_classifier(LogisticRegression, xtrain_tfidf, ytrain_tfidf, param_dict)
pred_train, pred_test = evaluate_classifier_on_train_test(lr_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[LibLinear]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[ 19   0   0   0   0   0   0   0   0]
 [  0 159   0   1   0   0   0   0   0]
 [  0   0  12   0   0   0   0   0   0]
 [  0   2   0 156   0   0   1   0   0]
 [  0   0   0   0  54   0   0   0   0]
 [  0   0   0   0   0  46   0   0   0]
 [  0   0   0   0   0   0  22   0   0]
 [  0   0   0   0   0   0   0  92   0]
 [  0   0   0   0   0   0   0   0  30]]
******************** Classification Report ********************
                  precision    recall  f1-score   support

              DL       1.00      1.00      1.00        19
        Invoices       0.99      0.99      0.99       160
       Member_Id       1.00      1.00      1.00        12
          Others       0.99      0.98      0.99       159
             PAN       1.00      1.00      1.00        54
      aadharcard       1.00      1.00      1.00        46
 cancelledcheque       0.96      1.00      0.98        22
dischargesu



### Weighted LR

In [82]:
data['label'].value_counts()

Invoices            200
Others              199
aadharcard          141
dischargesummary    115
voter_id             79
DL                   75
PAN                  66
cancelledcheque      28
Member_Id            15
Name: label, dtype: int64

In [None]:
unique, counts = np.unique(ytrain, return_counts=True)

print(np.asarray((unique, counts)).T)

NameError: ignored

In [None]:
w = {'no offer':1, 'offer': 2.3, 'relevant info': 3.5, 'price': 10, 'date':10}

In [None]:
param_dict = {'C':20, 'solver':'liblinear', 'max_iter':400, 'class_weight':w, 'penalty':'l1','verbose':2}
lr_classifier = get_classifier(LogisticRegression, xtrain_tfidf, ytrain_tfidf, param_dict)
pred_train, pred_test = evaluate_classifier_on_train_test(lr_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[LibLinear]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  46    0    0    0    0]
 [   0 1212    4    0    9]
 [   0    0  533    0    1]
 [   0    0    0   73    0]
 [   0    0    0    0  345]]
******************** Classification Report ********************
               precision    recall  f1-score   support

         date       1.00      1.00      1.00        46
     no offer       1.00      0.99      0.99      1225
        offer       0.99      1.00      1.00       534
        price       1.00      1.00      1.00        73
relevant info       0.97      1.00      0.99       345

     accuracy                           0.99      2223
    macro avg       0.99      1.00      1.00      2223
 weighted avg       0.99      0.99      0.99      2223

Evaluation on Test Data ---------- >
******************** Confusion Matrix ********************
[[  5   0   0   0   0]
 [  0 122   3   0  12]
 [  1   2  57   0   0]
 [  0   2   1   5   0]
 



In [None]:
distributions = {'C':[9,9.5, 10,11, 12], 'solver':['liblinear', 'saga'], 'max_iter':np.linspace(100,420,100)}
                #  'class_weight':[{'no offer':1, 'offer': 1, 'relevant info': 10, 'price': 20, 'date':20}, 
                #     {'no offer':5, 'offer': 5, 'relevant info': 50, 'price': 75, 'date':50}]}
param_dict = {'penalty':'l1','verbose':2, 'class_weight':{'no offer':1, 'offer': 1, 'relevant info': 1, 'price': 3004, 'date':1225}}
best_classifier = get_best_classifier(LogisticRegression, RandomizedSearchCV, xtrain_tfidf, ytrain_tfidf, classifier_param_dict=param_dict, distributions=distributions, cv=5)
pred_random_train1, pred_random_train2 = evaluate_classifier_on_train_test(best_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.7min finished


[LibLinear]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  46    0    0    0    0]
 [   1 1212    5    0    7]
 [   0    1  532    0    1]
 [   0    0    0   73    0]
 [   0   14    0    0  331]]
******************** Classification Report ********************
               precision    recall  f1-score   support

         date       0.98      1.00      0.99        46
     no offer       0.99      0.99      0.99      1225
        offer       0.99      1.00      0.99       534
        price       1.00      1.00      1.00        73
relevant info       0.98      0.96      0.97       345

     accuracy                           0.99      2223
    macro avg       0.99      0.99      0.99      2223
 weighted avg       0.99      0.99      0.99      2223

Evaluation on Test Data ---------- >
******************** Confusion Matrix ********************
[[  5   0   0   0   0]
 [  1 121   2   0  13]
 [  2   1  57   0   0]
 [  0   1   0   7   0]
 

In [None]:
best_classifier.best_params_

{'C': 9.5, 'max_iter': 213.13131313131314, 'solver': 'liblinear'}

In [None]:
distributions = {'class_weight':[{'no offer':1, 'offer': 1, 'relevant info': 1, 'price': 3004, 'date':1225}, 
                    {'no offer':100, 'offer': 1, 'relevant info': 1, 'price': 3004, 'date':3000}]}
param_dict = {'C':9.5, 'max_iter':213, 'penalty':'l1','verbose':2, 'solver':'liblinear'}
best_classifier2 = get_best_classifier(LogisticRegression, RandomizedSearchCV, xtrain_tfidf, ytrain_tfidf, classifier_param_dict=param_dict, distributions=distributions, cv=5)
pred_random_train_, pred_random_test_ = evaluate_classifier_on_train_test(best_classifier2, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   59.5s finished


[LibLinear]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  46    0    0    0    0]
 [   1 1212    5    0    7]
 [   0    1  532    0    1]
 [   0    0    0   73    0]
 [   0   14    0    0  331]]
******************** Classification Report ********************
               precision    recall  f1-score   support

         date       0.98      1.00      0.99        46
     no offer       0.99      0.99      0.99      1225
        offer       0.99      1.00      0.99       534
        price       1.00      1.00      1.00        73
relevant info       0.98      0.96      0.97       345

     accuracy                           0.99      2223
    macro avg       0.99      0.99      0.99      2223
 weighted avg       0.99      0.99      0.99      2223

Evaluation on Test Data ---------- >
******************** Confusion Matrix ********************
[[  5   0   0   0   0]
 [  1 120   2   1  13]
 [  2   1  57   0   0]
 [  0   1   0   7   0]
 



In [None]:
best_classifier2.best_params_

{'class_weight': {'date': 1225,
  'no offer': 1,
  'offer': 1,
  'price': 3004,
  'relevant info': 1}}

### Bert

In [None]:
param_dict = {'C':20, 'max_iter':150, 'solver':'liblinear', 'class_weight':'balanced', 'penalty':'l1','verbose':2}
lr_classifier_bert = get_classifier(LogisticRegression, xtrain, ytrain, param_dict)
pred_train, pred_test = evaluate_classifier_on_train_test(lr_classifier_bert, xtrain, ytrain, xtest, ytest)

[LibLinear]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[45  0  0  1  0  0  0]
 [ 0 14  0  1  0  0  0]
 [ 0  0 46  0  0  0  0]
 [ 0  0  0 27  0  0  0]
 [ 0  0  0  0 42  0  4]
 [ 0  0  0  0  0 15  0]
 [ 0  0  0  0  5  0 41]]
******************** Classification Report ********************
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        46
           1       1.00      0.93      0.97        15
           2       1.00      1.00      1.00        46
           3       0.93      1.00      0.96        27
           4       0.89      0.91      0.90        46
           5       1.00      1.00      1.00        15
           6       0.91      0.89      0.90        46

    accuracy                           0.95       241
   macro avg       0.96      0.96      0.96       241
weighted avg       0.96      0.95      0.95       241

Evaluation on Test Data ---------- >
******************** Conf

## Random Search Logistic

In [120]:
distributions = {'C':[0, 3, 5, 10], 'solver':['liblinear', 'saga']}
param_dict = {'class_weight':'balanced', 'penalty':'l1','verbose':2}
best_classifier = get_best_classifier(LogisticRegression, RandomizedSearchCV, xtrain_tfidf, ytrain_tfidf, classifier_param_dict=param_dict, distributions=distributions, cv=5)
pred_random_train, pred_random_test = evaluate_classifier_on_train_test(best_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   18.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 1 seconds
Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[ 19   0   0   0   0   0   0   0   0]
 [  0 153   0   6   0   0   0   1   0]
 [  0   0  12   0   0   0   0   0   0]
 [  0  16   0 141   0   0   2   0   0]
 [  0   0   0   0  54   0   0   0   0]
 [  0   0   0   0   0  46   0   0   0]
 [  0   0   0   0   0   0  22   0   0]
 [  0   0   0   3   0   0   0  89   0]
 [  0   0   0   0   0   0   0   0  30]]
******************** Classification Report ********************
                  precision    recall  f1-score   support

              DL       1.00      1.00      1.00        19
        Invoices       0.91      0.96      0.93       160
       Member_Id       1.00      1.00      1.00        12
          Others       0.94      0.89      0.91       159
             PAN       1.00      1.00      1.00        54
      aadharcard       1.00      1.00      1.00        46
 cancelledcheque       0.92      1.00      0.96

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s finished


In [170]:
best_classifier.best_params_

{'C': 10, 'solver': 'saga'}

In [171]:
info = """Random Search ('C':[0, 3, 5, 10], 'solver':['liblinear', 'saga'])\n
        Accuracy: 88%\n
         date       0.71      1.00      0.83         5\n
     no offer       0.92      0.88      0.90       137\n
        offer       0.94      0.98      0.96        60\n
        price       1.00      0.75      0.86         8\n
relevant info       0.65      0.68      0.67        38\n
        """
dir_name = "offer_classification/lr_random_88"

save_model_and_predictions(dir_name, save_model=True, info_file=info, save_test_pred = True, save_train_pred=True, classifier= best_classifier,
                               x_train=xtrain_tfidf_orig, x_test=xtest_tfidf_orig, y_train=ytrain_tfidf, y_test=ytest_tfidf,
                           y_pred_train=pred_random_train, y_pred_test=pred_random_test)


Directory created successfully


In [173]:
best_classifier_lr = joblib.load('offer_classification/lr_random_88/model.pkl')
tfidf_load = joblib.load('finalized_model.sav')

In [178]:
test_df = pd.read_csv('new_extracted_Data.csv')

In [179]:
test_set_ary = test_df['text'].array.to_numpy().reshape(-1,1)
test_set = tfidf_load.transform(test_set_ary)
test_pred = best_classifier_lr.predict(test_set)

In [180]:
test_df['pred_label'] = test_pred

In [183]:
(test_df['label'] != test_df['pred_label']).value_counts()

False    608
True     208
dtype: int64

In [None]:
test_df.to_csv('offer_classification/predictions.csv', index=False)

### Bert

In [None]:
distributions = {'C':[0, 3, 5, 10], 'solver':['liblinear', 'saga'], 'penalty':['l1','l2'], 'max_iter':np.linspace(70,500, 20)}
param_dict = {'class_weight':'balanced','verbose':2}
best_classifier_bert = get_best_classifier(LogisticRegression, RandomizedSearchCV, xtrain, ytrain, classifier_param_dict=param_dict, distributions=distributions, cv=5)
pred_random_bert_train, pred_random_bert_test = evaluate_classifier_on_train_test(best_classifier_bert, xtrain, ytrain, xtest, ytest)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   52.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 3 seconds
Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[44  2  0  0  0  0  0]
 [ 0 15  0  0  0  0  0]
 [ 0  0 42  0  1  3  0]
 [ 0  2  0 25  0  0  0]
 [ 0  0  0  0 43  0  3]
 [ 0  0  0  0  0 15  0]
 [ 0  0  0  0  8  0 38]]
******************** Classification Report ********************
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        46
           1       0.79      1.00      0.88        15
           2       1.00      0.91      0.95        46
           3       1.00      0.93      0.96        27
           4       0.83      0.93      0.88        46
           5       0.83      1.00      0.91        15
           6       0.93      0.83      0.87        46

    accuracy                           0.92       241
   macro avg       0.91      0.94      0.92       241
weighted avg       0.93      0.92      0.92       241

Evaluation on Test Data ---------- >
***

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s finished


#SVM

In [None]:
from sklearn.svm import SVC

### RBF kernel

In [None]:
param_dict = {'C':10, 'gamma':.0001, 'kernel':'linear', 'verbose':2}
rbf_classifier = get_classifier(SVC, xtrain_tfidf, ytrain_tfidf, param_dict)
pred_train, pred_test = evaluate_classifier_on_train_test(rbf_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[LibSVM]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  45    1    0    0    0]
 [   0 1209    4    0   12]
 [   0    0  532    0    2]
 [   0    0    0   73    0]
 [   0   21    1    0  323]]
******************** Classification Report ********************
               precision    recall  f1-score   support

         date       1.00      0.98      0.99        46
     no offer       0.98      0.99      0.98      1225
        offer       0.99      1.00      0.99       534
        price       1.00      1.00      1.00        73
relevant info       0.96      0.94      0.95       345

     accuracy                           0.98      2223
    macro avg       0.99      0.98      0.98      2223
 weighted avg       0.98      0.98      0.98      2223

Evaluation on Test Data ---------- >
******************** Confusion Matrix ********************
[[  5   0   0   0   0]
 [  1 117   6   0  13]
 [  1   1  58   0   0]
 [  0   0   1   7   0]
 [  

### Linear Kernel

In [None]:
param_dict = {'C':10, 'kernel':'linear', 'verbose':2}
linear_svm_classifier = get_classifier(SVC, xtrain_tfidf, ytrain_tfidf, param_dict)
pred_train, pred_test = evaluate_classifier_on_train_test(linear_svm_classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

[LibSVM]Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[  45    1    0    0    0]
 [   0 1209    4    0   12]
 [   0    0  532    0    2]
 [   0    0    0   73    0]
 [   0   21    1    0  323]]
******************** Classification Report ********************
               precision    recall  f1-score   support

         date       1.00      0.98      0.99        46
     no offer       0.98      0.99      0.98      1225
        offer       0.99      1.00      0.99       534
        price       1.00      1.00      1.00        73
relevant info       0.96      0.94      0.95       345

     accuracy                           0.98      2223
    macro avg       0.99      0.98      0.98      2223
 weighted avg       0.98      0.98      0.98      2223

Evaluation on Test Data ---------- >
******************** Confusion Matrix ********************
[[  5   0   0   0   0]
 [  1 117   6   0  13]
 [  1   1  58   0   0]
 [  0   0   1   7   0]
 [  

## Random Search

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
kernel = ['linear','rbf']
distributions = dict(gamma=gamma_range, C=C_range, kernel=kernel)
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
best_classifier_rbf = get_best_classifier(SVC, RandomizedSearchCV, xtrain_tfidf, ytrain_tfidf, distributions=distributions, cv=cv)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    6.3s finished


In [None]:
best_classifier_rbf.best_params_

{'C': 10000000000.0, 'gamma': 1.0, 'kernel': 'rbf'}

In [None]:
pred_train, pred_test = evaluate_classifier_on_train_test(best_classifier_rbf, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[45  0  0  1  0  0  0]
 [ 0 14  0  1  0  0  0]
 [ 0  0 46  0  0  0  0]
 [ 0  0  0 27  0  0  0]
 [ 0  0  0  0 44  0  2]
 [ 0  0  0  0  0 15  0]
 [ 0  0  0  0  7  0 39]]
******************** Classification Report ********************
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        46
           1       1.00      0.93      0.97        15
           2       1.00      1.00      1.00        46
           3       0.93      1.00      0.96        27
           4       0.86      0.96      0.91        46
           5       1.00      1.00      1.00        15
           6       0.95      0.85      0.90        46

    accuracy                           0.95       241
   macro avg       0.96      0.96      0.96       241
weighted avg       0.96      0.95      0.95       241

Evaluation on Test Data ---------- >
******************** Confusion Matri

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
info = """Random Search:\n
        str(distributions)\n

        ******************** Classification Report ********************\n
               precision    recall  f1-score   support\n

         date       0.62      1.00      0.77         5\n
     no offer       0.91      0.82      0.87       137\n
        offer       0.94      0.97      0.95        60\n
        price       1.00      0.88      0.93         8\n
relevant info       0.57      0.71      0.64        38\n

     accuracy                           0.85       248\n
    macro avg       0.81      0.88      0.83       248\n
 weighted avg       0.86      0.85      0.85       248\n
        """
dir_name = "offer_classification/svc_random_85"

save_model_and_predictions(dir_name, save_model=True, info_file=info, save_test_pred = True, save_train_pred=True, classifier= best_classifier,
                               x_train=xtrain_tfidf_orig, x_test=xtest_tfidf_orig, y_train=ytrain_tfidf, y_test=ytest_tfidf,
                           y_pred_train=pred_train, y_pred_test=pred_test)

Directory created successfully


# Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

In [22]:
distributions = {'alpha': np.logspace(-3,3,100)}
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
grid_classifier_nb = get_best_classifier(MultinomialNB, GridSearchCV, xtrain_tfidf, ytrain_tfidf, distributions=distributions, cv=cv)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   10.3s finished


In [23]:
pred_train, pred_test = evaluate_classifier_on_train_test(grid_classifier_nb, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

Evaluation on Train Data ---------- >
******************** Confusion Matrix ********************
[[ 17   7   0   1   0   0   0   0   0]
 [  0 160   0   0   0   0   0   0   0]
 [  0   0  12   0   0   0   0   0   0]
 [  0   5   0 154   0   0   1   0   0]
 [  0  29   0   0  63   0   0   0   0]
 [  0   1   0   0   0  46   0   0   0]
 [  0   1   0   0   0   0  22   0   0]
 [  0   0   0   0   0   0   0  92   0]
 [  0  10   0   0   0   0   0   0  31]]
******************** Classification Report ********************
                  precision    recall  f1-score   support

              DL       1.00      0.68      0.81        25
        Invoices       0.75      1.00      0.86       160
       Member_Id       1.00      1.00      1.00        12
          Others       0.99      0.96      0.98       160
             PAN       1.00      0.68      0.81        92
      aadharcard       1.00      0.98      0.99        47
 cancelledcheque       0.96      0.96      0.96        23
dischargesummary      

#SVC

In [None]:
from sklearn.svm import SVC
svm_classifier_rs=SVC(C = 10, gamma = 0.1, kernel = 'linear')
svm_classifier_rs.fit(xtrain,ytrain)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
preds_valid2 = svm_classifier_rs.predict(xtest)

In [None]:
print(metrics.confusion_matrix(ytest, preds_valid2))

print(metrics.classification_report(ytest, preds_valid2))

[[5 0 0 0 0 0 0]
 [1 0 1 0 0 0 0]
 [0 0 4 0 1 0 0]
 [0 0 0 3 0 0 0]
 [0 1 0 0 2 1 1]
 [0 0 0 0 0 2 0]
 [0 0 0 0 3 0 2]]
              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       0.00      0.00      0.00         2
           2       0.80      0.80      0.80         5
           3       1.00      1.00      1.00         3
           4       0.33      0.40      0.36         5
           5       0.67      1.00      0.80         2
           6       0.67      0.40      0.50         5

    accuracy                           0.67        27
   macro avg       0.61      0.66      0.62        27
weighted avg       0.65      0.67      0.65        27



Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier_rs=RandomForestClassifier(max_depth= 10, n_estimators = 500)
rf_classifier_rs.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
preds_valid3 = rf_classifier_rs.predict(xtest)

print(metrics.confusion_matrix(ytest, preds_valid3))

print(metrics.classification_report(ytest, preds_valid3))

[[5 0 0 0 0 0 0]
 [1 0 1 0 0 0 0]
 [0 0 4 0 1 0 0]
 [0 0 0 3 0 0 0]
 [0 1 0 0 3 0 1]
 [1 0 0 0 0 1 0]
 [0 0 0 0 4 0 1]]
              precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       0.00      0.00      0.00         2
           2       0.80      0.80      0.80         5
           3       1.00      1.00      1.00         3
           4       0.38      0.60      0.46         5
           5       1.00      0.50      0.67         2
           6       0.50      0.20      0.29         5

    accuracy                           0.63        27
   macro avg       0.63      0.59      0.58        27
weighted avg       0.63      0.63      0.60        27



#Naive Bayes

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
scaler = MinMaxScaler()
xtrain_nb = scaler.fit_transform(xtrain)
xtest_nb = scaler.fit_transform(xtest)

parameters = {'alpha': np.logspace(-3,3,10)}
nb_classifier = MultinomialNB()
nb_classifier_rs = GridSearchCV(nb_classifier, param_grid = parameters, cv = 36,n_jobs=-1,verbose=10)
nb_classifier_rs.fit(xtrain_nb, ytrain)

Fitting 36 folds for each of 10 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0172s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0335s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0589s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0764s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1463s.) Setting batch_size=32.
[Parallel(n_jobs=-1)]: Done 156 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    1.2s finished


GridSearchCV(cv=36, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-03, 4.64158883e-03, 2.15443469e-02, 1.00000000e-01,
       4.64158883e-01, 2.15443469e+00, 1.00000000e+01, 4.64158883e+01,
       2.15443469e+02, 1.00000000e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

In [None]:
preds_valid4 = nb_classifier_rs.predict(xtest_nb)

In [None]:
print(metrics.confusion_matrix(ytest, preds_valid4))

print(metrics.classification_report(ytest, preds_valid4))

[[4 1 0 0 0 0 0]
 [0 2 0 0 0 0 0]
 [0 3 0 0 0 2 0]
 [0 0 0 3 0 0 0]
 [0 3 0 0 0 1 1]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 0 5]]
              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.22      1.00      0.36         2
           2       0.00      0.00      0.00         5
           3       1.00      1.00      1.00         3
           4       0.00      0.00      0.00         5
           5       0.40      1.00      0.57         2
           6       0.83      1.00      0.91         5

    accuracy                           0.59        27
   macro avg       0.49      0.69      0.53        27
weighted avg       0.50      0.59      0.51        27



  _warn_prf(average, modifier, msg_start, len(result))


#XGBoost Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
#parameters_xgboost = {'learning_rate':[0.9, 0.01], 'n_estimators' : range(20, 140, 20)}
xb_classifier = GradientBoostingClassifier(learning_rate=0.001, n_estimators= 140)
#xb_classifier_rs = GridSearchCV(xb_classifier, param_grid = parameters_xgboost, cv = 6,n_jobs=-1,verbose=10)
xb_classifier.fit(xtrain, ytrain)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.001, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=140,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
preds_valid5 = xb_classifier.predict(xtest)

In [None]:
print(metrics.confusion_matrix(ytest, preds_valid5))

print(metrics.classification_report(ytest, preds_valid5))

[[5 0 0 0 0 0 0]
 [1 0 1 0 0 0 0]
 [0 0 1 0 2 0 2]
 [0 0 3 0 0 0 0]
 [1 0 0 0 1 0 3]
 [2 0 0 0 0 0 0]
 [0 0 1 0 2 0 2]]
              precision    recall  f1-score   support

           0       0.56      1.00      0.71         5
           1       0.00      0.00      0.00         2
           2       0.17      0.20      0.18         5
           3       0.00      0.00      0.00         3
           4       0.20      0.20      0.20         5
           5       0.00      0.00      0.00         2
           6       0.29      0.40      0.33         5

    accuracy                           0.33        27
   macro avg       0.17      0.26      0.20        27
weighted avg       0.22      0.33      0.26        27



  _warn_prf(average, modifier, msg_start, len(result))
