In [1]:
import numpy as np
import pandas as pd
import json, re, nltk, spacy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
import pickle

from warnings import filterwarnings
filterwarnings('ignore')
%matplotlib inline

## Loading the data

The data is in JSON format and we need to convert it to a dataframe.

In [2]:
with open(r'C:\Users\vnnit\Desktop\Projects\Support Ticket Classfication\Dataset\complaints-2021-05-14_08_16.json') as f:
    data = json.load(f)
df = pd.json_normalize(data)

In [95]:
def return_column_name(col_name: str) -> str:
    return col_name.split('.')[-1]
df.columns = list(map(return_column_name, df.columns))
df.columns

Index(['_index', '_type', '_id', '_score', 'tags', 'zip_code', 'complaint_id',
       'issue', 'date_received', 'state', 'consumer_disputed', 'product',
       'company_response', 'company', 'submitted_via', 'date_sent_to_company',
       'company_public_response', 'sub_product', 'timely',
       'complaint_what_happened', 'sub_issue', 'consumer_consent_provided'],
      dtype='object')

# Filling blank values and removing NaN values

In [12]:
#Assign nan in place of blanks in the complaints column
df[df.loc[:, 'complaint_what_happened'] == ''] = np.nan
#Remove all rows where complaints column is nan
df = df[~df['complaint_what_happened'].isnull()]
# resetting index value
df = df.reset_index(drop= True)

In [19]:
# Convert complaint_what_happened column to string for performing text operations
df['complaint_what_happened'] = df['complaint_what_happened'].astype(str)

# Removing unwanted text

In [20]:
def clean_text(text):
    text = text.lower()
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove words containing numbers
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_repeated_char_words(text):
    # Use regex to remove words consisting of the same character repeated
    cleaned_text = re.sub(r'\b(\w)\1{1,}\b', '', text)
    # Remove extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

nlp = spacy.load('en_core_web_sm')
def lemmatize_text(text):
    doc = nlp(text)
    stop_words = list(STOPWORDS)
    lemmatized_text = ' '.join([token.lemma_ for token in doc if token.lemma_ not in stop_words])
    return lemmatized_text

def get_POS_tags(text):
    tags_required = {'NN', 'JJ', 'VB'}
    blob = TextBlob(text)
    relevant_words = ' '.join(
        [word for word, tag in blob.tags if tag in tags_required])
    return relevant_words

In [21]:
df_clean = pd.DataFrame(df['complaint_what_happened'].apply(clean_text))
df_clean['complaint_xxx_removed'] = df_clean['complaint_what_happened'].apply(remove_repeated_char_words)
df_clean['complaint_lemmatized'] = df_clean['complaint_xxx_removed'].apply(lemmatize_text)
df_clean['complaint_POS_removed'] = df_clean['complaint_lemmatized'].apply(get_POS_tags)

df_clean

Unnamed: 0,complaint_what_happened
0,good morning my name is xxxx xxxx and i apprec...
1,i upgraded my xxxx xxxx card in and was told b...
2,chase card was reported on however fraudulent ...
3,on while trying to book a xxxx xxxx ticket i c...
4,my grand son give me check for i deposit it in...
...,...
21067,after being a chase card customer for well ove...
21068,on wednesday xxxxxxxx i called chas my xxxx xx...
21069,i am not familiar with xxxx pay and did not un...
21070,i have had flawless credit for yrs ive had cha...


# Lemmatizing the text

# Removing rows where length of the sentence is less than 10

In [37]:
less_char_rows = list(df_clean[df_clean['complaint_POS_removed'].apply(lambda x: len(x) < 10)].index)
df_clean = df_clean.drop(index=less_char_rows)
df_clean = df_clean.reset_index(drop= True)
df_clean

[58,
 578,
 1105,
 1619,
 2813,
 3194,
 3771,
 4899,
 4988,
 7741,
 8070,
 8369,
 11611,
 13219,
 13281,
 17326,
 20236]

Unnamed: 0,complaint_what_happened,complaint_xxx_removed,complaint_lemmatized,complaint_POS_removed
0,good morning my name is xxxx xxxx and i apprec...,good morning my name is and i appreciate it if...,good morning name I appreciate help I put stop...,good morning name help stop chase bank service...
1,i upgraded my xxxx xxxx card in and was told b...,i upgraded my card in and was told by the agen...,I upgrade card tell agent upgrade anniversary ...,agent upgrade anniversary date change agent wr...
2,chase card was reported on however fraudulent ...,chase card was reported on however fraudulent ...,chase card report fraudulent application submi...,chase card report fraudulent application submi...
3,on while trying to book a xxxx xxxx ticket i c...,on while trying to book a ticket i came across...,try book ticket I come across offer apply towa...,try book ticket offer apply ticket reward card...
4,my grand son give me check for i deposit it in...,my grand son give me check for i deposit it in...,grand son give I check I deposit chase account...,grand son chase account fund clear chase bank ...
...,...,...,...,...
21050,after being a chase card customer for well ove...,after being a chase card customer for well ove...,chase card customer well decade offer multiple...,chase card customer decade offer multiple soli...
21051,on wednesday xxxxxxxx i called chas my xxxx xx...,on wednesday i called chas my visa credit card...,wednesday I call chas visa credit card provide...,wednesday chas visa credit card provider claim...
21052,i am not familiar with xxxx pay and did not un...,i am not familiar with pay and did not underst...,I familiar pay understand great risk provide c...,familiar pay great risk consumer safe chase ba...
21053,i have had flawless credit for yrs ive had cha...,i have had flawless credit for yrs ive had cha...,I flawless credit yrs I ve chase credit card c...,credit yrs chase credit card chase freedom pro...


## Feature Extraction
Transform the raw text into a matrix of TF-IDF features.

**max_df** helps eliminate terms that occur too often, often referred to as "corpus-specific stop words." Setting max_df = 0.95 means "ignore terms that appear in more than 95% of the complaints."

**min_df** is used to filter out terms that are too rare. Setting min_df = 2 means "ignore terms that appear in fewer than 2 complaints."

In [56]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
tfidf.fit(df_clean['complaint_POS_removed'])
tfidf_vectorised = tfidf.transform(df_clean['complaint_POS_removed'])

#### Create a document term matrix using fit_transform

The contents of a document term matrix are tuples of (complaint_id,token_id) tf-idf score:
The tuples that are not there have a tf-idf score of 0

## Manual Topic Modeling

By doing a trial-and-error approach to determine the optimal number of topics for your NMF model.

The key parameter to specify is the number of components, which represents the desired number of topics. This step is critical in the topic modeling process and will significantly influence the quality of your final topics.

In [102]:
len(tfidf.get_feature_names_out())

9829

In [59]:
from sklearn.decomposition import NMF
num_topics = 5
nmf_model = NMF(n_components=num_topics, random_state=40)
nmf_model.fit(tfidf_vectorised)

In [96]:
H = nmf_model.components_
H

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 6.97170218e-04, 9.26901744e-05],
       [0.00000000e+00, 6.14881949e-03, 0.00000000e+00, ...,
        5.46906777e-04, 0.00000000e+00, 1.00299600e-04],
       [9.24968952e-04, 0.00000000e+00, 5.02800944e-04, ...,
        9.36494997e-04, 2.67251334e-03, 0.00000000e+00],
       [8.44994639e-06, 9.49452018e-03, 0.00000000e+00, ...,
        3.47062483e-04, 8.29246683e-04, 5.82963487e-04],
       [0.00000000e+00, 9.34915005e-04, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [103]:
H.shape

(5, 9829)

# Print the Top15 words for each of the topics

In [63]:
words = np.array(tfidf.get_feature_names_out())
words

array(['aaarating', 'aarp', 'ab', ..., 'zombie', 'zone', 'zoom'],
      dtype=object)

In [64]:
len(words)

9829

In [65]:
# Get the top 15 words for each topic using advanced indexing
top_n = 15
top_words_indices_nmf = np.argsort(H, axis=1)[:, ::-1][:, :top_n]
topic_words_nmf = pd.DataFrame(
    data=words[top_words_indices_nmf],
    index=[f'Topic {i + 1}' for i in range(num_topics)],
    columns=[f'Word {i + 1}' for i in range(top_n)]
)
topic_words_nmf

**Observation**

Based on the identified topics, we can assign labels according to the associated products and services:

- Topic 1: Bank Account Services
- Topic 2: Credit Cards / Prepaid Cards
- Topic 3: Mortgages / Loans
- Topic 4: Theft / Dispute Reporting
- Topic 5: Other Issues

LDA

In [93]:
from sklearn.decomposition import LatentDirichletAllocation
num_topics = 5
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=40)
lda_model.fit(tfidf_vectorised)

In [104]:
L = nmf_model.components_
L.shape

(5, 9829)

In [84]:
# Get the top 15 words for each topic using advanced indexing
top_n = 15
top_words_indices_lda = np.argsort(L, axis=1)[:, ::-1][:, :top_n]
topic_words_lda = pd.DataFrame(
    data=words[top_words_indices_lda],
    index=[f'Topic {i + 1}' for i in range(num_topics)],
    columns=[f'Word {i + 1}' for i in range(top_n)]
)
topic_words_lda

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15
Topic 1,account,bank,check,chase,money,deposit,fund,close,branch,day,open,transfer,tell,number,business
Topic 2,credit,report,card,inquiry,chase,remove,account,hard,score,company,debt,open,information,limit,application
Topic 3,loan,mortgage,chase,home,modification,property,year,letter,rate,time,document,foreclosure,house,sale,bank
Topic 4,charge,card,chase,dispute,transaction,fee,purchase,merchant,fraud,claim,refund,service,fraudulent,time,credit
Topic 5,payment,late,balance,pay,fee,month,statement,day,time,auto,monthly,chase,date,credit,minimum


# Create the best topic for each complaint in terms of integer value 0,1,2,3 & 4

In [85]:
topic_results_nmf = nmf_model.transform(tfidf_vectorised)
topic_results_lda = lda_model.transform(tfidf_vectorised)
df_clean['Topic_NMF'] = topic_results_nmf.argmax(axis=1)
df_clean['Topic_LDA'] = topic_results_lda.argmax(axis=1)

array([[0.04009944, 0.58057072, 0.03904545, 0.03996454, 0.30031984],
       [0.05318172, 0.78594648, 0.05289159, 0.0538454 , 0.05413481],
       [0.04774734, 0.04838671, 0.04771991, 0.04822261, 0.80792343],
       ...,
       [0.02152637, 0.91163535, 0.0217036 , 0.02278695, 0.02234772],
       [0.02783164, 0.89082243, 0.02692738, 0.02731242, 0.02710613],
       [0.03611158, 0.85604958, 0.03547385, 0.03610928, 0.0362557 ]])

In [91]:
topic_results_lda.argmax(axis=1)

array([1, 1, 4, ..., 1, 1, 1], dtype=int64)

### Create the dictionary of Topic names and Topics

In [57]:
topic_names = {
    0: "Bank account services",
    1: "Credit Card/Prepaid Card",
    2: "Mortgages/Loans",
    3: "Theft/Dispute reporting",
    4: "Others"
}
#Replace Topics with Topic Names
# df_clean['topic_names'] = df_clean['Topic_NMF'].map(topic_names)

In [59]:
df_clean['Topic_NMF'].value_counts()

3    5386
0    5072
1    4413
2    3892
4    2298
Name: Topic, dtype: int64

In [None]:
df_clean['Topic_LDA'].value_counts()

In [61]:
df_clean.shape

(21061, 6)

## Supervised model to predict complaints to the relevant Topics.

Building the model to create the topics for each complaints.

In [62]:
df_clean.head()

Unnamed: 0,complaint_what_happened,complaint_lemmatized,complaint_POS_removed,complaint_final,Topic,topic_names
0,good morning my name is xxxx xxxx and i apprec...,good morning my name be xxxx xxxx and I apprec...,good morning name be help put stop chase bank ...,good morning name be help put stop chase bank ...,2,Mortgages/Loans
1,i upgraded my xxxx xxxx card in and was told b...,I upgrade my xxxx xxxx card in and be tell by ...,xxxx xxxx card be agent upgrade anniversary da...,card be agent upgrade anniversary date change ...,3,Theft/Dispute reporting
2,chase card was reported on however fraudulent ...,chase card be report on however fraudulent app...,chase card be report fraudulent application be...,chase card be report fraudulent application be...,1,Credit Card/Prepaid Card
3,on while trying to book a xxxx xxxx ticket i c...,on while try to book a xxxx xxxx ticket I come...,try book xxxx xxxx ticket offer be apply ticke...,try book ticket offer be apply ticket reward c...,1,Credit Card/Prepaid Card
4,my grand son give me check for i deposit it in...,my grand son give I check for I deposit it int...,grand son chase account fund clear chase bank ...,grand son chase account fund clear chase bank ...,0,Bank account services


# Training Data

In [63]:
#Keep the columns"complaint_what_happened" & "Topic" only in the new dataframe --> training_data
training_data = df_clean[['complaint_final', 'Topic']]
training_data

Unnamed: 0,complaint_final,Topic
0,good morning name be help put stop chase bank ...,2
1,card be agent upgrade anniversary date change ...,3
2,chase card be report fraudulent application be...,1
3,try book ticket offer be apply ticket reward c...,1
4,grand son chase account fund clear chase bank ...,0
...,...,...
21056,be chase card customer decade be multiple soli...,1
21057,wednesday chas visa credit card provider ask m...,3
21058,be familiar pay do understand great risk provi...,3
21059,credit yrs credit card chase freedom problem m...,4


# Train Test Split

In [64]:
from gensim.models import Word2Vec

In [65]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    training_data['complaint_final'],
    training_data['Topic'],
    test_size=0.2,
    random_state=42)

# Count Vectorization

In [66]:
count_vectorizer = CountVectorizer(max_features=5000)
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# TF-IDF Vectorization

In [67]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Average Word2Vec Vectorization using

In [68]:
# Tokenize sentences into words
train_sentences = [sentence.split() for sentence in X_train]
word2vec_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)

In [69]:
# function to average Word2Vec vectors
def get_word2vec_vector(sentence):
    words = sentence.split()
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    return sum(word_vectors) / len(word_vectors) if word_vectors else np.zeros(word2vec_model.vector_size)

In [70]:
# Apply to training and test data
X_train_word2vec = np.array([get_word2vec_vector(sentence) for sentence in X_train])
X_test_word2vec = np.array([get_word2vec_vector(sentence) for sentence in X_test])

# Model Building and Evaluation with default parameters

### Model Evaluation Function

In [71]:
def print_confusion_matrix_and_report(y_true, y_pred):
    metrics = {
        'Confusion Matrix': confusion_matrix(y_true, y_pred),
        'Classification Report': classification_report(y_true, y_pred)
    }
    for key, value in metrics.items():
        if key == 'Confusion Matrix':
            print(f"\n{key}:\n{value}")
        elif key == 'Classification Report':
            print(f"\n{key}:\n{value}")
        else:
            print(f"{key}: {value:.4f}")

# Model fitting

### function to return accuracy, precision, recall and f1_score

In [72]:
def return_metrics(y_true, y_pred):
    return (accuracy_score(y_true, y_pred),
            precision_score(y_true,
                            y_pred,
                            average='weighted',
                            zero_division=0),
            recall_score(y_true, y_pred, average='weighted', zero_division=0),
            f1_score(y_true, y_pred, average='weighted', zero_division=0))

### Function to fit the models and return the model metrics in a dataframe

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [74]:
def evaluate_models(X_dict, y_train, y_test):
    """Evaluate models with different vectorization methods."""
    models = {
        'logistic_regression': LogisticRegression(),
        'decision_tree': DecisionTreeClassifier(),
        'random_forest': RandomForestClassifier()
    }

    results = []

    for vect_type, (X_train, X_test) in X_dict.items():
        for model_name, model in models.items():
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            train_metrics = return_metrics(y_train, y_train_pred)
            test_metrics = return_metrics(y_test, y_test_pred)

            results.append({
                'vect_type': vect_type,
                'Model': model_name,
                'train_accuracy': train_metrics[0],
                'train_precision': train_metrics[1],
                'train_recall': train_metrics[2],
                'train_f1score': train_metrics[3],
                'test_accuracy': test_metrics[0],
                'test_precision': test_metrics[1],
                'test_recall': test_metrics[2],
                'test_f1score': test_metrics[3]
            })

    return pd.DataFrame(results)

In [75]:
X_dict = {
    'count_vect': (X_train_count, X_test_count),
    'tfidf_vec': (X_train_tfidf, X_test_tfidf),
    'word2vec': (X_train_word2vec, X_test_word2vec)
}

model_metric_df = evaluate_models(X_dict, y_train, y_test)

In [76]:
model_metric_df.sort_values(by=['train_accuracy', 'test_accuracy'])

Unnamed: 0,vect_type,Model,train_accuracy,train_precision,train_recall,train_f1score,test_accuracy,test_precision,test_recall,test_f1score
6,word2vec,logistic_regression,0.940408,0.940392,0.940408,0.940391,0.941372,0.941295,0.941372,0.941312
3,tfidf_vec,logistic_regression,0.991156,0.991173,0.991156,0.991152,0.96226,0.962464,0.96226,0.962164
0,count_vect,logistic_regression,0.999763,0.999763,0.999763,0.999763,0.971517,0.971571,0.971517,0.971499
7,word2vec,decision_tree,1.0,1.0,1.0,1.0,0.77427,0.775209,0.77427,0.774669
1,count_vect,decision_tree,1.0,1.0,1.0,1.0,0.792784,0.793846,0.792784,0.792977
4,tfidf_vec,decision_tree,1.0,1.0,1.0,1.0,0.813435,0.813894,0.813435,0.813607
2,count_vect,random_forest,1.0,1.0,1.0,1.0,0.86779,0.869749,0.86779,0.864856
8,word2vec,random_forest,1.0,1.0,1.0,1.0,0.891526,0.891348,0.891526,0.890767
5,tfidf_vec,random_forest,1.0,1.0,1.0,1.0,0.894375,0.894987,0.894375,0.892413


**Better Model**:
    
    LogisticRegression
**Better Vectoriser**:
    
    Word2Vec

### Performing Logistic Regression Hyperparameter Tuning with Word2Vec vectorised input

In [77]:
logistic_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200],
    'class_weight': [None, 'balanced']
}

In [78]:
logistic_grid = RandomizedSearchCV(LogisticRegression(n_jobs=-1),
                                   logistic_params,
                                   cv=5,
                                   scoring='f1')
logistic_grid.fit(X_train_word2vec, y_train)

In [79]:
print("Best parameters for Logistic Regression:", logistic_grid.best_params_)
print("Best score for Logistic Regression:", logistic_grid.best_score_)

Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 200, 'class_weight': None, 'C': 0.1}
Best score for Logistic Regression: nan


In [80]:
# Evaluate on test set
logistic_best = logistic_grid.best_estimator_
y_pred_logistic = logistic_best.predict(X_test_word2vec)
print(classification_report(y_test, y_pred_logistic))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1025
           1       0.92      0.95      0.94       845
           2       0.93      0.93      0.93       777
           3       0.92      0.92      0.92      1105
           4       0.91      0.83      0.87       461

    accuracy                           0.92      4213
   macro avg       0.92      0.92      0.92      4213
weighted avg       0.92      0.92      0.92      4213



### Performing Random Forest Hyperparameter Tuning with Word2Vec vectorised input

In [81]:
# Hyperparameter tuning for Random Forest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

In [82]:
rf_grid = RandomizedSearchCV(RandomForestClassifier(),
                             rf_params,
                             cv=5,
                             scoring='f1',
                            n_jobs=-1)
rf_grid.fit(X_train_word2vec, y_train)

In [83]:
print("Best parameters for Random Forest:", rf_grid.best_params_)
print("Best score for Random Forest:", rf_grid.best_score_)

Best parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None, 'class_weight': 'balanced'}
Best score for Random Forest: nan


In [84]:
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test_word2vec)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1025
           1       0.88      0.92      0.90       845
           2       0.89      0.91      0.90       777
           3       0.89      0.89      0.89      1105
           4       0.89      0.75      0.81       461

    accuracy                           0.89      4213
   macro avg       0.89      0.88      0.88      4213
weighted avg       0.89      0.89      0.89      4213



In [85]:
get_word2vec_vector('I would like to raise a ticket regarding an issue with my mortgage/loan account and request assistance with [briefly describe the issue, e.g., "my monthly payment," "loan application status," or "interest rate clarification"].')

array([ 1.13844268e-01, -4.26322192e-01,  2.74345223e-02,  1.14055015e-01,
       -3.08428556e-01,  9.81488228e-02,  4.75533754e-01,  3.00480485e-01,
        3.44085135e-02, -3.05940777e-01,  3.06768119e-01,  3.43318731e-01,
       -1.69086829e-02,  4.59753990e-01, -3.87222022e-01,  1.46467656e-01,
       -2.04559892e-01, -1.42432362e-01, -5.13573885e-01, -8.34044456e-01,
        1.95293859e-01,  2.07459867e-01, -1.35524943e-01,  2.94456165e-02,
        2.11721897e-01, -1.23489216e-01, -1.58975452e-01, -4.90404248e-01,
        3.48568469e-01, -8.11875835e-02, -1.64196759e-01, -4.11090255e-02,
       -3.35118860e-01, -4.11318988e-01, -1.43005237e-01,  4.14700061e-01,
        9.90844965e-02,  4.20120597e-01,  3.50674540e-01, -8.19889128e-01,
        4.72351193e-01, -3.90500203e-02, -1.35133758e-01,  6.31682277e-01,
        6.29187346e-01, -2.52031833e-01,  3.46173674e-01, -5.76978587e-02,
        6.23302519e-01,  6.52984798e-01, -6.67217672e-01, -6.34962499e-01,
        6.87574208e-01, -

# Testing on whole sentence

In [86]:
test_text = 'I would like to raise a ticket regarding an issue with my mortgage or loan account and request assistance with [briefly describe the issue, e.g., "my monthly payment," "loan application status," or "interest rate clarification"].'
test_word2vec = get_word2vec_vector(get_POS_tags(lemmatize_text(clean_text(test_text))))
logistic_best.predict([test_word2vec])

array([2], dtype=int64)

In [87]:
test_text = 'I want to know how to create a bank account in JP morgan chase bank'
test_word2vec = get_word2vec_vector(get_POS_tags(lemmatize_text(clean_text(test_text))))
logistic_best.predict([test_word2vec])

array([0], dtype=int64)

In [90]:
import re
import numpy as np
from textblob import TextBlob
import spacy

class TextProcessor:
    def __init__(self, word2vec_model):
        self.word2vec_model = word2vec_model
        self.nlp = spacy.load('en_core_web_sm')  # Load your preferred spaCy model

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\b\w*\d\w*\b', '', text)  # Remove words containing numbers
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        return text

    def lemmatize_text(self, text):
        doc = self.nlp(text)
        lemmatized_text = ' '.join([token.lemma_ for token in doc])
        return lemmatized_text

    def get_POS_tags(self, text):
        tags_required = {'NN', 'JJ', 'VB'}
        blob = TextBlob(text)
        relevant_words = ' '.join(
            [word for word, tag in blob.tags if tag in tags_required]
        )
        return relevant_words

    def get_word2vec_vector(self, sentence):
        words = sentence.split()
        word_vectors = [
            self.word2vec_model.wv[word] for word in words if word in self.word2vec_model.wv
        ]
        return sum(word_vectors) / len(word_vectors) if word_vectors else np.zeros(self.word2vec_model.vector_size)

    def process_text(self, text):
        cleaned_text = self.clean_text(text)
        lemmatized_text = self.lemmatize_text(cleaned_text)
        pos_tags_text = self.get_POS_tags(lemmatized_text)
        return self.get_word2vec_vector(pos_tags_text)

In [91]:
# Usage
# Assuming `word2vec_model` is already defined
processor = TextProcessor(word2vec_model)
test_text = 'I want to know how to create a bank account in JP Morgan Chase Bank'
test_word2vec = processor.process_text(test_text)

In [92]:
logistic_best.predict([test_word2vec])

array([0], dtype=int64)

# Saving Word2Vec Model for text preprocessing

In [94]:
with open(r'Models/word2vec_model.pkl', 'wb') as f:
    pickle.dump(word2vec_model, f)

# Saving the Logistic Regression ML model to classify the ticket

In [95]:
with open(r'Models/support_classification_model.pkl', 'wb') as f:
    pickle.dump(logistic_best, f)