## Import Libraries

In [66]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import zero_one_loss
from sklearn.ensemble import GradientBoostingClassifier

## Define files

In [2]:
tr_file = 'Data/train-data.dat'
tr_lbl_file = 'Data/train-label.dat'
t_file = 'Data/test-data.dat'
t_lbl_file = 'Data/test-label.dat'
labels_file = 'Data/labels.txt'
vocab_file = 'Data/vocabs.txt'

## Functions to use

In [72]:
def read_data_per_sentence(txt_file, data_file):
    
    dictionary = {}
    data = []
    labels = []
    
    with open(txt_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            # splits the line into two values, the one before the comma (token) and the one after the comma (indx)           
            token, indx = line.split(', ')
            # create a dictionary of the form {indx: token}
            dictionary[int(indx)] = token
    
    with open(data_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            line_data = line.split(' ')
            # takes the first element of line_data by removing <> and taking only the element inside the brackets 
            # e.g. <215> results in 215 because 2,1,5 are the line_data[0][1], line_data[0][2], and line_data[0][3] respectively
            num_doc_sentences = int(line_data[0][1:-1])  
            doc_index = 1
            doc_sentences = []  # Store sentences of the document
            
            # Loop through the sentences of the document
            for i in range(num_doc_sentences):
                # takes the second element of line_data by removing <> and taking only the element inside the brackets
                num_sentence_tokens = int(line_data[doc_index][1:-1])
                doc_index += 1
                sentence = []
                
                for j in range(doc_index, num_sentence_tokens + doc_index):
                    # Take the indexes of the tokens of the sentence and find the corresponding words from the vocabulary
                    sentence.append(dictionary[int(line_data[j])])
                
                doc_index += num_sentence_tokens
                data.append(' '.join(sentence))
            

    return dictionary, data

def multilabel_to_binary(labels, most_frequent_class):

    binary_labels = np.zeros(labels.shape[0], dtype=int)
    binary_labels[labels[:, most_frequent_class] == 1] = 1

    return binary_labels


def number_of_clusters(input_data):
    sse = {}
    
    for k in range(3, 21):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(input_data)
        sse[k] = kmeans.inertia_
        
    # Plot the sse values
    plt.plot(range(3, 21), list(sse.values()), marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.show()    
    
    kn = KneeLocator(x=list(sse.keys()), 
                 y=list(sse.values()), 
                 curve='convex', 
                 direction='decreasing')
    k = kn.knee 
    return k


def sentence_cluster(data,labels):
    cluster_sentences = {}
    
    # Iterate over each sentence and its corresponding cluster label
    for sentence, cluster_label in zip(data, labels):
        cluster_sentences[sentence] = cluster_label
        
    return cluster_sentences  


def read_data_per_doc(vocab, data_file, labels_file, cluster_sentences=None):
    data = []
    assigned_data = []
    labels = []
    
    with open(data_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            line_data = line.split(' ')
            num_doc_sentences = int(line_data[0][1:-1])  
            doc_index = 1
            doc = ''
            sent_in_doc = ''
            cluster = []
            # loop through the sentences of the document
            for i in range(num_doc_sentences):
                # takes the second element of line_data by removing <> and taking only the element inside the brackets
                num_sentence_tokens = int(line_data[doc_index][1:-1])           
                doc_index += 1
                sent_in_doc = ''
                for i in range(doc_index, num_sentence_tokens + doc_index):
                    sent_in_doc += vocab[int(line_data[i])] + ' '
                    doc += vocab[int(line_data[i])] + ' '
                doc_index += num_sentence_tokens
                if cluster_sentences is not None:
                    cluster.append(cluster_sentences[sent_in_doc.rstrip()])
            data.append(doc)
            assigned_data.append(cluster)
            
    with open(labels_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            doc_labels = [int(output) for output in line.split(' ')]
            labels.append(doc_labels)            
            
    return data, assigned_data, np.int8(labels)


def docs_per_cluster(num_clusters, data):

    # Determine the total number of columns
    num_columns = num_clusters
    
    # Create an empty DataFrame with the appropriate number of columns
    column_names = [f'cluster_{i}' for i in range(num_columns)] 
    df = pd.DataFrame(columns=column_names)
    
    # Iterate over the documents and populate the DataFrame
    for doc in data:
        row = [0] * num_columns  # Initialize a row with all zeros
        for group in doc:
            row[group] += 1  # Increment the count for the corresponding group
        df.loc[len(df)] = row  # Append the row to the DataFrame
        
    return df

def train_binary_clf(x_train, y_train, x_test, y_test):
    classifiers = {
        'Gaussian Naive Bayes Classifier': GaussianNB(),
        'Logistic Regression Classifier': LogisticRegression(random_state=42),
        'Support Vector Machine Classifier': LinearSVC(random_state=42),
        'K-nearest Neighbors Classifier': KNeighborsClassifier(n_neighbors=5),
        'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)
    }
    
    for classifier_name, clf in classifiers.items():
        clf.fit(x_train, y_train)
        pred_labels = clf.predict(x_test)
        print('\nEvaluating {}'.format(classifier_name))
        print(classification_report(y_test, pred_labels, zero_division='warn'))
        print('Subset accuracy = {}'.format((1 - zero_one_loss(test_binary_labels, pred_labels))))

# First Approach 

## Read data from files - Sentence level

In [4]:
vocab, train_sent = read_data_per_sentence(vocab_file, tr_file)
_, test_sent = read_data_per_sentence(vocab_file, t_file)

In [5]:
print('\nFirst 3 train sentences\n')
for i in range(3):
    print('{}:\t{}'.format(i, train_sent[i]))
print('\nExtracted {} sentences'.format(len(train_sent)))


First 3 train sentences

0:	rubi rail helper demo more info auto complet
1:	see new helper action
2:	null length substr locat

Extracted 149925 sentences


In [6]:
print('\nExtracted {} sentences'.format(len(test_sent)))


Extracted 73363 sentences


## Turn sentences into vectors

In [7]:
tfidf_vectorizer = TfidfVectorizer()
train_inputs = tfidf_vectorizer.fit_transform(train_sent)
test_inputs = tfidf_vectorizer.transform(test_sent)

train_inputs.shape, test_inputs.shape

((149925, 8520), (73363, 8520))

## Cluster training sentences - kmeans

In [8]:
# k = number_of_clusters(train_inputs)
k = 13
kmeans = KMeans(n_clusters=k, random_state=42)
train_clusters = kmeans.fit(train_inputs)
test_clusters = kmeans.predict(test_inputs)
cluster_labels=kmeans.labels_



In [9]:
from collections import Counter
cluster_counts = Counter(cluster_labels)
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster}: {count} sentences")

Cluster 6: 5006 sentences
Cluster 12: 4037 sentences
Cluster 0: 106234 sentences
Cluster 10: 4499 sentences
Cluster 9: 3555 sentences
Cluster 7: 8569 sentences
Cluster 4: 1800 sentences
Cluster 8: 3940 sentences
Cluster 1: 2526 sentences
Cluster 5: 3876 sentences
Cluster 2: 1943 sentences
Cluster 3: 730 sentences
Cluster 11: 3210 sentences


In [10]:
for i, cluster_label in enumerate(test_clusters):
    print(f"Test sentence {i}: Cluster {cluster_label}")

Test sentence 0: Cluster 11
Test sentence 1: Cluster 7
Test sentence 2: Cluster 0
Test sentence 3: Cluster 2
Test sentence 4: Cluster 0
Test sentence 5: Cluster 0
Test sentence 6: Cluster 0
Test sentence 7: Cluster 10
Test sentence 8: Cluster 0
Test sentence 9: Cluster 0
Test sentence 10: Cluster 10
Test sentence 11: Cluster 0
Test sentence 12: Cluster 0
Test sentence 13: Cluster 0
Test sentence 14: Cluster 2
Test sentence 15: Cluster 10
Test sentence 16: Cluster 0
Test sentence 17: Cluster 0
Test sentence 18: Cluster 1
Test sentence 19: Cluster 6
Test sentence 20: Cluster 0
Test sentence 21: Cluster 0
Test sentence 22: Cluster 7
Test sentence 23: Cluster 7
Test sentence 24: Cluster 0
Test sentence 25: Cluster 7
Test sentence 26: Cluster 7
Test sentence 27: Cluster 0
Test sentence 28: Cluster 7
Test sentence 29: Cluster 0
Test sentence 30: Cluster 0
Test sentence 31: Cluster 0
Test sentence 32: Cluster 8
Test sentence 33: Cluster 0
Test sentence 34: Cluster 0
Test sentence 35: Cluster 

Test sentence 55338: Cluster 0
Test sentence 55339: Cluster 8
Test sentence 55340: Cluster 0
Test sentence 55341: Cluster 0
Test sentence 55342: Cluster 0
Test sentence 55343: Cluster 0
Test sentence 55344: Cluster 5
Test sentence 55345: Cluster 0
Test sentence 55346: Cluster 0
Test sentence 55347: Cluster 9
Test sentence 55348: Cluster 0
Test sentence 55349: Cluster 0
Test sentence 55350: Cluster 0
Test sentence 55351: Cluster 0
Test sentence 55352: Cluster 0
Test sentence 55353: Cluster 0
Test sentence 55354: Cluster 0
Test sentence 55355: Cluster 0
Test sentence 55356: Cluster 0
Test sentence 55357: Cluster 7
Test sentence 55358: Cluster 11
Test sentence 55359: Cluster 0
Test sentence 55360: Cluster 0
Test sentence 55361: Cluster 0
Test sentence 55362: Cluster 0
Test sentence 55363: Cluster 0
Test sentence 55364: Cluster 6
Test sentence 55365: Cluster 0
Test sentence 55366: Cluster 0
Test sentence 55367: Cluster 5
Test sentence 55368: Cluster 10
Test sentence 55369: Cluster 0
Test s

Test sentence 65876: Cluster 0
Test sentence 65877: Cluster 0
Test sentence 65878: Cluster 0
Test sentence 65879: Cluster 0
Test sentence 65880: Cluster 0
Test sentence 65881: Cluster 0
Test sentence 65882: Cluster 0
Test sentence 65883: Cluster 0
Test sentence 65884: Cluster 6
Test sentence 65885: Cluster 0
Test sentence 65886: Cluster 6
Test sentence 65887: Cluster 0
Test sentence 65888: Cluster 6
Test sentence 65889: Cluster 0
Test sentence 65890: Cluster 0
Test sentence 65891: Cluster 1
Test sentence 65892: Cluster 0
Test sentence 65893: Cluster 0
Test sentence 65894: Cluster 0
Test sentence 65895: Cluster 0
Test sentence 65896: Cluster 0
Test sentence 65897: Cluster 0
Test sentence 65898: Cluster 0
Test sentence 65899: Cluster 0
Test sentence 65900: Cluster 4
Test sentence 65901: Cluster 0
Test sentence 65902: Cluster 7
Test sentence 65903: Cluster 1
Test sentence 65904: Cluster 1
Test sentence 65905: Cluster 1
Test sentence 65906: Cluster 0
Test sentence 65907: Cluster 0
Test sen

## Create a dictionary to store the sentences and their cluster

In [28]:
cluster_tr_sentences = sentence_cluster(train_sent, cluster_labels)

In [31]:
cluster_tr_sentences

{'rubi rail helper demo more info auto complet': 6,
 'see new helper action': 12,
 'null length substr locat': 0,
 'exec messag messag pleas edit remov follow word content': 0,
 'roll stone com news song previou next page': 0,
 'good vibrat beach boy': 0,
 'smell teen spirit nirvana': 0,
 'want hold hand beatl': 0,
 'hound dog elvi': 0,
 'god know beach boy': 0,
 'walk line johnni cash': 0,
 'heaven led zeppelin': 0,
 'sympathi devil roll stone': 0,
 'river deep mountain high turner': 0,
 'woman cri bob': 0,
 'day buddi holli': 0,
 'georgia mind ray charl': 0,
 'heartbreak hotel elvi': 0,
 'bridg over troubl water simon': 0,
 'track tear robinson miracl': 0,
 'messag flash five': 0,
 'man love woman': 0,
 'long tall salli littl richard': 0,
 'whole jerri lee lewi': 0,
 'california girl beach boy': 0,
 'brand new bag jame brown': 12,
 'whole love led zeppelin': 0,
 'strawberri field forev beatl': 0,
 'mysteri train elvi': 0,
 'feel good jame brown': 0,
 'substr remov addit valu return f

In [29]:
cluster_t_sentences = sentence_cluster(test_sent, test_clusters)

In [32]:
cluster_t_sentences

{'here ad comment motiv behind convent': 11,
 'apolog length realli use stuff': 7,
 'base previou standard clear few area confus': 0,
 'rule name don use acronym': 2,
 'exampl gener don creat doubl see special section below': 0,
 'don make class': 0,
 'find out someth similar alreadi exist first': 0,
 'one sub optim way better five perfect way': 10,
 'fewer class better don gener earli': 0,
 'code easier read call well known function local function': 0,
 'code reus one librari portion system': 10,
 'chang share code chang both place': 0,
 'don put code class need class data': 0,
 'don someon read code make assumpt base current implement': 0,
 'chang function method intent chang name': 2,
 'turn out one import rule': 10,
 'even unit test realli hard track depend': 0,
 'need code coverag tool exhaust test': 0,
 'origin lot file singl directori each file itself be modul': 1,
 'more recent python project use more model': 6,
 'list list alway item same type': 0,
 'posit inform list matter':

## Read data from files - Document level
### Represent document as a vector of clusters

In [34]:
train_doc, train_doc_clustered, train_labels = read_data_per_doc(vocab, tr_file,tr_lbl_file, cluster_tr_sentences)
test_doc, test_doc_clustered, test_labels = read_data_per_doc(vocab, t_file,t_lbl_file, cluster_t_sentences)

In [36]:
print('\nFirst 5 train documents\n')
for i in range(5):
    print('{}:\t{}'.format(i, train_doc[i]))
print('\nExtracted {} documents'.format(len(train_doc)))

print()
for i in range(5):
    print('{}:\t{}'.format(i, train_doc_clustered[i]))
print('\nExtracted {} documents'.format(len(train_doc_clustered)))


First 5 train documents

0:	rubi rail helper demo more info auto complet see new helper action 
1:	null length substr locat exec messag messag pleas edit remov follow word content roll stone com news song previou next page good vibrat beach boy smell teen spirit nirvana want hold hand beatl hound dog elvi god know beach boy walk line johnni cash heaven led zeppelin sympathi devil roll stone river deep mountain high turner woman cri bob day buddi holli georgia mind ray charl heartbreak hotel elvi bridg over troubl water simon track tear robinson miracl messag flash five man love woman long tall salli littl richard whole jerri lee lewi california girl beach boy brand new bag jame brown whole love led zeppelin strawberri field forev beatl mysteri train elvi feel good jame brown 
2:	substr remov addit valu return function result fals progress through version final present adopt wide rang industri project spring focus around provid way manag busi object spring both comprehens modular maxim

In [37]:
print('\nFirst 5 test documents\n')
for i in range(5):
    print('{}:\t{}'.format(i, test_doc[i]))
print('\nExtracted {} documents'.format(len(test_doc)))

print()
for i in range(5):
    print('{}:\t{}'.format(i, test_doc_clustered[i]))
print('\nExtracted {} documents'.format(len(test_doc_clustered)))


First 5 test documents

0:	here ad comment motiv behind convent apolog length realli use stuff base previou standard clear few area confus rule name don use acronym exampl gener don creat doubl see special section below don make class find out someth similar alreadi exist first one sub optim way better five perfect way fewer class better don gener earli code easier read call well known function local function code reus one librari portion system chang share code chang both place don put code class need class data don someon read code make assumpt base current implement chang function method intent chang name turn out one import rule even unit test realli hard track depend need code coverag tool exhaust test origin lot file singl directori each file itself be modul more recent python project use more model list list alway item same type posit inform list matter fact today add use tupl local track depend between place use someth place pack someth hard same fragil exist lot lisp softwar 

## Convert our multi-label problem to a binary classification problem

In [38]:
train_labels

array([[1, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int8)

In [39]:
test_labels

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int8)

In [42]:
class_frequencies = np.sum(train_labels, axis=0)
most_frequent_class = np.argmax(class_frequencies)
print('Most frequent class:',format(most_frequent_class))

Most frequent class: 2


In [44]:
train_binary_labels = multilabel_to_binary(train_labels,most_frequent_class)
test_binary_labels = multilabel_to_binary(test_labels,most_frequent_class)

for i in range(10):
    print('Original: {}\tBinary: {}'.format(train_labels[i], train_binary_labels[i]))

Original: [1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]	Binary: 1
Original: [0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]	Binary: 1
Original: [1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0]	Binary: 0
Original: [1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]	Binary: 1
Original: [1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0]	Binary: 1
Original: [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0]	Binary: 0


## Modify the datasets. Columns are now cluster numbers

In [54]:
train_doc_clustered = docs_per_cluster(k, train_doc_clustered)

In [55]:
train_doc_clustered

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12
0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,27,0,0,0,0,0,0,0,0,0,0,0,1
2,20,0,0,0,0,0,1,6,0,1,3,0,0
3,2,0,0,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,2,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8246,7,0,0,0,0,0,0,0,0,0,0,2,0
8247,27,0,0,0,0,0,0,0,0,0,0,0,0
8248,21,0,0,0,0,0,0,0,3,0,0,3,3
8249,3,1,0,0,0,0,0,0,1,0,0,0,0


In [56]:
train_binary_labels.shape

(8251,)

In [64]:
test_doc_clustered = docs_per_cluster(k, test_doc_clustered)

In [58]:
test_doc_clustered

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12
0,17,1,2,0,0,0,1,6,0,0,3,1,0
1,22,0,0,0,0,2,2,0,1,1,2,1,0
2,24,0,0,0,0,2,1,1,0,2,1,0,0
3,10,0,0,0,1,3,0,1,2,0,0,0,2
4,1,0,0,1,0,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3978,5,0,0,0,0,0,2,0,0,0,0,0,0
3979,19,2,0,0,0,4,1,5,0,0,0,0,0
3980,19,0,0,0,0,0,3,2,3,0,1,0,3
3981,2,0,0,1,0,0,0,0,0,0,0,0,0


In [47]:
test_doc_clustered.shape

(3983, 13)

## Train and evaluate binary classification

In [73]:
train_binary_clf(train_doc_clustered, train_binary_labels, test_doc_clustered, test_binary_labels)


Evaluating Gaussian Naive Bayes Classifier
              precision    recall  f1-score   support

           0       0.63      0.84      0.72      2425
           1       0.46      0.22      0.30      1558

    accuracy                           0.60      3983
   macro avg       0.54      0.53      0.51      3983
weighted avg       0.56      0.60      0.55      3983

Subset accuracy = 0.5957820738137083

Evaluating Logistic Regression Classifier
              precision    recall  f1-score   support

           0       0.62      0.96      0.75      2425
           1       0.52      0.07      0.13      1558

    accuracy                           0.61      3983
   macro avg       0.57      0.51      0.44      3983
weighted avg       0.58      0.61      0.51      3983

Subset accuracy = 0.6110971629425056





Evaluating Support Vector Machine Classifier
              precision    recall  f1-score   support

           0       0.62      0.95      0.75      2425
           1       0.54      0.08      0.15      1558

    accuracy                           0.61      3983
   macro avg       0.58      0.52      0.45      3983
weighted avg       0.59      0.61      0.51      3983

Subset accuracy = 0.6138589003263871

Evaluating K-nearest Neighbors Classifier
              precision    recall  f1-score   support

           0       0.63      0.73      0.68      2425
           1       0.44      0.33      0.37      1558

    accuracy                           0.57      3983
   macro avg       0.53      0.53      0.53      3983
weighted avg       0.55      0.57      0.56      3983

Subset accuracy = 0.5731860406728596

Evaluating Random Forest Classifier
              precision    recall  f1-score   support

           0       0.63      0.80      0.70      2425
           1       0.46      0.27    

# Second Approach 

## Read data from files - Document level

In [68]:
train_doc, _, train_labels = read_data_per_doc(vocab, tr_file,tr_lbl_file)
test_doc, _, test_labels = read_data_per_doc(vocab, t_file,t_lbl_file)

## Turn docs into vectors

In [69]:
tfidf_vectorizer = TfidfVectorizer()
train_inputs = tfidf_vectorizer.fit_transform(train_doc)
test_inputs = tfidf_vectorizer.transform(test_doc)

train_inputs.shape, test_inputs.shape

((8251, 8520), (3983, 8520))

## Convert our multi-label problem to a binary classification problem

In [70]:
class_frequencies = np.sum(train_labels, axis=0)
most_frequent_class = np.argmax(class_frequencies)
train_binary_labels = multilabel_to_binary(train_labels,most_frequent_class)
test_binary_labels = multilabel_to_binary(test_labels,most_frequent_class)

## Train and evaluate binary classification

In [75]:
train_binary_clf(train_inputs.toarray(), train_binary_labels, test_inputs.toarray(), test_binary_labels)


Evaluating Gaussian Naive Bayes Classifier
              precision    recall  f1-score   support

           0       0.69      0.41      0.51      2425
           1       0.44      0.72      0.54      1558

    accuracy                           0.53      3983
   macro avg       0.56      0.56      0.53      3983
weighted avg       0.59      0.53      0.53      3983

Subset accuracy = 0.5289982425307557

Evaluating Logistic Regression Classifier
              precision    recall  f1-score   support

           0       0.69      0.87      0.77      2425
           1       0.66      0.40      0.49      1558

    accuracy                           0.68      3983
   macro avg       0.67      0.63      0.63      3983
weighted avg       0.68      0.68      0.66      3983

Subset accuracy = 0.6834044689932212

Evaluating Support Vector Machine Classifier
              precision    recall  f1-score   support

           0       0.70      0.76      0.73      2425
           1       0.57      0