In [21]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report,accuracy_score, precision_score, recall_score,f1_score

In [22]:
# Variable declarations

embeddings_list = []
label_list = []
unique_label_name = []
accuracy_list = []
# model_list = [
#     ('Logistic Regression', LogisticRegression()),      
#     ('K Nearest Neighbor', KNeighborsClassifier()),
#     ('Random Forest', RandomForestClassifier(random_state=3))]

# pipeline = Pipeline(model_list)

model_dict = {'K Nearest Neighbor': KNeighborsClassifier(n_neighbors= 3 , weights = 'distance'),
              'Logistic Regression': LogisticRegression(),
             'Random Forest': RandomForestClassifier(random_state=3,max_depth=20, criterion = "entropy"),
             'Decsision Tree': DecisionTreeClassifier(random_state=3, max_depth=20)}

model_comparison_df = pd.DataFrame(columns = ['model_name', 'train_accuracy', 'test_accuracy_score', 'precision_score', 'recall_score', 'f1_score'])


In [23]:
def get_labels_and_embeddings():
    data_folder = "Embeddings/Embeddings--bge-large-en"
    global embeddings_list
    if not os.path.exists(data_folder):
        print("Error: Data folder does not exists.")

    data = [os.path.join(data_folder, file) for file in os.listdir(data_folder)]
    
    for filepath in data: 
        data = filepath.split('/')
        label_name = data[-1].removesuffix(".npy")
        unique_label_name.append(label_name)
        with open (filepath, 'rb') as fp:
            embeddings = np.load(fp)
            embeddings_list.append(embeddings)
            size = len(embeddings)
            for label in range(0, size):
                label_list.append(label_name)

    embeddings_list = np.concatenate(embeddings_list, axis = 0)

    return embeddings_list   

In [24]:
    data_folder = "Embeddings/Embeddings--bge-large-en"
    data = [os.path.join(data_folder, file) for file in os.listdir(data_folder)]
    data

['Embeddings/Embeddings--bge-large-en/Earth Sciences.npy',
 'Embeddings/Embeddings--bge-large-en/Pharmacy.npy',
 'Embeddings/Embeddings--bge-large-en/Environment.npy',
 'Embeddings/Embeddings--bge-large-en/Mathematics.npy',
 'Embeddings/Embeddings--bge-large-en/Life Sciences.npy',
 'Embeddings/Embeddings--bge-large-en/Philosophy.npy',
 'Embeddings/Embeddings--bge-large-en/Economics.npy',
 'Embeddings/Embeddings--bge-large-en/Law.npy',
 'Embeddings/Embeddings--bge-large-en/Biomedicine.npy',
 'Embeddings/Embeddings--bge-large-en/Computer Science.npy',
 'Embeddings/Embeddings--bge-large-en/Literature.npy',
 'Embeddings/Embeddings--bge-large-en/Materials Science.npy',
 'Embeddings/Embeddings--bge-large-en/Chemistry.npy',
 'Embeddings/Embeddings--bge-large-en/Statistics.npy',
 'Embeddings/Embeddings--bge-large-en/History.npy',
 'Embeddings/Embeddings--bge-large-en/Social Sciences.npy',
 'Embeddings/Embeddings--bge-large-en/Political Science and International Relations.npy',
 'Embeddings/Emb

In [25]:
def train_model():
    X_train, X_test = train_test_split(embeddings_list, test_size = 0.2, random_state=42)
    y_train, y_test = train_test_split(label_list, test_size=0.2, random_state=42)
    model_name, train_accuracy, test_accuracy, precision, recall, f1score = [], [], [], [], [], []

    for name, model in model_dict.items():
        model_name.append(name)

        model.fit(X_train, y_train)        
        y_pred = model.predict(X_test)  

        train_accuracy.append(model.score(X_train, y_train))
        test_accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred, average='macro'))
        recall.append(recall_score(y_test, y_pred, average='macro'))
        f1score.append(f1_score(y_test, y_pred, average='macro'))

    model_comparison_df = pd.DataFrame(data=zip(model_name,train_accuracy,test_accuracy,precision,recall,f1score),
                                       columns = ['model_name', 'train_accuracy', 'test_accuracy_score', 'precision_score', 'recall_score', 'f1_score']) 
    
    model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
        
    return model_comparison_df  

In [None]:
    discipline_embeddings = get_labels_and_embeddings()
    print(len(discipline_embeddings))
    print(unique_label_name)

    model_comparison_df = train_model()
    print(model_comparison_df)

181322
['Earth Sciences', 'Pharmacy', 'Environment', 'Mathematics', 'Life Sciences', 'Philosophy', 'Economics', 'Law', 'Biomedicine', 'Computer Science', 'Literature', 'Materials Science', 'Chemistry', 'Statistics', 'History', 'Social Sciences', 'Political Science and International Relations', 'Psychology', 'Physics', 'Geography', 'Engineering', 'Medicine & Public Health', 'Business and Management', 'Education']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
with open("model_comparison_df.pkl", "wb") as fout:
    pickle.dump(model_comparison_df, fout)

In [None]:
with open("model_comparison_df.pkl", "rb") as fout:
    df = pickle.load(fout)

In [None]:
df.head()