# Libraries & Functions

In [1]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
import torch
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_predict

In [5]:
def calculate_scores(y_test, y_pred, average = "binary"):
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    auc = roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred), average = average)
    return [accuracy, precision, recall, f1, auc]

# Input Data

In [6]:
raw_datasets = dict()

## POWO Dataset

In [7]:
working_dir = "..//Datasets//" 

df_POWO_Fam =  pd.read_excel(working_dir + "POWO_Family.xlsx")
df_POWO_Fam_Preproc = df_POWO_Fam.drop_duplicates(subset = ["BERT_description"])
df_POWO_Fam_Preproc = df_POWO_Fam_Preproc[df_POWO_Fam_Preproc["BERT_description"].apply(lambda x: len(x.split(" ")))>10]

POWO_Filter = df_POWO_Fam_Preproc["family"].value_counts().keys().values[:10]
df_POWO_Fam_Preproc = df_POWO_Fam_Preproc[df_POWO_Fam_Preproc["family"].apply(lambda x: x in POWO_Filter)].groupby('family', group_keys=False).apply(lambda x: x.sample(500, random_state = 42))
raw_datasets["POWO"] = df_POWO_Fam_Preproc

## WIKI Dataset

In [8]:
def fix_WIKI(name, description):
    for n in name.split(" "):
        description = str(description).replace(n.lower(), "")
    return description.strip()

In [9]:
working_dir = "..//Datasets//" 

df_WIKI_Fam =  pd.read_excel(working_dir + "WIKI_Family.xlsx")
df_WIKI_Fam_Preproc = df_WIKI_Fam.drop_duplicates(subset = ["BERT_description"])
df_WIKI_Fam_Preproc["BERT_description"] = df_WIKI_Fam_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)
df_WIKI_Fam_Preproc = df_WIKI_Fam_Preproc[df_WIKI_Fam_Preproc["BERT_description"].apply(lambda x: len(str(x).split(" ")))>10]

WIKI_Filter = df_WIKI_Fam_Preproc["family"].value_counts().keys().values[:10]
df_WIKI_Fam_Preproc = df_WIKI_Fam_Preproc[df_WIKI_Fam_Preproc["family"].apply(lambda x: x in WIKI_Filter)].groupby('family', group_keys=False).apply(lambda x: x.sample(500, random_state = 42))
raw_datasets["WIKI"] = df_WIKI_Fam_Preproc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_WIKI_Fam_Preproc["BERT_description"] = df_WIKI_Fam_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)


## Preprocess Data

In [10]:
print("Number of families per dataset:")
for dataset_name in list(raw_datasets.keys()):
    print("\t", dataset_name, raw_datasets[dataset_name]["family"].nunique())

Number of families per dataset:
	 POWO 10
	 WIKI 10


In [11]:
preprocessed_datasets_dict = {}

In [12]:
for dataset_name in list(raw_datasets.keys()):
    labelencoder = LabelEncoder()
    raw_datasets[dataset_name]["family_encoded"] = labelencoder.fit_transform(raw_datasets[dataset_name]["family"])

    X_train, X_test, \
    y_train, y_test, \
    indices_train, indices_test \
    = train_test_split(raw_datasets[dataset_name]["BERT_description"], raw_datasets[dataset_name]["family_encoded"], np.arange(len(raw_datasets[dataset_name])), test_size=0.25, random_state=42)

    preprocessed_datasets_dict[dataset_name] = DatasetDict()
    preprocessed_datasets_dict[dataset_name]["train"] = prepare_data(X_train, y_train)
    preprocessed_datasets_dict[dataset_name]["validation"] = prepare_data(X_test, y_test)

In [13]:
for dataset_name in list(raw_datasets.keys()):
    print(dataset_name)
    print(preprocessed_datasets_dict[dataset_name])
    print()

POWO
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 3750
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1250
    })
})

WIKI
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 3750
    })
    validation: Dataset({
        features: ['text', 'label', 'idx'],
        num_rows: 1250
    })
})



# Classical ML Models

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

results_list = []

for dataset_name in list(raw_datasets.keys()):
    print(dataset_name)
    X_train, X_test, \
    y_train, y_test, \
    indices_train, indices_test \
          = train_test_split(raw_datasets[dataset_name]["BOW_description"], raw_datasets[dataset_name]["family"].values, np.arange(len(raw_datasets[dataset_name])), test_size=0.25, random_state=42)

    vectorizer = CountVectorizer(max_features = 1000)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    lr_mod = LogisticRegression(max_iter=1000, penalty='l2', class_weight='balanced').fit(X_train, y_train)
    y_predict = lr_mod.predict(X_test)

    results = calculate_scores(y_test, y_predict, average = "macro")

    results_list.append([dataset_name] + results + ["Logistic Regression"])
      
df_results_lr = pd.DataFrame(results_list, columns=["Dataset", "Accuracy", "Precision", "Recall", "F1-Score", "AUC", "Model"])

POWO
WIKI


In [16]:
df_results_lr

Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1-Score,AUC,Model
0,POWO,0.9624,0.963219,0.963338,0.963144,0.979572,Logistic Regression
1,WIKI,0.9616,0.961331,0.961633,0.961337,0.978687,Logistic Regression


In [17]:
df_results_lr.to_excel("Results//FamilyClassification_LR_Results.xlsx", index = False)