In [1]:
import pandas as pd

In [None]:
# Read the raw training data from a CSV file located relative to the notebook's working directory.
healthcare_workers_df = pd.read_csv(filepath_or_buffer='hpc_train_1529.csv')


In [None]:
import numpy as np
# Column holding the raw class label, and the mapping from label text to the
# numeric class id stored in 'author_type_numeric'.
target_field = "Occupation_Type (HCP/Not HCP)"
mapping_dict = {
    "Hcp": 1,
    "Not hcp": 0
}
inverse_mapping_dict = {v: k for k, v in mapping_dict.items()}

# Keep only rows whose label is one of the two mapped classes. Filtering first
# means the numeric label column below never contains NaN, so it can be a true
# integer column instead of being silently upcast to float.
healthcare_workers_df = healthcare_workers_df[
    healthcare_workers_df[target_field].isin(list(mapping_dict))
].copy()
healthcare_workers_df['author_type_numeric'] = (
    healthcare_workers_df[target_field].map(mapping_dict).astype(int)
)

# Build the text feature. Without na_rep, str.cat returns NaN as soon as either
# side is missing, which (after the NaN -> '' replacement below) would wipe out
# the author name for every row that lacks a description, and vice versa.
healthcare_workers_df['author_full_name_and_description'] = (
    healthcare_workers_df['author_full_name'].str.cat(
        healthcare_workers_df['description'], sep=" ", na_rep=''
    )
)

# Replace any remaining missing values in the frame with empty strings.
healthcare_workers_df = healthcare_workers_df.replace(np.nan, '', regex=True)

In [None]:
import pandas as pd

# Separate the instances for each class
df_class_0 = healthcare_workers_df.loc[healthcare_workers_df['author_type_numeric'] == 0]
df_class_1 = healthcare_workers_df.loc[healthcare_workers_df['author_type_numeric'] == 1]

# Get the number of instances for each class
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Decide which class is the majority from the data instead of assuming it is
# class 1: sampling more rows than a class contains (without replacement)
# raises a ValueError. Ties keep class 1 in the "majority" slot.
if count_class_1 >= count_class_0:
    df_majority, df_minority = df_class_1, df_class_0
else:
    df_majority, df_minority = df_class_0, df_class_1

# Determine the size of the subset to select from the majority class
subset_size = len(df_minority)

# Sample a subset of instances from the majority class
df_majority_subset = df_majority.sample(n=subset_size, random_state=42)

# Concatenate the subset of instances from the majority class with all instances from the minority class
healthcare_workers_balanced_df = pd.concat([df_majority_subset, df_minority], axis=0)

# Shuffle the dataframe to mix the instances of each class; the index is reset
# so that row labels run 0..n-1 in the shuffled order.
healthcare_workers_balanced_df = healthcare_workers_balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
healthcare_workers_balanced_df

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
# 10-fold cross-validation of a RoBERTa text classifier on the balanced data.
# Input text is the combined author name + description column; the target is
# the numeric class column.
X = healthcare_workers_balanced_df['author_full_name_and_description']
y = healthcare_workers_balanced_df['author_type_numeric']

# Per-fold metric history (one value appended per completed fold)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
# Set up cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# NOTE(review): ClassificationArgs and ClassificationModel are not imported in
# any visible cell -- presumably they come from simpletransformers.classification;
# confirm and add the import to the notebook's import cell so that
# Restart Kernel -> Run All works.

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Training on fold {fold_idx}...")

    # Split the data into training and testing sets for this fold.
    # kf.split yields positional indices, so select with .iloc: plain X[idx]
    # is a label lookup and is only correct while the index is exactly 0..n-1.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir=True)

    # Initialize a fresh model for every fold so no weights carry over
    model = ClassificationModel(
        "roberta", "roberta-base", args=model_args, use_cuda=False
    )

    # Train the model on the training set for this fold
    train_df = pd.DataFrame({"text": X_train, "labels": y_train})
    model.train_model(train_df)

    # Evaluate the model on the test set for this fold
    eval_df = pd.DataFrame({"text": X_test, "labels": y_test})
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)

    # Calculate evaluation metrics: the predicted class is the arg-max over the
    # model outputs; accuracy is derived from the confusion counts in `result`.
    y_true = eval_df["labels"]
    y_pred = np.argmax(model_outputs, axis=1)
    accuracy = (result['tp'] + result['tn']) / (result['tp'] + result['tn'] + result['fp'] + result['fn'])
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Accuracy on fold {fold_idx}: {accuracy:.2f}")
    print(f"Precision on fold {fold_idx}: {precision:.2f}")
    print(f"Recall on fold {fold_idx}: {recall:.2f}")
    print(f"F1-score on fold {fold_idx}: {f1:.2f}")

    # Rewrite the results file after every fold so partial results survive an
    # interrupted run.
    pd.DataFrame({
        'accuracy': accuracy_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'f1_score': f1_scores
    }).to_csv('results_1.csv')

# Print the average evaluation metrics over all folds
print(f"Average accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}")
print(f"Average precision: {sum(precision_scores) / len(precision_scores):.2f}")
print(f"Average recall: {sum(recall_scores) / len(recall_scores):.2f}")
print(f"Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}")
