# Imports

In [1]:
import itertools
import logging
import os
import pickle
import re
from datetime import datetime

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm

# Read data

In [2]:
# Load the raw CSV into a DataFrame; the relative path is resolved against
# the notebook's current working directory.
healthcare_workers_df = pd.read_csv(filepath_or_buffer='hpc_train.csv')


# Preprocessing

In [12]:
# Build the numeric target and the combined text feature, and keep only the
# rows whose label is one of the two known classes.
target_field = "Occupation_Type (HCP/Not HCP)"
mapping_dict = {
    "Hcp": 1,
    "Not hcp": 0
}
inverse_mapping_dict = {v: k for k, v in mapping_dict.items()}
healthcare_workers_df['author_type_numeric'] = healthcare_workers_df[target_field].map(mapping_dict)

# Series.str.cat returns NaN whenever either operand is NaN. Combined with the
# NaN -> '' replacement below, that erased the author's name for every row
# with a missing description (and the description for every row with a
# missing name). Fill the two text columns first so the surviving field is
# kept; rows where both fields are present are concatenated exactly as before.
name_text = healthcare_workers_df['author_full_name'].fillna('')
description_text = healthcare_workers_df['description'].fillna('')
combined_text = name_text.str.cat(description_text, sep=" ")
one_side_empty = (name_text == '') | (description_text == '')
# When one side is empty, use the other side alone (no dangling separator).
healthcare_workers_df['author_full_name_and_description'] = combined_text.mask(
    one_side_empty, name_text + description_text
)

# Keep only rows labelled with one of the classes in mapping_dict.
healthcare_workers_df = healthcare_workers_df[healthcare_workers_df[target_field].isin(list(mapping_dict))]
# Replace any remaining missing values with empty strings.
healthcare_workers_df = healthcare_workers_df.replace(np.nan, '', regex=True)

In [13]:
# Re-number the rows first so that labeled_df / labeled_indexes refer to the
# same index as the frame used by the following cells. Previously they were
# captured before the reset and therefore pointed at the pre-reset labels.
# NOTE: reset_index() keeps the old labels in an 'index' column; re-running
# this cell on the already-reset frame adds a further 'level_0' column.
healthcare_workers_df = healthcare_workers_df.reset_index()

# Rows carrying one of the two numeric class labels.
labeled_df = healthcare_workers_df[healthcare_workers_df["author_type_numeric"].isin([0, 1])]
labeled_indexes = labeled_df.index

In [None]:
X = healthcare_workers_balanced_df['author_full_name_and_description']

y = healthcare_workers_balanced_df['author_type_numeric']
# y =healthcare_workers_df['author_type_numeric'].apply(lambda n: int(n))
y = y.astype('int')

In [None]:
# Separate the instances for each class
df_class_0 = healthcare_workers_df.loc[healthcare_workers_df['author_type_numeric'] == 0]
df_class_1 = healthcare_workers_df.loc[healthcare_workers_df['author_type_numeric'] == 1]

# Get the number of instances for each class
count_class_0, count_class_1 = df_class_0.shape[0], df_class_1.shape[0]

# Determine the size of the subset to select from the majority class
subset_size = count_class_0

# Sample a subset of instances from the majority class
df_class_1_subset = df_class_1.sample(n=subset_size, random_state=42)

# Concatenate the subset of instances from the majority class with all instances from the minority class
healthcare_workers_balanced_df = pd.concat([df_class_1_subset, df_class_0], axis=0)

# Shuffle the dataframe to mix the instances of each class
healthcare_workers_balanced_df = healthcare_workers_balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
healthcare_workers_balanced_df

In [None]:
# Sanity check: number of rows per class in the balanced frame.
healthcare_workers_balanced_df.loc[:, 'author_type_numeric'].value_counts()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Training frame in the "text" / "labels" column layout passed to train_model.
# Use the training split only: the previous version assigned the complete
# X / y (which include the X_test / y_test rows), so the later evaluation on
# X_test scored the model on data it had already been trained on.
train_df = pd.DataFrame({"text": X_train, "labels": y_train})

# Model

In [None]:
!pip install --upgrade transformers


In [6]:
# Silence everything below CRITICAL from the "transformers" logger, then
# configure the root logger at INFO for the rest of the notebook.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.CRITICAL)
logging.basicConfig(level=logging.INFO)

In [None]:
# Single fine-tuning epoch. overwrite_output_dir=True mirrors the k-fold
# cell's configuration so training can be re-run when the output directory
# already holds files from an earlier run.
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir=True)

In [None]:
# RoBERTa-base classifier configured with model_args; use_cuda=False keeps it
# on the CPU.
model = ClassificationModel(
    model_type="roberta",
    model_name="roberta-base",
    args=model_args,
    use_cuda=False,
)


In [None]:
 #Train the model
# Fine-tune on the rows collected in train_df.
model.train_model(train_df=train_df)

In [None]:
# Put the held-out texts and labels into the "text" / "labels" layout and
# score the trained model on them.
eval_df = pd.DataFrame({"text": X_test, "labels": y_test})
result, model_outputs, wrong_predictions = model.eval_model(eval_df)


In [None]:
# Accuracy = correctly classified / all evaluated, from the tp/tn/fp/fn
# counts stored in `result`.
correct = result['tp'] + result['tn']
total = correct + result['fp'] + result['fn']
accuracy = correct / total
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Serialise the trained model object and write the bytes to disk.
with open('saved_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))

# Read the bytes back and rebuild the model object.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('saved_model.pkl', 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

# K-fold cross-validation

In [None]:
# 10-fold cross-validation: train a fresh model per fold and average accuracy.
X = healthcare_workers_balanced_df['author_full_name_and_description']
# Cast the labels to int, as the single-split cell does for its y.
y = healthcare_workers_balanced_df['author_type_numeric'].astype('int')


# Set up cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Loop through each fold
accuracy_scores = []
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Training on fold {fold_idx}...")

    # KFold.split yields POSITIONAL indices. Plain X[idx] is label-based on a
    # Series and only matched positions because the balanced frame happens to
    # carry a 0..n-1 index; .iloc selects by position regardless of the index.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir=True)
    # Initialize a fresh model so no weights carry over between folds
    model = ClassificationModel(
        "roberta", "roberta-base", args=model_args, use_cuda=False
    )

    # Train the model on the training set for this fold
    train_df = pd.DataFrame({"text": X_train, "labels": y_train})
    model.train_model(train_df)

    # Evaluate the model on the test set for this fold
    eval_df = pd.DataFrame({"text": X_test, "labels": y_test})
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    accuracy = (result['tp'] + result['tn']) / (result['tp'] + result['tn'] + result['fp'] + result['fn'])
    accuracy_scores.append(accuracy)
    print(f"Accuracy on fold {fold_idx}: {accuracy:.2f}")

# Print the average accuracy over all folds
print(f"Average accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score whatever `model` / `eval_df` currently hold and keep the raw outputs.
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

# Predicted class = column with the largest raw output; true labels come
# straight from the evaluation frame.
predictions = np.argmax(model_outputs, axis=1)
y_true = eval_df['labels'].tolist()

# Confusion matrix of true labels against predictions.
cm = confusion_matrix(y_true, predictions)

# Annotated heat map of the matrix.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [9]:
import pickle

# Restore the model object previously written to saved_model.pkl.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('saved_model.pkl', 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

In [None]:
# ROC and precision-recall curves for the pickled model, scored on the
# balanced frame.
y_true = healthcare_workers_balanced_df['author_type_numeric']

# y_scores used to be produced only by a commented-out per-row loop, so this
# cell raised NameError on a fresh kernel. Score every text in one predict()
# call and convert the raw outputs to class-1 probabilities with a softmax.
# NOTE(review): the balanced frame may overlap the rows the model was trained
# on; prefer held-out data for an unbiased curve - confirm.
texts = healthcare_workers_balanced_df['author_full_name_and_description'].tolist()
predicted_labels, raw_outputs = loaded_model.predict(texts)
raw_outputs = np.asarray(raw_outputs, dtype=float)
# Subtract the row maximum before exponentiating for numerical stability.
shifted_outputs = raw_outputs - raw_outputs.max(axis=1, keepdims=True)
probabilities = np.exp(shifted_outputs)
probabilities /= probabilities.sum(axis=1, keepdims=True)
y_scores = probabilities[:, 1]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

# Plot precision-recall curve
plt.plot(recall, precision, color='blue', label='Precision-recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()


In [None]:
# Pick the score threshold that maximises F1 along the precision-recall curve.
y_true = healthcare_workers_balanced_df['author_type_numeric']


# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

# precision / recall hold one more entry than thresholds (the final
# precision=1, recall=0 point has no threshold), so drop it before ranking.
# Where precision + recall == 0 the F1 score is set to 0 instead of letting
# 0/0 produce NaN, which np.argmax would otherwise pick as the "best" entry.
precision_plus_recall = precision[:-1] + recall[:-1]
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    precision_plus_recall,
    out=np.zeros_like(precision_plus_recall, dtype=float),
    where=precision_plus_recall > 0,
)
best_threshold = thresholds[np.argmax(f1_scores)]

# Plot precision-recall curve
plt.plot(recall, precision, color='blue', label='Precision-recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

print("Best threshold: {:.4f}".format(best_threshold))


In [None]:
# NOTE(review): this cell repeats the F1-threshold search of the preceding
# cell; consider deleting one of the two.
y_true = healthcare_workers_balanced_df['author_type_numeric']


# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

# precision / recall hold one more entry than thresholds (the final
# precision=1, recall=0 point has no threshold), so drop it before ranking.
# Where precision + recall == 0 the F1 score is set to 0 instead of letting
# 0/0 produce NaN, which np.argmax would otherwise pick as the "best" entry.
precision_plus_recall = precision[:-1] + recall[:-1]
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    precision_plus_recall,
    out=np.zeros_like(precision_plus_recall, dtype=float),
    where=precision_plus_recall > 0,
)
best_threshold = thresholds[np.argmax(f1_scores)]

# Plot precision-recall curve
plt.plot(recall, precision, color='blue', label='Precision-recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

print("Best threshold: {:.4f}".format(best_threshold))


In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np

def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve together with the diagonal reference line and show it.

    Args:
        fpr: False-positive rates, plotted on the x axis.
        tpr: True-positive rates, plotted on the y axis.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()



# ROC analysis: print the AUC, then report the threshold at which
# tpr - fpr is largest, and finally draw the curve.
fpr, tpr, thresholds = roc_curve(y_true, y_scores)

print(roc_auc_score(y_true, y_scores))
tpr_minus_fpr = tpr - fpr
optimal_idx = tpr_minus_fpr.argmax()
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
plot_roc_curve(fpr, tpr)