In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, confusion_matrix

# This block imports the necessary libraries for data manipulation, text processing, machine learning models, and evaluation metrics.

In [None]:
# Function to iteratively classify documents using the Active Learning strategy
def classify_priority_documents(texts, model, tfidf_vectorizer, max_non_relevant_docs=50, labeled_documents=None, batch_size=10):
    """Interactively label documents, presenting the highest-priority ones first.

    In each round the still-unlabeled abstracts are ranked by
    ``select_priority_documents`` and the top ``batch_size`` of them are shown
    to the user one at a time. Every answered document is recorded together
    with the model's own prediction for it. The session ends when no unlabeled
    documents remain or when ``max_non_relevant_docs`` documents in a row have
    been labeled non-relevant.

    Parameters
    ----------
    texts : pandas.DataFrame
        Candidate documents; must contain an 'Abstract' column. The caller's
        frame is not modified (only the local name is rebound).
    model : object
        Fitted classifier. ``predict`` is called here; the object is also
        forwarded to ``select_priority_documents``.
    tfidf_vectorizer : object
        Fitted vectorizer whose ``transform`` output is accepted by ``model``.
    max_non_relevant_docs : int, default 50
        Number of consecutive 'n' answers that stops the session.
    labeled_documents : set or None, default None
        Abstracts that are already labeled and must not be shown. A set passed
        by the caller is updated in place with every newly labeled abstract.
        ``None`` starts from a fresh empty set on every call.
    batch_size : int, default 10
        How many priority documents are requested per round.

    Returns
    -------
    pandas.DataFrame
        One row per answered document with the columns 'Abstract',
        'label' ('y' or 'n', as typed by the user) and 'predicted'
        (the value returned by ``model.predict``).
    """
    # A default of ``set()`` in the signature would be created once and shared
    # by every call, so a second session would silently skip everything labeled
    # in the first one. Create the set per call instead.
    if labeled_documents is None:
        labeled_documents = set()

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    non_relevant_consecutive = 0

    while len(texts) > 0:
        # Filter unlabeled documents
        unlabeled_texts = texts[~texts['Abstract'].isin(labeled_documents)]

        # Check if there are unlabeled documents
        if len(unlabeled_texts) == 0:
            break

        # Select priority documents among the unlabeled
        priority_texts, _ = select_priority_documents(unlabeled_texts['Abstract'], model, tfidf_vectorizer, batch_size)

        # Nothing was selected (e.g. batch_size <= 0): stop instead of spinning forever
        if len(priority_texts) == 0:
            break

        # Present priority documents to the user one by one
        for i, text in enumerate(priority_texts, start=1):
            print(f"\nDocument {i}:")
            print(text)
            print()

            # Ask the user to label the document
            label = input("Is this document relevant? (y/n): ").lower()

            # Check if the user provided a valid response
            while label not in ['y', 'n', '']:
                print("Please respond with 'y' for yes, 'n' for no, or press Enter to skip.")
                label = input("Is this document relevant? (y/n): ").lower()

            # If the user presses Enter, the document is ignored. It is also
            # taken out of the remaining pool, otherwise it would be ranked
            # first again in every following round and the loop could never
            # move past a fully skipped batch.
            if label == '':
                print("Document ignored.")
                texts = texts[texts['Abstract'] != text]
                continue

            # Add the document, label, and prediction to the classified data
            prediction = model.predict(tfidf_vectorizer.transform([text]))[0]  # Model prediction
            records.append({'Abstract': text, 'label': label, 'predicted': prediction})
            labeled_documents.add(text)  # Add document to labeled

            # Remove the classified document from the remaining texts by value.
            # The indices returned by select_priority_documents are positions in
            # ``unlabeled_texts``; they do not address rows of ``texts``, which
            # additionally shrinks while this loop runs.
            texts = texts[texts['Abstract'] != text]

            # Count non-relevant documents
            if label == 'n':
                non_relevant_consecutive += 1
                if non_relevant_consecutive >= max_non_relevant_docs:
                    print(f"\nReached the limit of {max_non_relevant_docs} consecutive non-relevant documents. Stopping the process.")
                    break
            else:
                non_relevant_consecutive = 0

        # The break above only leaves the for loop; leave the while loop as well
        if non_relevant_consecutive >= max_non_relevant_docs:
            break

    return pd.DataFrame(records, columns=['Abstract', 'label', 'predicted'])

# Function to select priority documents
def select_priority_documents(texts, model, tfidf_vectorizer, num_docs):
    """Pick the ``num_docs`` texts the model scores highest in probability column 1.

    ``texts`` is vectorized with ``tfidf_vectorizer.transform`` and scored with
    ``model.predict_proba``; documents are ranked by the second probability
    column in descending order.

    Returns a tuple ``(selected_texts, priority_indices)`` where
    ``priority_indices`` holds the positions of the chosen documents within
    ``texts`` and ``selected_texts`` is ``texts.iloc[priority_indices]``.
    """
    features = tfidf_vectorizer.transform(texts)
    # Column 1 of predict_proba is used as the relevance score
    relevance_scores = model.predict_proba(features)[:, 1]
    # argsort is ascending, so reverse it before keeping the first num_docs positions
    ascending_order = relevance_scores.argsort()
    priority_indices = ascending_order[::-1][:num_docs]
    selected_texts = texts.iloc[priority_indices]
    return selected_texts, priority_indices

def calculate_recall(classification_results, threshold=0.5):
    """Compute recall and confusion-matrix counts for one review session.

    Parameters
    ----------
    classification_results : pandas.DataFrame
        Must contain a 'label' column holding 'y'/'n' and a 'predicted'
        column. Predictions equal to 1 / 0 are translated to 'y' / 'n';
        any other value (for example one that already is 'y' or 'n') is
        used unchanged. The frame is not modified.
    threshold : float, default 0.5
        Accepted for backward compatibility only; it is not used, because the
        stored predictions are already discrete.

    Returns
    -------
    tuple
        ``(recall, tp, fp, tn, fn)`` with 'y' as the positive class.
    """
    prediction_to_label = {1: 'y', 0: 'n'}
    true_labels = classification_results['label']
    # Translate into a new Series instead of overwriting the caller's column:
    # overwriting made a second call (e.g. re-running the notebook cell) map
    # the already translated 'y'/'n' values to NaN.
    predicted_labels = classification_results['predicted'].map(
        lambda value: prediction_to_label.get(value, value)
    )

    # Calculate classification metrics
    recall = recall_score(true_labels, predicted_labels, average='binary', pos_label='y')
    # Fixing the label order guarantees a 2x2 matrix laid out as
    # [[tn, fp], [fn, tp]] even when only one class occurs in the data;
    # without it the matrix can be 1x1 and the unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=['n', 'y']).ravel()
    return recall, tp, fp, tn, fn

# This block contains all the functions defined for the classification process, including classifying priority documents, selecting priority documents, and calculating recall.

In [None]:
# Load the data of manually labeled studies from a CSV file.
# Rows without a label cannot be used for training, and rows without an
# abstract would make TfidfVectorizer.fit_transform raise on the NaN document,
# so both are dropped. The result is assigned in one step (no inplace
# mutation), which keeps this cell safe to re-run.
training_data = pd.read_csv('file.csv').dropna(subset=['Abstract', 'label'])
training_texts = training_data['Abstract']
training_labels = training_data['label']

# Extract features from texts using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(training_texts)

# In this block, the data is loaded from a CSV file and prepared for training.

In [None]:
# Train the Logistic Regression model with the training data
logistic_model = LogisticRegression(class_weight='balanced')
logistic_model.fit(X_train_tfidf, training_labels)

# Train the Naive Bayes model with the training data
naive_bayes_model = MultinomialNB(alpha=3.822)
naive_bayes_model.fit(X_train_tfidf, training_labels)

# Load the data of unlabeled studies from a CSV file.
# Rows without an abstract are dropped because the vectorizer cannot
# transform a NaN document.
# NOTE(review): this reads 'file.csv', the same path the training data is
# loaded from - confirm that the unlabeled set is meant to come from that file.
unlabeled_data = pd.read_csv('file.csv').dropna(subset=['Abstract'])

# Set the threshold of consecutive non-relevant documents
max_non_relevant_docs = 50

# Call the function to classify priority documents with the Logistic Regression model.
# A fresh labeled-documents set is passed explicitly to each session so that the
# Naive Bayes session below does not start with the documents that were labeled
# during the Logistic Regression session already marked as done.
logistic_classification_results = classify_priority_documents(
    unlabeled_data,
    logistic_model,
    tfidf_vectorizer,
    max_non_relevant_docs,
    labeled_documents=set(),
)

# Save the classified results to CSV files
logistic_classification_results.to_csv('results_lr.csv', index=False)

# Call the function to classify priority documents with the Naive Bayes model
naive_bayes_classification_results = classify_priority_documents(
    unlabeled_data,
    naive_bayes_model,
    tfidf_vectorizer,
    max_non_relevant_docs,
    labeled_documents=set(),
)

# Save the classified results to CSV files
naive_bayes_classification_results.to_csv('results_nb.csv', index=False)

# This block creates and trains the Logistic Regression and Naive Bayes models on the training data, loads the unlabeled documents, and sets the limit of consecutive non-relevant documents that stops a review session. The unlabeled documents are then reviewed once per model, and each session's results are saved to a CSV file.

In [None]:
# Compute recall and the confusion-matrix counts for each review session
logistic_metrics = calculate_recall(logistic_classification_results)
recall_logistic, tp_logistic, fp_logistic, tn_logistic, fn_logistic = logistic_metrics
naive_bayes_metrics = calculate_recall(naive_bayes_classification_results)
recall_naive_bayes, tp_naive_bayes, fp_naive_bayes, tn_naive_bayes, fn_naive_bayes = naive_bayes_metrics

# Report the metrics, one caption/value pair per line
logistic_report = [
    ("Recall for Logistic Regression:", recall_logistic),
    ("True Positives (TP) for Logistic Regression:", tp_logistic),
    ("False Positives (FP) for Logistic Regression:", fp_logistic),
    ("True Negatives (TN) for Logistic Regression:", tn_logistic),
    ("False Negatives (FN) for Logistic Regression:", fn_logistic),
]
naive_bayes_report = [
    ("Recall for Naive Bayes:", recall_naive_bayes),
    ("True Positives (TP) for Naive Bayes:", tp_naive_bayes),
    ("False Positives (FP) for Naive Bayes:", fp_naive_bayes),
    ("True Negatives (TN) for Naive Bayes:", tn_naive_bayes),
    ("False Negatives (FN) for Naive Bayes:", fn_naive_bayes),
]
for caption, value in logistic_report:
    print(caption, value)
print()
for caption, value in naive_bayes_report:
    print(caption, value)

# List the abstracts of each session.
# Note: the rows are selected on the 'label' column, i.e. the answer given by
# the user during the session, not on the model's 'predicted' column.
logistic_relevant = logistic_classification_results['label'] == 'y'
print("Documents classified as relevant by the Logistic Regression model:")
print(logistic_classification_results.loc[logistic_relevant, 'Abstract'])
naive_bayes_relevant = naive_bayes_classification_results['label'] == 'y'
print("Documents classified as relevant by the Naive Bayes model:")
print(naive_bayes_classification_results.loc[naive_bayes_relevant, 'Abstract'])

# This block reports the recall and the confusion-matrix counts (TP, FP, TN, FN) for each model, and then prints the abstracts that were labeled as relevant ('y') during each model's review session.