In [9]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Sample clinical data: one free-text record per patient. Every record uses
# the same "Patient ID: <digits>, Age: <digits>, Diagnosis: <word>" layout,
# with fields separated by ", " and each label separated from its value by ": ".
clinical_data = [
    "Patient ID: 001, Age: 45, Diagnosis: Diabetes",
    "Patient ID: 002, Age: 60, Diagnosis: Hypertension",
    "Patient ID: 003, Age: 29, Diagnosis: Diabetes",
    "Patient ID: 004, Age: 40, Diagnosis: BloodPressure",
    "Patient ID: 005, Age: 52, Diagnosis: Depression"
]

# Rule-based approach using regex
def rule_based_extraction(data):
    """Pull (patient id, age, diagnosis) triples out of each record with one regex.

    Args:
        data: Iterable of record strings.

    Returns:
        A list with one entry per record. Each entry is itself a list of
        ``(patient_id, age, diagnosis)`` string tuples, one per place the
        full pattern matched in that record (empty when nothing matched).
    """
    record_pattern = re.compile(r'Patient ID: (\d+), Age: (\d+), Diagnosis: (\w+)')
    matches_per_record = []
    for entry in data:
        # findall returns every non-overlapping match as a tuple of groups.
        matches_per_record.append(record_pattern.findall(entry))
    return matches_per_record

# Pattern-based approach using text patterns
def pattern_based_extraction(data):
    """Extract the value that follows each known field label in every record.

    Each record is searched independently for the labels ``Patient ID``,
    ``Age`` and ``Diagnosis``; the run of word characters after
    ``"<label>: "`` is taken as that field's value.

    Args:
        data: Iterable of record strings.

    Returns:
        A list with one ``(patient_id, age, diagnosis)`` tuple per record.
        Values are strings; a field whose label does not occur in the record
        is reported as ``None`` instead of raising.
    """
    labels = ("Patient ID", "Age", "Diagnosis")
    # Compile once, outside the record loop. The raw f-string keeps ``\w`` a
    # regex escape rather than an invalid string escape sequence.
    field_patterns = [re.compile(rf'{label}: (\w+)') for label in labels]

    extracted_data = []
    for record in data:
        found = (pattern.search(record) for pattern in field_patterns)
        # search() returns None when the label is absent; keep the tuple
        # positionally aligned by emitting None for that field.
        extracted_data.append(
            tuple(hit.group(1) if hit else None for hit in found)
        )
    return extracted_data

# Machine learning approach using Naive Bayes
def machine_learning_extraction(data):
    """Fit a Naive Bayes classifier on the records and predict their diagnoses.

    The records are vectorized with ``CountVectorizer``, the diagnosis text
    parsed from each record serves as the training label, and the fitted
    ``MultinomialNB`` model is then asked to predict on the very same
    vectorized records it was trained on.

    Args:
        data: Sequence of record strings whose comma-separated fields have
            the form ``"Label: value"``, with the patient id in the first
            field and the diagnosis in the third.

    Returns:
        A list of ``(patient_id, diagnosis, predicted_diagnosis)`` tuples,
        one per record, in input order.
    """
    def field_value(record, position):
        # Take the comma-separated field at `position` and keep the text
        # after its "Label: " prefix.
        return record.split(", ")[position].split(": ")[1]

    features = CountVectorizer().fit_transform(data)
    diagnoses = [field_value(record, 2) for record in data]

    model = MultinomialNB()
    model.fit(features, diagnoses)
    # Predictions are made on the training inputs themselves.
    predicted = model.predict(features)

    patient_ids = [field_value(record, 0) for record in data]
    return list(zip(patient_ids, diagnoses, predicted))

# Extracting structured data using different approaches
# Run each of the three extraction strategies over the sample records.
rule_based_result = rule_based_extraction(clinical_data)
pattern_based_result = pattern_based_extraction(clinical_data)
machine_learning_result = machine_learning_extraction(clinical_data)

# Report every strategy's output, one labelled line per approach.
for _label, _result in (
    ("Rule-based Extraction:", rule_based_result),
    ("Pattern-based Extraction:", pattern_based_result),
    ("Machine Learning Extraction:", machine_learning_result),
):
    print(_label, _result)

Rule-based Extraction: [[('001', '45', 'Diabetes')], [('002', '60', 'Hypertension')], [('003', '29', 'Diabetes')], [('004', '40', 'BloodPressure')], [('005', '52', 'Depression')]]
Pattern-based Extraction: [('001', '45', 'Diabetes'), ('002', '60', 'Hypertension'), ('003', '29', 'Diabetes'), ('004', '40', 'BloodPressure'), ('005', '52', 'Depression')]
Machine Learning Extraction: [('001', 'Diabetes', 'Diabetes'), ('002', 'Hypertension', 'Hypertension'), ('003', 'Diabetes', 'Diabetes'), ('004', 'BloodPressure', 'BloodPressure'), ('005', 'Depression', 'Depression')]
