In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
import numpy as np
import pandas as pd
import json


def read_json_to_df(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return pd.DataFrame(data)


# Read each file into a separate DataFrame
df_test = read_json_to_df('E:/Desktop/Text/Task2/test.json')

# Compute similarities
query_disease = "delirium"


# Convert numbers to strings (processing tokens and tags)
for df in [df_train, df_valid, df_test]:
    df['tokens'] = df['tokens'].apply(lambda x: [str(i) for i in x])
    df['tags'] = df['tags'].apply(lambda x: [str(i) for i in x])


# Extract unique diseases from test set
def extract_unique_diseases(df):
    unique_diseases = set()
    for _, row in df.iterrows():
        tokens = row['tokens']
        tags = row['tags']
        current_disease = []
        inside_disease = False
        for token, tag in zip(tokens, tags):
            if tag == '2':  # Beginning of Disease
                if current_disease:  # Add previous disease if exists
                    unique_diseases.add(" ".join(current_disease))
                    current_disease = []
                inside_disease = True
                current_disease.append(token)
            elif tag == '3' and inside_disease:  # Inside Disease
                current_disease.append(token)
            else:
                if current_disease:
                    unique_diseases.add(" ".join(current_disease))
                    current_disease = []
                inside_disease = False
        # Add any remaining disease
        if current_disease:
            unique_diseases.add(" ".join(current_disease))
    return unique_diseases

unique_diseases = extract_unique_diseases(df_test)



def jaccard_sim(str1, str2):
    vec = CountVectorizer().fit_transform([str1, str2])
    vec = vec.toarray()
    return cosine_similarity([vec[0]], [vec[1]])[0][0]


def levenshtein_sim(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()


jaccard_similarities = [(disease, jaccard_sim(query_disease, disease)) for disease in unique_diseases]
levenshtein_similarities = [(disease, levenshtein_sim(query_disease, disease)) for disease in unique_diseases]

# Sort diseases by similarity for both measures
jaccard_similarities_sorted = sorted(jaccard_similarities, key=lambda x: x[1], reverse=True)
levenshtein_similarities_sorted = sorted(levenshtein_similarities, key=lambda x: x[1], reverse=True)

# Top 5 similar and dissimilar diseases for both measures
jaccard_similar_diseases = jaccard_similarities_sorted[:5]
jaccard_dissimilar_diseases = jaccard_similarities_sorted[-5:]
levenshtein_similar_diseases = levenshtein_similarities_sorted[:5]
levenshtein_dissimilar_diseases = levenshtein_similarities_sorted[-5:]

# Output results
print("Jaccard - Top 5 similar diseases:", jaccard_similar_diseases)
print("Jaccard - Top 5 dissimilar diseases:", jaccard_dissimilar_diseases)
print("Levenshtein - Top 5 similar diseases:", levenshtein_similar_diseases)
print("Levenshtein - Top 5 dissimilar diseases:", levenshtein_dissimilar_diseases)


Jaccard - Top 5 similar diseases: [('delirium', 1.0), ('Delirium', 1.0), ('Postoperative delirium', 0.7071067811865475), ('postoperative delirium', 0.7071067811865475), ('endometrial cancer', 0.0)]
Jaccard - Top 5 dissimilar diseases: [('hypoparathyroidism', 0.0), ('leukemic', 0.0), ('Scleroderma renal crisis', 0.0), ('hyperglycemia', 0.0), ('acute myocardial infarction', 0.0)]
Levenshtein - Top 5 similar diseases: [('delirium', 1.0), ('Delirium', 0.875), ('Postoperative delirium', 0.5333333333333333), ('postoperative delirium', 0.5333333333333333), ('peliosis', 0.5)]
Levenshtein - Top 5 dissimilar diseases: [('NCSE', 0.0), ('LIDs', 0.0), ('ICH', 0.0), ('SAH', 0.0), ('ESRD', 0.0)]
