In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The queries file read later in this notebook is addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import packages
import json

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Service endpoints and request headers shared by the cells below.
# NOTE(review): ES_URL embeds a username and password directly in the URL;
# consider loading credentials from the environment instead of committing them.
ES_URL = 'https://inex:qatc2011@guacamole.univ-avignon.fr/dblp1/_search'
# Base URL queried by get_vector_embeddings().
VECTOR_DB_URL = 'https://guacamole.univ-avignon.fr/stvir_test'
# NOTE(review): HEADERS is not referenced by any function visible in this
# notebook - confirm whether it is still needed.
HEADERS = {'Content-Type': 'application/json'}

In [None]:
# Function to query ElasticSearch
def query_elasticsearch(query, size=100):
    """Run a query-string search against the ElasticSearch endpoint.

    Args:
        query: Search text, sent as the ``q`` request parameter.
        size: Number of hits requested, sent as the ``size`` parameter.

    Returns:
        The list stored under ``['hits']['hits']`` in the JSON response, or
        an empty list when the HTTP status code is not 200.
    """
    # Hand the parameters to requests instead of formatting them into the URL:
    # characters such as '&', '#', '+' or '=' inside the query text are then
    # percent-encoded rather than being interpreted as URL syntax.
    response = requests.get(
        ES_URL,
        params={'q': query, 'size': size},
        auth=('inex', 'qatc2011'),
    )
    if response.status_code == 200:
        return response.json()['hits']['hits']

    print("Failed to fetch data:", response.status_code)
    return []

In [None]:
# Function to retrieve vector embeddings
def get_vector_embeddings(phrase, length=100):
    """Request embeddings for ``phrase`` from the vector service.

    Args:
        phrase: Text sent as the ``phrase`` request parameter.
        length: Value sent as the ``length`` request parameter.

    Returns:
        The decoded JSON body on HTTP 200; otherwise an empty list, after
        printing the status code.
    """
    response = requests.get(
        VECTOR_DB_URL,
        params={'corpus': 'abstract', 'phrase': phrase, 'length': length},
    )

    # Guard clause: report and bail out on anything other than success.
    if response.status_code != 200:
        print("Failed to fetch embeddings:", response.status_code)
        return []

    return response.json()

In [None]:
# Function to calculate relevance scores
def calculate_relevance(docs, query):
    """Score every document against the query with TF-IDF cosine similarity.

    Args:
        docs: Sequence of hit dicts. Each must have a ``'_source'`` dict; the
            text that gets scored is its ``'abstract'`` entry when present.
        query: Query text to compare the abstracts with.

    Returns:
        A 1-D NumPy array holding exactly one score per element of ``docs``,
        in the same order. Documents whose ``'_source'`` has no ``'abstract'``
        key receive 0.0, so the result can be zipped with ``docs`` safely.
    """
    scores = np.zeros(len(docs))

    # Remember the position of each usable document so that the similarities
    # can be written back to the right slots afterwards.
    positions = [i for i, doc in enumerate(docs) if 'abstract' in doc['_source']]
    if not positions:
        # Nothing to vectorize (no hits, or no hit has an abstract); fitting
        # TF-IDF on an empty corpus would raise.
        return scores

    doc_texts = [docs[i]['_source']['abstract'] for i in positions]

    # Use TF-IDF to vectorize texts; the vocabulary comes from the abstracts only
    vectorizer = TfidfVectorizer(stop_words='english')
    doc_vectors = vectorizer.fit_transform(doc_texts)
    query_vector = vectorizer.transform([query])

    # Calculate cosine similarity and scatter it back to the original positions
    scores[positions] = cosine_similarity(query_vector, doc_vectors).flatten()
    return scores

In [None]:
def flesch_kincaid_grade_level(text):
    """Compute a rescaled Flesch-Kincaid grade level for ``text``.

    The raw grade is ``0.39 * ASL + 11.8 * ASW - 15.59``, where ASL comes
    from ``average_sentence_length`` and ASW from
    ``average_syllables_per_word``. The raw grade is then passed through
    ``normalize`` with a lower bound of 0 and an upper bound of 25.

    Args:
        text: The passage to evaluate.

    Returns:
        The value returned by ``normalize`` for the raw grade.
    """
    words_per_sentence = average_sentence_length(text)
    syllables_per_word = average_syllables_per_word(text)

    raw_grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    # 20+ is academic level texts, so 25 is used as the top of the scale.
    return normalize(raw_grade, min_score=0, max_score=25)

def normalize(value, min_score, max_score):
    """Linearly rescale ``value`` from [min_score, max_score] onto [0, 1].

    Args:
        value: The number to rescale.
        min_score: Input value that maps to 0.
        max_score: Input value that maps to 1.

    Returns:
        The rescaled value, clamped to the closed interval [0, 1] so that
        inputs outside the bounds cannot produce scores below 0 or above 1.
        When both bounds are equal the scale is degenerate and 0.5 is
        returned.
    """
    if max_score == min_score:
        return 0.5

    scaled = (value - min_score) / (max_score - min_score)
    # Clamp: the raw ratio is negative below min_score and exceeds 1 above
    # max_score, which would contradict the documented 0..1 range.
    return min(1.0, max(0.0, scaled))

def average_sentence_length(text):
    """Return the average number of words per sentence in ``text``.

    Words are whitespace-separated tokens. Sentences are the non-blank
    pieces obtained by splitting on '.'; blank pieces (such as the empty
    string that follows a final period) are not counted, because counting
    them would understate the average.

    Args:
        text: The passage to measure.

    Returns:
        Words divided by sentences, or 0.0 when the text contains no
        non-blank sentence.
    """
    sentences = [piece for piece in text.split('.') if piece.strip()]
    if not sentences:
        return 0.0

    return len(text.split()) / len(sentences)

def average_syllables_per_word(text):
    """Return the mean syllable estimate per whitespace-separated word.

    Args:
        text: The passage to measure.

    Returns:
        The sum of ``count_syllables`` over all words divided by the number
        of words, or 0.0 when ``text`` contains no words (empty or
        whitespace-only input), which would otherwise divide by zero.
    """
    words = text.split()
    if not words:
        return 0.0

    return sum(count_syllables(word) for word in words) / len(words)

def count_syllables(word):
    """Roughly estimate the syllables in ``word``.

    The estimate is the number of maximal runs of consecutive vowels, where
    a vowel is one of ``a e i o u`` in either case.

    Args:
        word: The token to examine.

    Returns:
        The number of vowel runs; 0 for a token without any vowel.
    """
    is_vowel = [char in 'aeiouAEIOU' for char in word]

    # A run starts at a vowel that is either the first character or is
    # preceded by a non-vowel.
    return sum(
        1
        for idx, flag in enumerate(is_vowel)
        if flag and (idx == 0 or not is_vowel[idx - 1])
    )

# Example usage: score one sample abstract and print the rescaled
# Flesch-Kincaid value returned by flesch_kincaid_grade_level().
text = "Trials to introduce artificial intelligence (AI) in clinical settings have been done for several decades, but the movement toward such introduction remains slow. In the past, AI systems were mainly to support physicians. They were \u201drule-based\u201d and specifically designed to assist in diagnosis or to recommend drugs to be prescribed to patients. Current clinical medicine is not performed by a physician acting alone, but through cooperation between staff with various occupations. Kimura Information Technology Co., Ltd. (KIT, Japan) has built a system named \u201dAI-Q\u201d that works on the Japanese version of IBM\u2019s Watson and with which it is possible to build arbitrary problem solving systems. AI-Q was made to serve a variety of purposes, and a system for pharmacists has been built for drug information. In this paper, we illustrate how practical applications of AI can be designed for use by medical staff other than physicians and discuss how the system can be extended to other fields. We converted an AI system previously used to support pharmacists into one for certified clinical engineers (CCE). The purpose of this paper is to give the background of the system for CCE and to evaluate it."
print("Normalized Flesch-Kincaid Grade Level score:", flesch_kincaid_grade_level(text))


Normalized Flesch-Kincaid Grade Level score: 0.43190103626943016


In [None]:
# Function to format results
def format_results(docs, scores, topic_id, query_id,
                   threshold=0.15, run_id="Tomislav&Rowan_Task1_TFIDF"):
    """Build the result records for one query.

    Args:
        docs: Sequence of hit dicts providing ``'_id'`` and ``'_source'``.
        scores: Relevance scores, paired with ``docs`` position by position.
        topic_id: Stored as ``"topic_id"`` in every record.
        query_id: Stored as ``"query_id"`` in every record.
        threshold: A document is kept only when its score is strictly
            greater than this value.
        run_id: Stored as ``"run_id"`` in every record.

    Returns:
        A list of dicts with the keys ``run_id``, ``manual``, ``topic_id``,
        ``query_id``, ``doc_id``, ``rel_score``, ``comb_score`` and
        ``passage``. Documents without a non-empty abstract are skipped.
    """
    results = []
    for doc, score in zip(docs, scores):
        if score > threshold:  # Threshold for including the document
            abstract = doc['_source'].get('abstract')
            if not abstract:
                # No passage to report and no text to compute readability
                # on; indexing the missing key would raise KeyError.
                continue
            results.append({
                "run_id": run_id,
                "manual": 0,
                "topic_id": topic_id,
                "query_id": query_id,
                "doc_id": doc['_id'],
                # float() turns NumPy scalars into built-in floats so the
                # record is always JSON-serializable.
                "rel_score": round(float(score), 2),
                "comb_score": round(flesch_kincaid_grade_level(abstract), 2),
                "passage": abstract,
            })
    return results

In [None]:
import pandas as pd

def main(queries_path='/content/drive/MyDrive/BIP/SimpleText/task 1/task 1/topics_qrels/simpletext_2024_task1_queries.json',
         output_path='results.json',
         max_queries=5):
    """Run retrieval and scoring for each query and write the results as JSON.

    Args:
        queries_path: JSON file with the queries; it must provide the
            ``query_text``, ``topic_id`` and ``query_id`` columns.
        output_path: File the collected result records are written to.
        max_queries: Only the first ``max_queries`` rows are processed;
            pass ``None`` to process every row.

    Returns:
        None. The records are written to ``output_path``.
    """
    # Read queries from JSON file into a dataframe
    queries = pd.read_json(queries_path)
    if max_queries is not None:
        queries = queries.head(max_queries)

    all_results = []
    for _, query_row in queries.iterrows():
        query_text = query_row['query_text']
        topic_id = query_row['topic_id']
        query_id = query_row['query_id']

        docs = query_elasticsearch(query_text)
        if not docs:
            # query_elasticsearch returns [] on a failed request, and a search
            # may also match nothing. TF-IDF cannot be fitted on zero
            # documents, so skip this query instead of aborting the run.
            print("No documents retrieved for query:", query_id)
            continue

        scores = calculate_relevance(docs, query_text)
        all_results.extend(format_results(docs, scores, topic_id, query_id))

    # Output results to a JSON file
    with open(output_path, 'w') as f:
        json.dump(all_results, f, indent=4)

    print(f"Task completed. Results stored in '{output_path}'.")

# Run the pipeline when this code is executed as the main module
# (which is the case when the cell is run in a notebook kernel).
if __name__ == '__main__':
    main()


Task completed. Results stored in 'results.json'.


In [None]:
# Path of the results file written by main().
json_file_path = "results.json"

# Open and read the JSON file back into Python objects
with open(json_file_path, "r") as file:
    results = json.load(file)

# Display the contents of the JSON file.
# NOTE(review): this prints every record in full; consider showing only a
# slice if the result list grows large.
print(results)