In [18]:
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Rank the not-yet-reviewed survey papers by TF-IDF cosine similarity to a
# free-text description of the survey topic, then show the lowest-scoring ones.

# Connect to the SQLite database that backs the coding website.
# NOTE(review): `conn` and `cursor` are intentionally left open because later
# cells may reuse them; call `conn.close()` once nothing else needs them.
conn = sqlite3.connect('../../coding_website/db_llm_education_survey.sqlite3')
cursor = conn.cursor()

# Select the papers whose analysis row for this user has no relevance verdict.
# The user id is bound as a parameter: a double-quoted "1" is an identifier in
# SQL, and SQLite only treats it as a string through a legacy fallback that
# can be disabled at build time.
USER_ID = '1'
cursor.execute(
    'SELECT llm_education_survey_paper.id, title, abstract, source FROM llm_education_survey_paper JOIN '
    'llm_education_survey_analysis ON llm_education_survey_paper.id = llm_education_survey_analysis.paper_id '
    'WHERE is_relevant IS NULL AND user_id = ?',
    (USER_ID,),
)

# Fetch all the results into a data frame
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=['id', 'title', 'abstract', 'source'])

# Fail early with a clear message: TF-IDF cannot be fitted on zero documents.
if df.empty:
    raise ValueError('No unreviewed papers found for user_id = ' + repr(USER_ID))

# Combine title and abstract into the text that gets vectorized. Missing
# values become empty strings, so a paper without an abstract is scored on
# its title alone.
df['combined_text'] = (
    df['title'].fillna('').astype(str) + ' ' + df['abstract'].fillna('').astype(str)
)

# Define query related to LLMs and CS education
query = "Large Language Models and machine learning applied to Computer Science education and teaching. Pedagogies and methodologies for teaching computer science students using LLMs."

# Fit the vocabulary on the papers only, then project the query into the same
# space so both sides share one set of TF-IDF features.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
query_vec = vectorizer.transform([query])

# Calculate cosine similarity between the query and all documents
cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Sort once (ascending similarity) and slice both ends from the same ordering.
ranked_indices = cosine_similarities.argsort()

# Get top N relevant papers, most similar first. Slicing clamps to the number
# of available papers when there are fewer than N.
N = 500
top_n_indices = ranked_indices[-N:][::-1]
relevant_papers = df.iloc[top_n_indices]

# Get the least relevant papers, least similar first
N_LEAST = 100
least_n_indices = ranked_indices[:N_LEAST]
least_relevant_papers = df.iloc[least_n_indices]

# Display the least relevant papers (last expression of the notebook cell)
least_relevant_papers


Unnamed: 0,id,title,abstract,source,combined_text
40,90,1 \(^{st}\) Workshop on Information Retrieval ...,,springer,1 \(^{st}\) Workshop on Information Retrieval ...
20,54,Comparative Quality Analysis of GPT-Based Mult...,,springer,Comparative Quality Analysis of GPT-Based Mult...
50,112,Exploring GPT-4 as MR Sequence and Reconstruct...,,springer,Exploring GPT-4 as MR Sequence and Reconstruct...
23,59,Assessing ChatGPT’s Proficiency in CS1-Level P...,,springer,Assessing ChatGPT’s Proficiency in CS1-Level P...
495,731,Simulating the Human in HCD with ChatGPT: Rede...,,acm,Simulating the Human in HCD with ChatGPT: Rede...
...,...,...,...,...,...
469,705,Evolving Roles and Workflows of Creative Pract...,"Creative practitioners (like designers, softwa...",acm,Evolving Roles and Workflows of Creative Pract...
437,665,Conversational Interfaces in IoT Ecosystems: W...,"In the last few years, text and voice-based co...",acm,Conversational Interfaces in IoT Ecosystems: W...
590,901,FormaT5: Abstention and Examples for Condition...,Formatting is an important property in tables ...,acm,FormaT5: Abstention and Examples for Condition...
392,614,"A Journey of a 1,000 Kernels Begins with a Sin...","We are in age of AI, with rapidly changing alg...",acm,"A Journey of a 1,000 Kernels Begins with a Sin..."
