# Use TF-IDF to identify (possibly) irrelevant papers

In [33]:
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Connect to the SQLite database (the connection is intentionally left open:
# the export cell further down reuses `conn`).
conn = sqlite3.connect('../../coding_website/db_llm_education_survey.sqlite3')

# Create a cursor object
cursor = conn.cursor()

# Select the papers analysed by user "1" that have no relevance decision yet
# (is_relevant IS NULL). The user id is passed as a bound parameter rather than
# as a double-quoted token: in SQL double quotes denote identifiers, and SQLite
# only falls back to treating them as strings in its legacy compatibility mode.
cursor.execute('SELECT llm_education_survey_paper.id, title, abstract, source FROM llm_education_survey_paper JOIN '
               'llm_education_survey_analysis ON llm_education_survey_paper.id = llm_education_survey_analysis.paper_id '
               'WHERE is_relevant IS NULL AND user_id = ?', ('1',))

# Fetch all the results into a data frame, one row per paper
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=['id', 'title','abstract', 'source'])

# Build one text field per paper: title and abstract joined by a single space,
# with missing values treated as empty strings. Column-wise concatenation
# produces the same strings as a row-by-row join without a Python call per row.
df['combined_text'] = (
    df['title'].fillna('').astype(str) + ' ' + df['abstract'].fillna('').astype(str)
)

# Define query related to LLMs and CS education
query = "Large Language Models and machine learning applied to Computer Science education and teaching. Pedagogies and methodologies for teaching computer science students using LLMs."

# Fit the TF-IDF vocabulary on the papers only, then project the query into
# that same vocabulary so both live in one vector space.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
query_vec = vectorizer.transform([query])

# Cosine similarity between the query and every paper, flattened to a 1-D
# array aligned with the row order of `df`.
cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Rank every paper once by similarity to the query (ascending order); both
# selections below slice this single ranking.
ranked_indices = cosine_similarities.argsort()

# Get top N relevant papers, most similar first
N = 500
top_n_indices = ranked_indices[-N:][::-1]
relevant_papers = df.iloc[top_n_indices]

# Get the N_LEAST papers with the lowest similarity to the query
N_LEAST = 100
least_n_indices = ranked_indices[:N_LEAST]
least_relevant_papers = df.iloc[least_n_indices]

# Drop any low-similarity paper whose title/abstract still mentions an
# education-related term (case-insensitive regex match), so that only papers
# with no such term remain as candidates for being irrelevant.
EDUCATION_TERMS = 'teach|student|educat|pedagog|classroom|course|CS1'
mentions_education = least_relevant_papers['combined_text'].str.contains(EDUCATION_TERMS, case=False)
least_relevant_papers = least_relevant_papers[~mentions_education]


least_relevant_papers


Unnamed: 0,id,title,abstract,source,combined_text
3,30,So What’s the Plan? Mining Strategic Planning ...,,springer,So What’s the Plan? Mining Strategic Planning ...
7,38,"Prolog: Past, Present, and Future",,springer,"Prolog: Past, Present, and Future"
6,37,"Reflections on Automation, Learnability and Ex...",,springer,"Reflections on Automation, Learnability and Ex..."
37,90,1 \(^{st}\) Workshop on Information Retrieval ...,,springer,1 \(^{st}\) Workshop on Information Retrieval ...
60,131,Understanding ChatGPT’s Underlying Technology,,springer,Understanding ChatGPT’s Underlying Technology
...,...,...,...,...,...
735,1324,Developing a deep learning natural language pr...,The detection of adverse drug reactions (ADRs)...,science_direct,Developing a deep learning natural language pr...
720,1277,Continuous agile cyber–physical systems archit...,"Modern cyber-physical systems, for the most pa...",science_direct,Continuous agile cyber–physical systems archit...
231,416,MASCARA : Systematically Generating Memorable ...,Passwords are the most common mechanism for au...,acm,MASCARA : Systematically Generating Memorable ...
305,532,“They only care to show us the wheelchair”: di...,This paper reports on disability representatio...,acm,“They only care to show us the wheelchair”: di...


# Export all papers to CSV

In [32]:
# Create a cursor object (relies on `conn` opened in the TF-IDF cell)
cursor = conn.cursor()

# Select every paper analysed by user "1", including its relevance decision.
# The user id is a bound parameter: a double-quoted "1" is an identifier in SQL
# and is only accepted as a string by SQLite's legacy fallback.
cursor.execute('SELECT llm_education_survey_paper.id, title, abstract, source, is_relevant FROM llm_education_survey_paper JOIN '
               'llm_education_survey_analysis ON llm_education_survey_paper.id = llm_education_survey_analysis.paper_id '
               'WHERE user_id = ?', ('1',))

# Fetch all the results into a data frame
df_all = pd.DataFrame(cursor.fetchall(), columns=['id', 'title','abstract', 'source', 'is_relevant'])
# Change data type of is_relevant to string
df_all['is_relevant'] = df_all['is_relevant'].astype(str)
# NOTE(review): an earlier comment here mentioned filling NA with empty
# strings, but no such step exists; missing is_relevant values are written to
# the CSV in their stringified form. Confirm that is what consumers expect.

df_all.to_csv('../data/stage1_all_papers.csv', index=False)
