In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Rank the survey papers against a free-text query using TF-IDF vectors and
# cosine similarity, then show the N best- and N worst-matching papers.

# Load the CSV file
file_path = '../results/llm_education_survey_paper_after_2019.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to understand its structure.
# Only the LAST expression of a cell is rendered automatically, so every
# intermediate preview has to go through display() (available as a builtin
# inside an IPython/Jupyter kernel).
display(df.head())

# Combine relevant text columns for vectorization. Missing values become ''
# so a paper without an abstract is still represented by its title.
# Vectorised concatenation yields the same "<title> <abstract>" strings as a
# row-wise join, without calling a Python function per row.
text_columns = df[['title', 'abstract']].fillna('').astype(str)
df['combined_text'] = text_columns['title'] + ' ' + text_columns['abstract']

# Define query related to LLMs and CS education
query = "Large Language Models and machine learning applied to Computer Science education and teaching. Pedagogies and methodologies for teaching computer science students using LLMs."

# Vectorize the text data: the vocabulary is fitted on the papers, and the
# query is then transformed with that same fitted vectorizer so both live in
# the same feature space.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
query_vec = vectorizer.transform([query])

# Calculate cosine similarity between the query and all documents
# (flattened to one score per row of df).
cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Sort once (ascending similarity) and reuse the ordering for both selections.
N = 500
ascending_order = cosine_similarities.argsort()

# Get top N relevant papers, most similar first
top_n_indices = ascending_order[-N:][::-1]
relevant_papers = df.iloc[top_n_indices]

# Get the N least relevant papers, least similar first.
# NOTE: when df has fewer than 2 * N rows the two selections overlap.
least_n_indices = ascending_order[:N]
least_relevant_papers = df.iloc[least_n_indices]

# Display the relevant papers
display(relevant_papers)

# Display the least relevant papers (rendered as the cell's final expression)
least_relevant_papers


Unnamed: 0,id,title,url,doi,source,year,bibtex,abstract,combined_text
22,40,Philosophical and Social Realm,https://link.springer.com/chapter/10.1007/978-...,10.1007/978-3-031-35331-4_2,springer,2023,"@inbook{Aber_ek_2023, title={Philosophical an...",,Philosophical and Social Realm
1287,1390,Automating Human Tutor-Style Programming Feedb...,http://arxiv.org/pdf/2310.03780v3.pdf,10.48550/arXiv.2310.03780,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2310.0378...,,Automating Human Tutor-Style Programming Feedb...
1291,1394,Impact of Guidance and Interaction Strategies ...,http://arxiv.org/pdf/2310.13712v2.pdf,10.48550/arXiv.2310.13712,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2310.1371...,,Impact of Guidance and Interaction Strategies ...
1298,1401,Prompt Problems: A New Programming Exercise fo...,http://arxiv.org/pdf/2311.05943v1.pdf,10.48550/arXiv.2311.05943,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2311.0594...,,Prompt Problems: A New Programming Exercise fo...
1302,1405,Kattis vs. ChatGPT: Assessment and Evaluation ...,http://arxiv.org/pdf/2312.01109v1.pdf,10.48550/arXiv.2312.01109,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2312.0110...,,Kattis vs. ChatGPT: Assessment and Evaluation ...
...,...,...,...,...,...,...,...,...,...
1053,1155,Interview with Mariusz Pisarski,https://doi.org/10.1145/3643603.3643606,10.1145/3643603.3643606,acm,2024,"@article{10.1145/3643603.3643606,\nauthor = {A...",,Interview with Mariusz Pisarski
1056,1158,Co-designing a knowledge management tool for e...,https://doi.org/10.1145/3643834.3660682,10.1145/3643834.3660682,acm,2024,"@inproceedings{10.1145/3643834.3660682,\nautho...",,Co-designing a knowledge management tool for e...
1057,1159,Artificial Dreams: Surreal Visual Storytelling...,https://doi.org/10.1145/3643834.3660685,10.1145/3643834.3660685,acm,2024,"@inproceedings{10.1145/3643834.3660685,\nautho...",,Artificial Dreams: Surreal Visual Storytelling...
1058,1160,The Power of Absence: Thinking with Archival T...,https://doi.org/10.1145/3643834.3660690,10.1145/3643834.3660690,acm,2024,"@inproceedings{10.1145/3643834.3660690,\nautho...",,The Power of Absence: Thinking with Archival T...
