In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the CSV file
file_path = '../results/llm_education_survey_paper_after_2019.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to understand its structure
df.head()

# Combine relevant text columns for vectorization
df['combined_text'] = df[['title', 'abstract']].fillna('').apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Define query related to LLMs and CS education
query = "Large Language Models applied to Computer Science education and teaching"

# Vectorize the text data
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
query_vec = vectorizer.transform([query])

# Calculate cosine similarity between the query and all documents
cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Get top N relevant papers
N = 100
top_n_indices = cosine_similarities.argsort()[-N:][::-1]
relevant_papers = df.iloc[top_n_indices]

# get least relevant papers
least_n_indices = cosine_similarities.argsort()[:N]

# Display the relevant papers
relevant_papers

# 
least_relevant_papers = df.iloc[least_n_indices]
least_relevant_papers


Unnamed: 0,id,title,url,doi,source,year,bibtex,abstract,combined_text
10,27,The Practical Concepts of Machine Learning,https://link.springer.com/chapter/10.1007/978-...,10.1007/978-1-4842-9801-5_2,springer,2024,"@inbook{Kashyap_2023, title={The Practical Co...",,The Practical Concepts of Machine Learning
931,970,"Red teaming ChatGPT via Jailbreaking: Bias, Ro...",http://arxiv.org/pdf/2301.12867v4.pdf,10.48550/arXiv.2301.12867,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2301.1286...,,"Red teaming ChatGPT via Jailbreaking: Bias, Ro..."
935,974,Capabilities of GPT-4 on Medical Challenge Pro...,http://arxiv.org/pdf/2303.13375v2.pdf,10.48550/arXiv.2303.13375,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2303.1337...,,Capabilities of GPT-4 on Medical Challenge Pro...
936,975,GPT is becoming a Turing machine: Here are som...,http://arxiv.org/pdf/2303.14310v1.pdf,10.48550/arXiv.2303.14310,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2303.1431...,,GPT is becoming a Turing machine: Here are som...
937,976,Advances in apparent conceptual physics reason...,http://arxiv.org/pdf/2303.17012v3.pdf,10.48550/arXiv.2303.17012,arxiv,2023,@misc{https://doi.org/10.48550/arxiv.2303.1701...,,Advances in apparent conceptual physics reason...
...,...,...,...,...,...,...,...,...,...
1362,1464,An era of ChatGPT as a significant futuristic ...,https://www.sciencedirect.com/science/article/...,https://doi.org/10.1016/j.tbench.2023.100089,science_direct,2022,"@article{HALEEM2022100089,\ntitle = {An era of...",,An era of ChatGPT as a significant futuristic ...
145,178,Chatgpt for cybersecurity: practical applicati...,https://link.springer.com/article/10.1007/s105...,10.1007/s10586-023-04124-5,springer,2023,"@article{Al_Hawawreh_2023, title={Chatgpt for...",,Chatgpt for cybersecurity: practical applicati...
147,180,To resist it or to embrace it? Examining ChatG...,https://link.springer.com/article/10.1007/s106...,10.1007/s10639-023-12146-0,springer,2023,"@article{Guo_2023, title={To resist it or to ...",,To resist it or to embrace it? Examining ChatG...
149,182,"Editorial for EAIT issue 12, 2023",https://link.springer.com/article/10.1007/s106...,10.1007/s10639-023-12367-3,springer,2023,"@article{Tatnall_2023, title={Editorial for E...",,"Editorial for EAIT issue 12, 2023"
