In [3]:
pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd 
import numpy as np

In [9]:
df = pd.read_csv("data_with_keywords.csv")

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25292 entries, 0 to 25291
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  25292 non-null  int64  
 1   abstract    25292 non-null  object 
 2   authors     25292 non-null  object 
 3   n_citation  25292 non-null  int64  
 4   references  24827 non-null  object 
 5   title       25292 non-null  object 
 6   venue       23140 non-null  object 
 7   year        25291 non-null  float64
 8   id          25291 non-null  object 
 9   KeyWords    25292 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 1.9+ MB


In [12]:
df = df.dropna(subset=["venue"])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23140 entries, 0 to 25290
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  23140 non-null  int64  
 1   abstract    23140 non-null  object 
 2   authors     23140 non-null  object 
 3   n_citation  23140 non-null  int64  
 4   references  22738 non-null  object 
 5   title       23140 non-null  object 
 6   venue       23140 non-null  object 
 7   year        23140 non-null  float64
 8   id          23140 non-null  object 
 9   KeyWords    23140 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 1.9+ MB


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load dataset
df = pd.read_csv("data_with_keywords.csv")

# Drop rows where keywords or venue are missing
df = df.dropna(subset=["KeyWords", "venue"])

# Extract features and labels
X_text = df["KeyWords"].astype(str)
y = df["venue"].reset_index(drop=True)

# Convert keywords to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X_text)

# Fit Nearest Neighbors model (content-based similarity)
nn_model = NearestNeighbors(n_neighbors=3, metric="cosine")  # return exactly 3 closest
nn_model.fit(X_tfidf)

# 🔹 Function to recommend top 3 journals
def recommend_journals(user_keywords, top_k=3):
    # Transform input keywords into vector
    user_vec = vectorizer.transform([user_keywords])
    
    # Find nearest neighbors
    distances, indices = nn_model.kneighbors(user_vec, n_neighbors=top_k)
    
    # Get top venues
    recs = [y.iloc[i] for i in indices[0]]
    return recs

# Example usage
user_input = "cancer treatment immunotherapy clinical trials healthcare"
print("Top 3 Recommended Venues:", recommend_journals(user_input, top_k=3))

Top 3 Recommended Venues: ['international conference on engineering of complex computer systems', 'international symposium on biomedical imaging', 'Journal of the American Medical Informatics Association']


In [19]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
   ------------------------------------

In [20]:
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Load dataset
df = pd.read_csv("data_with_keywords.csv")

# Drop rows with missing values
df = df.dropna(subset=["KeyWords", "venue"])

# Extract text and labels
X_text = df["KeyWords"].astype(str).reset_index(drop=True)
y = df["venue"].reset_index(drop=True)

# Load embedding model (MiniLM is lightweight & fast)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert keywords to embeddings
X_embeddings = model.encode(X_text, convert_to_tensor=False)

# Fit Nearest Neighbors
nn_model = NearestNeighbors(n_neighbors=3, metric="cosine")
nn_model.fit(X_embeddings)

# 🔹 Function to recommend top journals
def recommend_journals(user_keywords, top_k=3):
    user_vec = model.encode([user_keywords], convert_to_tensor=False)
    distances, indices = nn_model.kneighbors(user_vec, n_neighbors=top_k)
    recs = [(y.iloc[i], 1 - distances[0][j]) for j, i in enumerate(indices[0])]  # cosine similarity = 1 - distance
    return recs

# Example medical input
user_input = "cancer treatment immunotherapy clinical trials healthcare"
print("Top 3 Recommended Venues:", recommend_journals(user_input, top_k=3))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Top 3 Recommended Venues: [('Journal of Clinical Bioinformatics', np.float32(0.3271358)), ('Sensors', np.float32(0.31966954)), ('Nucleic Acids Research', np.float32(0.2803802))]


In [21]:
import pickle

# Save nearest neighbors model
with open("nn_model.pkl", "wb") as f:
    pickle.dump(nn_model, f)

# Save y labels
with open("venues.pkl", "wb") as f:
    pickle.dump(y, f)

In [24]:
!winget install --id Git.Git -e --source winget


   - 
   \ 
   | 
   / 
                                                                                                                        

  â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’  1024 KB / 2.25 MB
  â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–’â–’â–’â–’  2.00 MB / 2.25 MB
  â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  2.25 MB / 2.25 MB
                                                                                                                        

  â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’  0%
  â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’  0%
  â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’  1%
  â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’â–’  2%
  â–ˆâ–

In [26]:
!git --version

'git' is not recognized as an internal or external command,
operable program or batch file.
