In [129]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [130]:
year = '2018'
data_path = f'/Users/naphat-c/Documents/naphat/Data/project/CEDT-DS-Project_LittleMermaid/ExtractedData/{year}.csv'
df = pd.read_csv(data_path)

In [131]:
df.head()

Unnamed: 0,Title,Abstract,Author,Aggregation_Type,Publisher,Publication_Date,Institutions,Keywords,Filename
0,Public health and international epidemiology f...,,Pongpirul K.; Lungren M.P.,Book,Springer International Publishing,2018-12-31,Chulalongkorn University; Johns Hopkins Bloomb...,,201800000
1,Flexible Printed Active Antenna for Digital Te...,"© 2018 The Institute of Electronics, Informati...",Pratumsiri T.; Janpugdee P.,Conference Proceeding,Institute of Electrical and Electronics Engine...,2018-12-31,Chulalongkorn University,,201800001
2,Parametric study of hydrogen production via so...,© 2018 Elsevier LtdComputational fluid dynamic...,Phuakpunk K.; Chalermsinsuwan B.; Putivisutisa...,Journal,Elsevier Ltd,2018-12-31,Chulalongkorn University; Chulalongkorn Univer...,Circulating fluidized bed; Computational fluid...,201800002
3,Superhydrophobic coating from fluoroalkylsilan...,© 2018 Elsevier B.V. A superhydrophobic/supero...,Saengkaew J.; Le D.; Samart C.; Sawada H.; Nis...,Journal,Elsevier B.V.,2018-12-31,Thammasat University; Hirosaki University; Nat...,Encapsulation; Fluoroalkylsilane; Natural rubb...,201800003
4,Electrochemical impedance-based DNA sensor usi...,© 2018 Elsevier B.V. A label-free electrochemi...,Teengam P.; Siangproh W.; Tuantranont A.; Vila...,Journal,Elsevier B.V.,2018-12-31,Chulalongkorn University; Srinakharinwirot Uni...,acpcPNA; Electrochemical impedance spectroscop...,201800004


In [132]:
nltk.download('stopwords')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/naphat-c/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/naphat-c/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [133]:
# Define preprocessing function
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenization & Lowercase
    tokens = [word for word in tokens if word.isalnum()]  # Keep only alphanumeric
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['Abstract'] = df['Abstract'].fillna('') 
df['Processed_Abstract'] = df['Abstract'].apply(preprocess_text)

df[['Abstract', 'Processed_Abstract']].head()


Unnamed: 0,Abstract,Processed_Abstract
0,,
1,"© 2018 The Institute of Electronics, Informati...",2018 institute electronics information communi...
2,© 2018 Elsevier LtdComputational fluid dynamic...,2018 elsevier ltdcomputational fluid dynamics ...
3,© 2018 Elsevier B.V. A superhydrophobic/supero...,2018 elsevier mesh successfully prepared simpl...
4,© 2018 Elsevier B.V. A label-free electrochemi...,2018 elsevier electrochemical dna sensor based...


In [134]:
# TF-IDF
vectorizer = TfidfVectorizer()
combined_vectors = vectorizer.fit_transform(df['Abstract'] + " " + df['Title'])  # Combine Abstract and Title for vectorization

print(f"Combined TF-IDF Matrix Shape: {combined_vectors.shape}")



Combined TF-IDF Matrix Shape: (2792, 31504)


In [135]:
inputdata = input("Enter your abstract text: ")

# Preprocess the input
processed_input = preprocess_text(inputdata)

# Vectorize the input
input_vector = vectorizer.transform([processed_input])

# Display processed input
print(f"Processed Input: {processed_input}")


Processed Input: al


In [136]:
similarities = cosine_similarity(input_vector, combined_vectors).flatten()

top_indices = similarities.argsort()[-10:][::-1]
recommended_titles = df.iloc[top_indices][['Title']]

# show top 10
n=1
print("Top 10 Recommended Titles:")
for i, row in recommended_titles.iterrows():
    print(f"{n}. {row['Title']}")
    n+=1


Top 10 Recommended Titles:
1. Reply to Chen et al
2. A novel paper-based colorimetry device for the determination of the albumin to creatinine ratio
3. The masseteric nerve: An anatomical study in Thai population with an emphasis on its use in facial reanimation
4. Enhanced Stability and Propene Yield in Propane Dehydrogenation on PtIn/Mg(Al)O Catalysts with Various In Loadings
5. Accuracy of axial length measurements obtained by optical biometry and acoustic biometry in rhegmatogenous retinal detachment: A prospective study
6. Characterization of different Si-and Al-based catalysts with pd modification and their use for catalytic dehydration of ethanol
7. Production of open-cell Al composite foams by direct casting with silica-gel beads
8. Preparation of aluminum doped zinc oxide targets and RF magnetron sputter thin films with various aluminum doping concentrations
9. AFOMP policy number 6: code of ethics for medical physicists in AFOMP Countries
10. Developement of Composite Armors 