In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import MiniBatchKMeans

In [2]:
#Read dataset
X = pd.read_csv("/Users/queene/Desktop/DM/pubmed-diabetesme-set.csv", sep=';', header=0)
X = X[pd.isna(X['PMID'])==False]
X = X[pd.isna(X['TI'])==False]
X = X[pd.isna(X['AB'])==False]
X = X[pd.isna(X['MH'])==False]

In [3]:
#Preview the file 
X.head(3)

Unnamed: 0,PMID,DP,TI,AB,MH
0,27080137,2016 Jun,Diabetes mellitus and its complications in India.,India is one of the epicentres of the global d...,"Developing Countries, Diabetes Complications/*..."
1,9803200,1998 Oct 15,Diagnosis and classification of diabetes melli...,New recommendations for the classification and...,"Blood Glucose/metabolism, Diabetes Mellitus/bl..."
2,15137354,2004 Apr-Jun,Pathophysiology of diabetes mellitus.,As we learn more about the pathophysiology of ...,"Diabetes Mellitus, Type 1/complications/*metab..."


In [4]:
#FEATURE 2: Article Ranking using Cosine Similarity 

#TFIDF Calculation
text_abstract = X['AB']
vector = TfidfVectorizer(max_df=0.3,               # Drop words occuring more than X percent of articles 
                             min_df=8,             # Only use words appear at least X times
                             stop_words='english', # Remove stop words
                             lowercase=True,       # Convert to lowercase 
                             use_idf=True,         # Use idf
                             norm=u'l2',           # Normalization
                             smooth_idf=True       # Prevents divide-by-zero errors
                            )

tfidf = vector.fit_transform(text_abstract)

In [5]:
#Search Function 
# Request function : search the top_n articles from a request ( request = string)
def search(tfidf_matrix,model,request, top_n = 5):
    request_transform = model.transform([request])
    similarity = np.dot(request_transform,np.transpose(tfidf_matrix))
    x = np.array(similarity.toarray()[0])
    indices=np.argsort(x)[-5:][::-1]
    return indices

# Find similar : get the top_n articles similar to an article 
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [index for index in related_docs_indices][0:top_n]    

# Print the result
def print_result(request_content,indices,X):
    print('\nSearch : ' + request_content)
    print('\nBest Results :')
    for i in indices:
        print('Index = {0:5d} - Title = {1}'.format(i,X['TI'].loc[i]))
        

In [6]:
#Similar articles 
index = 1
result = find_similar(tfidf, index, top_n = 5)
print_result('04 - Abstract = Diabetes mellitus occurs commonly in the older patient and is frequently undiagnosed. In many cases, the initial manifestations of diabetes mellitus are lower extremity complications with which this disease is known to be associated. Unfortunately, these complications are similar to other age-related degenerative processes; therefore, vigilance for the detection of undiagnosed diabetes mellitus is essential in the evaluation and treatment of lower extremity pathologic conditions.',result,X)


Search : 04 - Abstract = Diabetes mellitus occurs commonly in the older patient and is frequently undiagnosed. In many cases, the initial manifestations of diabetes mellitus are lower extremity complications with which this disease is known to be associated. Unfortunately, these complications are similar to other age-related degenerative processes; therefore, vigilance for the detection of undiagnosed diabetes mellitus is essential in the evaluation and treatment of lower extremity pathologic conditions.

Best Results :
Index =    26 - Title = [Post-transplantational diabetes mellitus].
Index =    49 - Title = [Diabetes mellitus: from clinical knowledge to public health concern].
Index =    36 - Title = Diabetes: an overview.
Index =    45 - Title = [Diabetes mellitus: current classification based on cause and sharpened blood
Index =    38 - Title = Personalized medicine in diabetes.


In [None]:
#FEATURE 1: MeSH Recommendation 