## 1 Download the packages and install

In [0]:
pip install rank_bm25

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Building wheels for collected packages: rank-bm25
  Building wheel for rank-bm25 (setup.py) ... [?25l[?25hdone
  Created wheel for rank-bm25: filename=rank_bm25-0.2-cp36-none-any.whl size=4162 sha256=08eaf7f2f622490070bd25cb5374b6885a090b9ed46e28ec1d80728d7d4a514f
  Stored in directory: /root/.cache/pip/wheels/6f/0c/1f/78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
Successfully built rank-bm25
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2


In [0]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## 2 Import the packages

In [0]:
import re
import requests
import warnings
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from string import digits
from sklearn import metrics
from sklearn.svm import SVC
from bs4 import BeautifulSoup
from rank_bm25 import BM25Okapi
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler 

## 3 Achievement

### 3.1 Calculate and get common words

In [0]:
def create_dictionary(my_wordlist):
    word_count = {}

    for word in my_wordlist:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    c = Counter(word_count)
    # returns the most occurring elements
    top = c.most_common(20)
    print("The most frequent words are:", top)

### 3.2 Get the date

In [0]:
def my_date(my_soup):
    a = []
    for each_text_date1 in my_soup.findAll('span', {'class': 'revDate'}):
        a.append(each_text_date1.text)
    for each_text_date2 in my_soup.findAll('div', {'class': 'pubdate'}, {'class': 'h-color--green'}):
        a.append(each_text_date2.text)
    for each_text_date3 in my_soup.findAll('span', {'id': 'last-reviewed-date'}):
        a.append(each_text_date3.text)
    for each_text_date4 in my_soup.findAll('class', {'id': 'css-12r7hj0'}):
        a.append(each_text_date4.text)
    for each_text_date5 in my_soup.findAll('dd', {'id': 'date_posted'}):
        a.append(each_text_date5.text)


    if a != []:
        for s in a:
            if s is not None:
                print("Date:", s)
                split_list = []
                m = re.split('[, -]', s)
                for i in m:
                    if i.startswith('20'):
                        split_list.append(i)

        if split_list[0] is None:
            date = 2016.80882
        else:
            date = int(split_list[0])

    if a == []:
        print("Date:", None)
        date = 2016.80882
    
    return date

### 3.3 Get the author

In [0]:
def my_Author(my_soup):
    person_list = []
    for each_text_writer1 in my_soup.findAll('a', {'class': 'person'}):
        person_list.append(each_text_writer1.text)
    if person_list == []:
        print("Author:", None)
    if person_list != []:
        print("Author:", person_list)
    return len(person_list)

### 3.4 Get the reference

In [0]:
def my_reference(my_soup):
    reference_list = []
    for each_text_ref in my_soup.findAll('div', {'class': 'reference'}):
        reference_list.append(each_text_ref.text)
    for each_text_ref in my_soup.findAll('span', {'class': 'sources'}):
        reference_list.append(each_text_ref.text)
    if reference_list == []:
        print("The number of references is: ", None)
    if reference_list != []:
        print("The number of references is :", len(reference_list))
    return len(reference_list)

### 3.5 BM25 algorithm to get scores about content and problem relevance

In [0]:
def Bm25(question,clean_data):
  tokenized_corpus = [clean_data]
  bm25 = BM25Okapi(tokenized_corpus)
  query = question
  tokenized_query = query.split(" ")

  doc_scores = abs(bm25.get_scores(tokenized_query))
  print("The Bm25 score about ","'", question,"'",":",doc_scores[0])
  return doc_scores[0]

### 3.6 TF-IDF algorithm to get scores about content and problem relevance

In [0]:
def Tf_Idf(question,clean_data):
    new_data = " ".join(clean_data)
    corpus = [new_data]  

    vectorizer = CountVectorizer()  
    X = vectorizer.fit_transform(corpus)
    
    query = question
    tokenized_query = query.split(" ")
    myvocabulary = tokenized_query


    transformer = TfidfTransformer()  

    tfidf = transformer.fit_transform(X)  

    word=vectorizer.get_feature_names()
    weight=tfidf.toarray()  

    for i in range(len(weight)):
        total_score = 0
        for j in range(len(word)):

            if word[j] in myvocabulary:
                total_score = total_score + weight[i][j] 
#                 print(word[j],weight[i][j])

        print("The tf-idf total score about ","'",question,"'", ":", total_score)
    return total_score

### 3.7 PageRank algorithm to get scores about url

In [0]:
def my_pagerank(url, my_soup):
    external_link = []
    internal_link = []
    new_link = []
    d =[]
    links = [a['href'] for a in my_soup.find_all('a', href=True)]
    print("The number of links:", len(links))
    for link in links:
        exter_link = re.findall('(https?://|//www)(?:[-\w.]|(?:%[\da-fA-F]{2}))+', link)
        if exter_link:
            external_link.append(link)
        else:
            internal_link.append(link)
    print("The number of external links is: ", len(external_link))
    # print(internal_link)
    
    for i in external_link:
        ss = [url, i]
        d.append(ss)

    for m in internal_link:
        ss = [m, url]
        d.append(ss)

    edges = d
    nodes = []
    for edge in edges:
        if edge[0] not in nodes:
            nodes.append(edge[0])
        if edge[1] not in nodes:
            nodes.append(edge[1])

    N = len(nodes)

    i = 0
    node_to_num = {}
    for node in nodes:
        node_to_num[node] = i
        i += 1
    for edge in edges:
        edge[0] = node_to_num[edge[0]]
        edge[1] = node_to_num[edge[1]]

    S = np.zeros([N, N]) # make the Matrix
    for edge in edges:
        S[edge[1], edge[0]] = 1

    for j in range(N):
        sum_of_col = sum(S[:, j])
        for i in range(N):
            if sum_of_col == 0:
                S[i, j] = S[i, j]
            else:
                S[i, j] /= sum_of_col

    alpha = 0.85
    A = alpha * S + (1 - alpha) / N * np.ones([N, N])

    P_n = np.ones(N) / N
    P_n1 = np.zeros(N)

    e = 100000
    k = 0

    while e > 0.00000001:
        P_n1 = np.dot(A, P_n)
        e = P_n1 - P_n
        e = max(map(abs, e))
        P_n = P_n1
        k += 1
    print(" ")
    print('The scores of PageRank:', P_n[0])

    return len(external_link)


#### 3.7.1 Graph about relationship of nodes

In [0]:
 def draw_my_PageRank(): 
    G = nx.DiGraph()
    plt.figure(figsize=(20, 20))
    new_edges = [[0, 'external_link 0'], [0, 'external_link 1'], [0, 'external_link 2'], [0, 'external_link 3'],
             [0, 'external_link 4'], [0, 'external_link 5'], [0, 'external_link 6'], [0, 'external_link 7'],
             [0, 'external_link 8'], [0, 'external_link 9'], ['internal 0', 0], ['internal 1', 0],
             ['internal 2', 0], ['internal_3', 0], ['internal 4', 0]]
    for edge in new_edges:

        G.add_edge(edge[0], edge[1])

    nx.draw(G, with_labels=True, node_size=500, edge_vmin=2, edge_vmax=3)

    plt.show()

### 3.8 Content preprocessing and get link number

In [0]:
def my_features(q,url):
    new_count = 0
    my_wordlist = []
    my_clean_list = []
    my_source_code = requests.get(url).text
    my_soup = BeautifulSoup(my_source_code, 'html.parser')
    
    for each_text_l in my_soup.findAll('li'):
        content_l = each_text_l.text
        words_l = content_l.lower().split()
        for each_word_l in words_l:
            my_wordlist.append(each_word_l)
       
    for word in my_wordlist:
        symbols = '!&@#$%^&*()_-+={[}]|\;:"<>?/., ' # delete the Special symbol such as HTML 
        
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], '')

        if len(word) > 0:
            word1 = re.findall(r'[^\*("&/:?\\|)<>]', word, re.S) # delete the "()" and "&"
            word1 = "".join(word1)
            word1 = re.sub("^\d+\s|\s\d+\s|\s\d\w\d|\s\d+$", " ", word1) # delete the number in words
            num = word1.isdigit()
            if not num:
                my_clean_list.append(word1) # delete the number in string
    
    q = re.findall(r'[^\*("&/:?\\|)<>]', q, re.S) # delete the Special symbol such as "?" in question
    q = "".join(q)
    
    my_clean_list = [w for w in my_clean_list if w not in stopwords.words("english")] # delete the stopwords
    my_clean_list = [WordNetLemmatizer().lemmatize(w) for w in my_clean_list] # standardize different variations and distortions of a word

    
    dataset1 = pd.read_csv('heart_word.csv')
    X = dataset1.iloc[:, 0].values
    X= [x.lower() for x in X]
    for single_word in my_clean_list:
        if single_word in X:
            new_count += 1
    print(" ")
    print("-----------------------------------------------------------------------------------------------------------------------")
    if new_count == 0:
        print(" ")
        print("This webpage is not heart disease related webpage, so the final rank is 0")
        Predict_time, Predict_reference, Predict_external_link, Predict_count_words, Predict_Tf_Idf, Predict_Bm25 = 0,0,0,0,0,0
    
    if new_count != 0:
        print("The number of academic words is ",new_count, ", this webpage is heart disease related webpage")
        print(" ")
        print("The result about this url and question: ")
        print(" ")
        Predict_time = my_date(my_soup)
        my_Author(my_soup)
        Predict_reference = my_reference(my_soup)
        print("The number of words:", len(my_clean_list))
        Predict_external_link = my_pagerank(url, my_soup)
        Predict_Bm25 = Bm25(q, my_clean_list)
        Predict_Tf_Idf = Tf_Idf(q, my_clean_list)
        Predict_count_words = len(my_clean_list)
        create_dictionary(my_clean_list)
        print("-----------------------------------------------------------------------------------------------------------------------")

    return Predict_time, Predict_reference, Predict_external_link, Predict_count_words, Predict_Tf_Idf, Predict_Bm25 

### 3.9 Predict final rank

In [0]:
def my_score(q,url):
    warnings.filterwarnings("ignore")
    test_data=[]
    a,b,c,d,e,f = my_features(q,url)
 
    if a != 0:

        test_data = [[a,b,c,d,e,f]]
        test_data = np.array(test_data)

        dataset = pd.read_csv('test.csv')  
        X = dataset.iloc[:, 0:-1].values
        X= np.append(test_data,X,axis=0)
 
        min_max_scaler = MinMaxScaler()
        X = min_max_scaler.fit_transform(X)

        mm = X[0]
        mm = [mm.tolist()]
        mm = np.array(mm)
        
        XX = np.delete(X, 0, 0)

        Y = dataset.iloc[:, -1].values

        data_train, data_test, label_train, label_test = train_test_split(XX, Y, test_size=0.1, random_state=0)

        
        new_svc = SVC(kernel='linear',C= 54)
        svc_model = new_svc.fit(data_train,label_train)
        
        svc_pred = new_svc.predict(mm)
        
        print(" ")
        print("The rank of this webpage quality (The range of rank is 1 - 7):")
        print("The final rank of this webpage ", svc_pred[0])

        if svc_pred[0] >= 6:
            print("This quality of this health webpage is good")
        if 6 > svc_pred[0] >= 4:
            print("This quality of this health webpage is general")
        if svc_pred[0] < 4:
            print("This quality of this health webpage is bad")

## 4 Main function

Test question and Url:

Q: What are some of the risk factors for heart disease

1. https://www.webmd.com/heart-disease/risk-factors-for-heart-disease

2. https://www.hri.org.nz/health/learn/cardiovascular-disease/atherosclerosis?gclid=Cj0KCQiA-4nuBRCnARIsAHwyuPrugm6XLQ10jbRWlTOJ2-rHFzdyLK8QFaIdpROz7e28iWSPQVlfP0IaAqjJEALw_wcB

3. https://time.com/5388659/how-to-keep-your-heart-healthy/



In [0]:
if __name__ == '__main__':
    q = input("Please input your question about health: ")
    url1 = input("Please input the url about this question: ")
    my_score(q,url1)
#     draw_my_PageRank()

Please input your question about health: What are some of the risk factors for heart disease
Please input the url about this question: https://www.webmd.com/heart-disease/risk-factors-for-heart-disease
 
-----------------------------------------------------------------------------------------------------------------------
The number of academic words is  29 , this webpage is heart disease related webpage
 
The result about this url and question: 
 
Date:  on July 01, 2019
Author: None
The number of references is : 1
The number of words: 1184
The number of links: 315
The number of external links is:  299
 
The scores of PageRank: 1.0268368872138536e-09
The Bm25 score about  ' What are some of the risk factors for heart disease ' : 1.9396105672078172
The tf-idf total score about  ' What are some of the risk factors for heart disease ' : 0.920452743982664
The most frequent words are: [('heart', 39), ('disease', 23), ('health', 21), ('webmd', 20), ('healthy', 19), ('cholesterol', 18), ('ri