In [1]:
# getting all the links from document
import requests
api_url = 'https://en.wikipedia.org/w/api.php' #base URL for the Web API of the English Wikipedia

some_params={'action': 'query',
            'titles': 'Web design',
            'prop': 'links', 
            'pllimit':'50',  # setting the number of links to 50.. can be replaced by any number
            'format': 'json'}
result = requests.get(url=api_url, params=some_params).json()
links = result['query']['pages']['34035']['links']

In [2]:
# stopwords removal and stemming...

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')

def remove_stopwords(exintro):
    stop_words = set(stopwords.words('english'))
    words = exintro.split(' ')
    article = ''
    for w in words:
        if w not in stop_words:
            article += w + ' '
    return article.strip()        
    
def stemming(exintro):
    ps = PorterStemmer()
    words = exintro.split(' ')
    article = ''
    for w in words:
        w_ = ps.stem(w)
        article += w_+' '
    return article.strip() 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sandipansikdar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# extract intro section, perform stopword removal and stemming and store them in respective files in the directory 'docs'

for l in links:
    title = l['title']
    params ={'action': 'query','titles': title,'prop': 'extracts','exintro': 'True','explaintext': 'True', 'format': 'json'}
    request = requests.get(url=api_url, params = params).json()
    page_id = list(request['query']['pages'].keys())[0]
    extract = request['query']['pages'][page_id]['extract']
    extract = remove_stopwords(extract)
    extract = stemming(extract)
    if len(extract)>0:
        with open('docs/'+title.replace('.',''),'w') as ft:
            ft.write(extract) 

In [5]:
# obtain term-frequency matrix
import numpy as np
import os
word2index = {}
document2index = {}
index2document = {}
document_word_vectors = {}
w_cnt = 0
d_cnt = 0
for root, dirs, files in os.walk('docs'):
    for f in files:
        document_word_vectors[f] = []
        document2index[f] = d_cnt
        index2document[d_cnt] = f
        d_cnt+=1
        with open(root+'/'+f) as fs:
            try:
                for line in fs:
                    words = line.strip().split()
                    for w in words:
                        if w not in word2index:
                            word2index[w] = w_cnt
                            w_cnt+=1
                        document_word_vectors[f].append(word2index[w])
            except:
                print (f)
                        
w_f_matrix = np.zeros((len(word2index),len(document2index)))
for doc in document_word_vectors:
    i = document2index[doc]
    for j in document_word_vectors[doc]:
        w_f_matrix[j,i]+=1 

In [6]:
w_f_matrix

array([[4., 0., 1., ..., 2., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [6., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [7]:
# preprocssing the query vector
q = 'web development design'
q = remove_stopwords(q)
q = stemming(q)
query = []
q_v = np.zeros(len(word2index))
for w in q.split():
    q_v[word2index[w]]+=1
    query.append(word2index[w])
max_ = np.max(q_v)
def normalize_query(i,max_):
    return 0.5+(0.5*i)/max_
norm_q = np.vectorize(normalize_query)
q_v = norm_q(q_v,max_)

In [8]:
#defining the similarity function
from scipy import spatial
def similarity(l_1,l_2):
    return 1 - spatial.distance.cosine(l_1, l_2)

In [9]:
# obtain normalized term-frequency matrix
t_f = np.copy(w_f_matrix)
max_f = np.zeros(len(document2index))
for i in range(len(document2index)):
    max_f[i] = np.max(t_f[:,i])
t_f = np.divide(t_f,max_f)  

In [10]:
def find_top_k(k,doc_sim,index2document):
    for doc,sim in sorted(doc_sim.items(),key = lambda x:x[1], reverse=True):
        print (index2document[doc])
        k-=1
        if k==0:
            break
def find_k_relevant_documents(t_f,q_v,k,index2document):
    doc_sim = {}
    for i in range(t_f.shape[1]):
        sim = similarity(t_f[:,i],q_v)
        doc_sim[i] = sim
    find_top_k(k,doc_sim,index2document)            

In [11]:
# top 5 documents based on only term frequency
find_k_relevant_documents(t_f,q_v,5,index2document)

Blog
Acid3
Architect-led design–build
Advertising
CERN


In [14]:
# obtaining tf-idf matrix
inv_doc_freq = np.count_nonzero(t_f,axis=1)
def normalize(a,x):
    return np.log(x/a)
norm = np.vectorize(normalize)
inv_doc_freq = norm(inv_doc_freq,len(document2index))
tf_idf = np.multiply(t_f,inv_doc_freq.reshape(-1,1))

In [15]:
tf_idf

array([[0.16762898, 0.        , 0.20953622, ..., 0.20953622, 0.        ,
        0.        ],
       [0.7613325 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.522665  , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.95166562],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.95166562],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.95166562]])

In [16]:
# normalizing the query
q_v = np.multiply(q_v,inv_doc_freq)

In [17]:
# top 5 documents by tf-idf
find_k_relevant_documents(tf_idf,q_v,5,index2document)

Blog
Advertising
Architect-led design–build
Chartered Society of Designers
Acid3


In [18]:
# statistical language model
t_f = np.copy(w_f_matrix)
def statistical_language_model(query, t_f, k, index2document):
    doc_similarity = {}
    for i in range(t_f.shape[1]):
        w_sum = np.sum(t_f[:,i])
        sim = 1
        for w in query:
            sim*=t_f[w,i]/w_sum
        doc_similarity[i] = sim
    find_top_k(k,doc_similarity,index2document)

In [19]:
# top 5 documents by statistical language model
statistical_language_model(query, t_f,5,index2document)

Acid1
Acid2
Chartered Society of Designers
Body text
Affective design


In [20]:
# latent semantic indexing (LSI)
from numpy import linalg
def latent_semantic_indexing(query, t_f, k, topk, index2document):
    doc_similarity = {}
    u,s,v = linalg.svd(t_f)
    u = u[:,:k]
    s_ = np.zeros((k,k))
    for i in range(k):
        s_[i,i] = s[i]
    v = v[:k,:]
    q_v = np.zeros(len(word2index))
    for q in query:
        q_v[q]+=1
    q_v = q_v.reshape(1,-1)
    q_v = np.matmul(q_v,u)
    s_ = linalg.inv(s_)
    q_v = np.matmul(q_v,s_)
    
    for i in range(v.shape[1]):
        sim = similarity(q_v,v[:,i])
        doc_similarity[i] = sim 
    find_top_k(topk, doc_similarity,index2document)

In [21]:
# top 5 documents by statistical language model (LSI)
latent_semantic_indexing(query, t_f, 3, 5, index2document)

Blueprint
Color theory
Algorithms-Aided Design (AAD)
Adaptive web design
Boiler design
