In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
#combine the two links
from urllib.parse import urljoin
import sqlite3
import re

In [18]:
#create a list of words to ignore
ignorewords = set(['the','of','to','and','a','in','is','it'])

class Crawler:
    #Initialize the crawler with the name of database
    def __init__(self,dbname):
        self.con = sqlite3.connect(dbname)
    #close the db
    def __del__(self):
        self.con.close()
    #提交
    def dbcommit(self):
        self.con.commit()
    #Auxilliary function for getting an entry id and adding it
    #if it's not present
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute("select rowid from %s where %s='%s'" %(table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute("insert into %s (%s) values ('%s')" %(table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    #Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url):
            return None
        print('Indexing'+url)
        
        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)
        # Get the URL id
        #获取url的id 调用getentryid函数，获取urlid 的rowid
        #若数据库中无此urlid的信息则insert
        urlid=self.getentryid('urllist','url',url)
        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: 
                continue
            #获取word的id 调用getentryid函数，获取wordid 的rowid
            #若数据库中无此wordid的信息则insert
            wordid=self.getentryid('wordlist','word',word)
            #将urlid 与 wordid location(本例中里location即为第几个word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))

    #Extract the text from an HTML page(no tags)
    def gettextonly(self,soup):
        v = soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=str(subtext)+'\n'
            return resulttext
        else:
            return v.strip()
    #separate the words by any non-whitespace character
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']
    #Return true if this url is already indexed
    def isindexed(self,url):
        u=self.con.execute("select rowid from urllist where url='%s'"%url).fetchone()
        if u!= None:
            v=self.con.execute('select * from wordlocation where urlid=%d'%u[0]).fetchone()
            if v!=None:
                return True
        return False
    #Add a link between two pages
    def addlinkref(self,urlFrom,urlTo,linkText):
        pass
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urlopen(page)
                except:
                    print("Could not open %s"%page)
                    continue
                soup = BeautifulSoup(c.read(),"lxml")
                #add the index for the link
                self.addtoindex(page,soup)
            
                links = soup("a")
                for link in links:
                    if('href' in dict(link.attrs)):
                        #combine the base url and the relative url
                        url = urljoin(page,link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == "http" and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages = newpages

In [20]:
pagelist = ["https://fr.wikipedia.org/wiki/Cergy"]
crawler = Crawler('searchindex.db')
#crawler.createindextables()
crawler.crawl(pagelist)

Indexinghttps://fr.wikipedia.org/wiki/Oise_(d%C3%A9partement)




Indexinghttps://fr.wikipedia.org/w/index.php?title=Cergy&action=edit&section=46
Indexinghttps://fr.wikipedia.org/wiki/Haute-Loire
Indexinghttps://fr.wikipedia.org/wiki/R%C3%A9seau_de_bus_Soci%C3%A9t%C3%A9_de_transports_interurbains_du_Val-d%27Oise
Indexinghttps://fr.wikipedia.org/wiki/%C3%89cole_maternelle
Indexinghttps://fr.wikipedia.org/wiki/Bastia
Indexinghttps://fr.wikipedia.org/wiki/Var_(d%C3%A9partement)
Indexinghttps://fr.wikipedia.org/wiki/Toulouse
Indexinghttps://fr.wikipedia.org/wiki/Gare_de_Cergy-Saint-Christophe
Indexinghttps://fr.wikipedia.org/wiki/Brunoy
Indexinghttps://fr.wikipedia.org/wiki/Thiais
Indexinghttps://fr.wikipedia.org/wiki/D%C3%A9mographie_de_la_France
Indexinghttps://fr.wikipedia.org/wiki/Ermont
Indexinghttps://fr.wikipedia.org/w/index.php?title=Cergy&veaction=edit&section=46
Indexinghttps://fr.wikipedia.org/w/index.php?title=Cergy&action=edit&section=44
Indexinghttps://fr.wikipedia.org/wiki/Amiens
Indexinghttps://fr.wikipedia.org/wiki/Hautes-Pyr%C3%A9n%C3%A

RecursionError: maximum recursion depth exceeded in comparison

In [21]:
class Searcher:
  def __init__(self,dbname):
    self.con=sqlite3.connect(dbname)

  def __del__(self):
    self.con.close()

  def getmatchrows(self,q):
    # Strings to build the query
    fieldlist='w0.urlid'
    tablelist=''  
    clauselist=''
    wordids=[]

    # Split the words by spaces
    words=q.split(' ')  
    tablenumber=0

    for word in words:
      # Get the word ID
      wordrow=self.con.execute(
      "select rowid from wordlist where word='%s'" % word).fetchone()
      if wordrow!=None:
        wordid=wordrow[0]
        wordids.append(wordid)
        if tablenumber>0:
          tablelist +=','
          clauselist +=' and '
          clauselist += '{0}.urlid={1}.urlid and '.format(tablenumber-1,tablenumber)
        fieldlist+='{0}.location'.format(tablenumber)
        tablelist+='wordlocation {0}'.format(tablenumber)      
        clauselist+='{0}.wordid={1}'.format(tablenumber,wordid)
        tablenumber+=1

    # Create the query from the separate parts
    fullquery='select {0} from {1} where {2}'.format(fieldlist,tablelist,clauselist)
    print(fullquery)
    cur=self.con.execute(fullquery)
    rows=[row for row in cur]

    return rows,wordids

  def getscoredlist(self,rows,wordids):
    totalscores=dict([(row[0],0) for row in rows])

    # This is where we'll put our scoring functions
    weights=[(1.0,self.locationscore(rows)), 
             (1.0,self.frequencyscore(rows)),
             (1.0,self.pagerankscore(rows)),
             (1.0,self.linktextscore(rows,wordids)),
             (5.0,self.nnscore(rows,wordids))]
    for (weight,scores) in weights:
      for url in totalscores:
        totalscores[url]+=weight*scores[url]

    return totalscores

  def geturlname(self,id):
    return self.con.execute(
    "select url from urllist where rowid=%d" % id).fetchone()[0]

  def query(self,q):
    rows,wordids=self.getmatchrows(q)
    scores=self.getscoredlist(rows,wordids)
    rankedscores=[(score,url) for (url,score) in scores.items()]
    rankedscores.sort()
    rankedscores.reverse()
    for (score,urlid) in rankedscores[0:10]:
      print('%f\t%s' % (score,self.geturlname(urlid)))
    return wordids,[r[1] for r in rankedscores[0:10]]

  def normalizescores(self,scores,smallIsBetter=0):
    vsmall=0.00001 # Avoid division by zero errors
    if smallIsBetter:
      minscore=min(scores.values())
      return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()])
    else:
      maxscore=max(scores.values())
      if maxscore==0: maxscore=vsmall
      return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])

  def frequencyscore(self,rows):
    counts=dict([(row[0],0) for row in rows])
    for row in rows: counts[row[0]]+=1
    return self.normalizescores(counts)

  def locationscore(self,rows):
    locations=dict([(row[0],1000000) for row in rows])
    for row in rows:
      loc=sum(row[1:])
      if loc<locations[row[0]]: locations[row[0]]=loc
    
    return self.normalizescores(locations,smallIsBetter=1)

  def distancescore(self,rows):
    # If there's only one word, everyone wins!
    if len(rows[0])<=2: return dict([(row[0],1.0) for row in rows])

    # Initialize the dictionary with large values
    mindistance=dict([(row[0],1000000) for row in rows])

    for row in rows:
      dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
      if dist<mindistance[row[0]]: mindistance[row[0]]=dist
    return self.normalizescores(mindistance,smallIsBetter=1)

  def inboundlinkscore(self,rows):
    uniqueurls=dict([(row[0],1) for row in rows])
    inboundcount=dict([(u,self.con.execute('select count(*) from link where toid=%d' % u).fetchone()[0]) for u in uniqueurls])   
    return self.normalizescores(inboundcount)

  def linktextscore(self,rows,wordids):
    linkscores=dict([(row[0],0) for row in rows])
    for wordid in wordids:
      cur=self.con.execute('select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
      for (fromid,toid) in cur:
        if toid in linkscores:
          pr=self.con.execute('select score from pagerank where urlid=%d' % fromid).fetchone()[0]
          linkscores[toid]+=pr
    maxscore=max(linkscores.values())
    normalizedscores=dict([(u,float(l)/maxscore) for (u,l) in linkscores.items()])
    return normalizedscores

  def pagerankscore(self,rows):
    pageranks=dict([(row[0],self.con.execute('select score from pagerank where urlid=%d' % row[0]).fetchone()[0]) for row in rows])
    maxrank=max(pageranks.values())
    normalizedscores=dict([(u,float(l)/maxrank) for (u,l) in pageranks.items()])
    return normalizedscores

  def nnscore(self,rows,wordids):
    # Get unique URL IDs as an ordered list
    urlids=[urlid for urlid in dict([(row[0],1) for row in rows])]
    nnres=mynet.getresult(wordids,urlids)
    scores=dict([(urlids[i],nnres[i]) for i in range(len(urlids))])
    return self.normalizescores(scores)

In [22]:
#crawler = crawler("searchindex.db")
#crawler.createindextable(True)
#crawler.createindextable()
#crawler.crawl(["https://en.wikipedia.org/wiki/Programming_language", "https://en.wikipedia.org/wiki/Functional_programming"])
searcher = Searcher('searchindex.db')
#crawler.con.execute('create index wordurlidx_1 on wordlocation(urlid)')
#Works badly for long queries, following for instance screws
#results = searcher.getmatchrows("Functional programming with Scala and python")

#Following doesn't work too well and returns 123689 results
q = 'Programming in Scala'
print("Searching for text '%s'" % q)
searcher.query(q)

Searching for text 'Programming in Scala'
select w0.urlid from  where 


OperationalError: near "where": syntax error

In [15]:
urlid= 111
wordid = 101
i= 00
t = "insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i)

In [16]:
t

'insert into wordlocation(urlid,wordid,location) values (111,101,0)'