Twitter

In [1]:
import os, sys; 
#sys.path.insert(0, ".", "..", ".."))

from pattern.web import Twitter, hashtags
from pattern.db  import Datasheet, pprint, pd

# This example retrieves tweets containing given keywords from Twitter.

try:
    # Tweets are stored in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # The first column holds a unique id for each tweet.
    # We only want to add the latest tweets, i.e., those we haven't seen yet,
    # so we keep a set of the ids in the first column for quick membership checks.
    # The pd() function returns the parent directory of this script + any given path.
    table = Datasheet.load(pd("cool.csv"))
    index = set(table.columns[0])
except Exception:
    # The stored table could not be loaded (e.g., on the first run):
    # start from an empty table and an empty index.
    # Catching Exception instead of using a bare except
    # lets KeyboardInterrupt and SystemExit propagate.
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print i
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text) # Keywords in tweets start with a "#".
        print
        # Only add the tweet to the table if it doesn't already exists.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...

pprint(table, truncate=100)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")


0

RT @MMP_UTK: Oh Me,Oh My! Your Contest #3, Kenneth D. Richmond, is cooler than a Coogi Sweater &amp; exudes style and charm in any type of weat…
OohsoGenie
Wed Oct 26 23:20:13 +0000 2016
[u'#3']


RT @LaytonEWilliams: Post-work hootenanny at @Sojourners. My work is cooler than your work. #SojoLife #mypeople https://t.co/ZIzDQPoA9W
Sojourners
Wed Oct 26 23:17:49 +0000 2016
[u'#SojoLife', u'#mypeople']


The NBA is 5,000 times cooler than the NFL. #PACERNATION
sbizila
Wed Oct 26 23:13:50 +0000 2016
[u'#PACERNATION']


@tornado__ii This bird is cooler than me
Tiramiswhat
Wed Oct 26 23:13:17 +0000 2016
[]


My dog is definitely cooler than me https://t.co/GBHjEtBjvy
ldy3r97
Wed Oct 26 23:07:54 +0000 2016
[]


RT @ChiBuildings: How is it possible that the new sections of the riverwalk are cooler than last year's new sections? https://t.co/FfCNPGaD…
ski7days
Wed Oct 26 23:03:51 +0000 2016
[]


How is it possible that the new sections of the riverwalk are cooler than last year's new se

Google

In [2]:
import os, sys

from pattern.web import Google, plaintext
from pattern.web import SEARCH

# The pattern.web module has a SearchEngine class,
# with a SearchEngine.search() method that yields a list of Result objects.
# Each Result has url, title, text, language, author and date properties.
# Subclasses of SearchEngine include: 
# Google, Bing, Yahoo, Twitter, Facebook, Wikipedia, Wiktionary, Flickr, ...

# This example retrieves results from Google based on a given query.
# The Google search engine can handle SEARCH type searches.
# Other search engines may also handle IMAGE, NEWS, ...

# Google's "Custom Search API" is a paid service.
# The pattern.web module uses a test account by default,
# with 100 free queries per day shared by all Pattern users.
# If this limit is exceeded, SearchEngineLimitError is raised.
# You should obtain your own license key at: 
# https://code.google.com/apis/console/
# Activate "Custom Search API" under "Services" and get the key under "API Access".
# Then use Google(license=[YOUR_KEY]).search().
# This will give you 100 personal free queries, or $5 per 1000 queries.
engine = Google(license=None, language="en")

# Veale & Hao's method for finding similes using wildcards (*):
# http://afflatus.ucd.ie/Papers/LearningFigurative_CogSci07.pdf
# This will match results such as:
# - "as light as a feather",
# - "as cute as a cupcake",
# - "as drunk as a lord",
# - "as snug as a bug", etc.
q = "as * as a *"

# Google is very fast but you can only get up to 100 (10x10) results per query.
for i in range(1, 2):
    for result in engine.search(q, start=i, count=10, type=SEARCH, cached=True):
        print plaintext(result.text) # plaintext() removes all HTML formatting.
        print result.url
        print result.date
        print

AS similes such as 'as big as a bus'. Vocabulary for ESL ...
https://www.englishclub.com/vocabulary/figures-similes-list.htm


A $100 million 10-year investment in conjunction with e-commerce giant Alibaba
will establish men's and women's leagues as well as a sevens programme.
http://en.as.com/


Film speed is the measure of a photographic film's sensitivity to light, determined
by ...... as well as a user-adjustable SOS value. In all cases, the camera should ...
https://en.wikipedia.org/wiki/Film_speed


... While renewable energy is extremely popular among the German public, power
lines are not.
http://www.nytimes.com/2015/12/08/business/energy-environment/change-isnt-as-easy-as-a-flip-of-a-switch.html
Dec 7, 2015

Asafoetida /æsəˈfɛtᵻdə/ is the dried latex (gum oleoresin) exuded from the
rhizome or tap root ..... foetida, F. narthex, F. rubricaulis (Hing; Devil's Dung;
Asafoetida) as well as a discussion of health benefits and usage in clinical
practice.
https://en.wikipedia.

Google Translate:
This fails with HTTP401Authentication, because the Google Translate API is a paid service.

The Crawler is nice; combine it with the HTML parsing tools.

In [5]:
import os, sys

from pattern.web import Crawler, DEPTH, BREADTH, FIFO, LIFO

# This example demonstrates how to use the Crawler class for web crawling.

# -------------------------------------------------------------------------------------------------
# First, we need a subclass of Crawler with its own Crawler.visit() method.
# The visit() method takes two parameters: the visited link and the HTML source.
# We could parse the HTML DOM to extract information we need, for example.
# Anything that is not HTML (e.g., a JPEG file) is passed to Crawler.fail().

class SimpleCrawler1(Crawler):
    
    def visit(self, link, source=None):
        print "visiting:", link.url, "from:", link.referrer
        
    def fail(self, link):
        print "failed:", link.url

# Create a new crawler.
# 1) The links parameter is a list of URL's to visit.
#    The crawler will visit the first link, extract new links from the HTML, and queue these for a visit too.
# 2) The domains parameter is a list of allowed domains.
#    The crawler will never leave these domains.
# 3) The delay parameter specifies a number of seconds to wait before revisiting the same domain.
#    In the meantime, other queued links will be crawled if possible.

crawler1 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], domains=["ua.ac.be"], delay=0.0)

print "CRAWLER 1 " + "-" * 50
while len(crawler1.visited) < 5: # Crawler.visited is a dictionary of all URL's visited so far.
    # The Crawler.crawl() method has the same optional parameters as URL.download(),
    # for example: cached=True, proxy=("proxy.com", "https"), ...
    crawler1.crawl(cached=False)

# -------------------------------------------------------------------------------------------------
# Typically, you'll want a crawler that runs in an endless loop as a background process,
# and just keeps on visiting new URL's. In this case, it is rude to use a delay of 0.0,
# because you will keep hammering servers with automated requests.
# A higher delay (in a real-world scenario, say 30 seconds) is better:

crawler2 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], domains=["ua.ac.be"], delay=0.1)

print
print "CRAWLER 2 " + "-" * 50
while True:
    crawler2.crawl(cached=False)
    print "wait..."
    # Of course we don't want this example to run forever,
    # so we still add a stop condition:
    if len(crawler2.visited) > 2:
        break

# -------------------------------------------------------------------------------------------------
# If you create a crawler without a domains=[..] restriction, it is free to roam the entire web.
# What to visit first? You can use Crawler.crawl() with an optional "method" parameter.
# When set to DEPTH, it prefers to visit links in the same domain.
# When set to BREADTH, it prefers to visit links to other domains.
# Observe the difference between crawler3 and crawler4,
# which use DEPTH and BREADTH respectively.

crawler3 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.0)

print
print "CRAWLER 3 " + "-" * 50
while len(crawler3.visited) < 3:
    crawler3.crawl(method=DEPTH)
    
crawler4 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.0)

print
print "CRAWLER 4 " + "-" * 50
while len(crawler4.visited) < 3:
    crawler4.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# With Crawler.crawl(method=DEPTH) and a delay,
# the crawler will wait between requests to the same domain.
# In the meantime, it will visit other links.
# Usually this means that it will alternate between a couple of domains:

crawler5 = SimpleCrawler1(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.1)

print
print "CRAWLER 5 " + "-" * 50
while len(crawler5.visited) < 4:
    crawler5.crawl(method=DEPTH)

# -------------------------------------------------------------------------------------------------
# A BREADTH-crawler in an endless crawl loop will eventually queue the entire web for a visit.
# But this is not possible of course: we can't keep the entire web in memory.
# When the number of queued links exceeds Crawler.QUEUE (10,000 by default),
# less relevant queued links will be discarded.
# "Less relevant" depends on two settings:
# 1) First, there is the Crawler.priority() method that returns a number between 0.0-1.0 for a link.
#    Links with a higher priority are more relevant and will be visited sooner.
# 2) Links with an equal priority are queued either FIFO or LIFO.
#    FIFO means first-in-first-out: the earliest queued links will be visited sooner.
#    LIFO means last-in-first-out: more recently queued links will be visited sooner.

class SimpleCrawler2(Crawler):
    
    def visit(self, link, source=None):
        print "visiting:", link.url, "from:", link.referrer
    
    def priority(self, link, method=DEPTH):
        if "?" in link.url:
            # This ignores links with a querystring.
            return 0.0
        else:
            # Otherwise use the default priority ranker,
            # i.e. the priority depends on DEPTH or BREADTH crawl mode.
            return Crawler.priority(self, link, method)

# Note the LIFO sort order. 
# This will make more recently queued links more relevant.
# If you observe the given URL in a browser,
# you'll notice that the last external link at the bottom of the page is now visited first.
crawler6 = SimpleCrawler2(links=["http://www.clips.ua.ac.be/pages/pattern/"], delay=0.1, sort=LIFO)

print
print "CRAWLER 6 " + "-" * 50
while len(crawler6.visited) < 4:
    crawler6.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# In the long run, the Crawler.visited dictionary will start filling up memory too.
# If you want a single crawler that runs forever, you should empty the dictionary every now and then,
# and instead use a strategy with a persistent database of visited links,
# in combination with Crawler.follow().
# Another strategy would be to use different DEPTH-crawlers for different domains,
# and delete them when they are done.

CRAWLER 1 --------------------------------------------------
visiting: http://www.clips.ua.ac.be/pages/pattern/ from: 
visiting: http://www.clips.ua.ac.be/pages/pattern#navigation from: http://www.clips.ua.ac.be/pages/pattern/
visiting: http://www.clips.ua.ac.be/ from: http://www.clips.ua.ac.be/pages/pattern/
failed: http://www.clips.ua.ac.be/media/pattern-2.6.zip
failed: http://www.clips.ua.ac.be/media/pattern-2.5.zip

CRAWLER 2 --------------------------------------------------
visiting: http://www.clips.ua.ac.be/pages/pattern/ from: 
wait...
visiting: http://www.clips.ua.ac.be/pages/pattern#navigation from: http://www.clips.ua.ac.be/pages/pattern/
wait...
visiting: http://www.clips.ua.ac.be/ from: http://www.clips.ua.ac.be/pages/pattern/
wait...

CRAWLER 3 --------------------------------------------------
visiting: http://www.clips.ua.ac.be/pages/pattern/ from: 
visiting: http://www.clips.ua.ac.be/pages/pattern#navigation from: http://www.clips.ua.ac.be/pages/pattern/
visiting: htt