
Commit

made twitter_search parser use Tweepy subsystem
Vhati committed Oct 10, 2012
1 parent 92e3f7d commit 4ecb4e2
Showing 5 changed files with 111 additions and 90 deletions.
1 change: 0 additions & 1 deletion config.py
@@ -86,7 +86,6 @@
parser_options["twitter_search.reply_name"] = "MockTM"
parser_options["twitter_search.since_date"] = datetime(2012, 9, 13)
parser_options["twitter_search.until_date"] = datetime(2012, 9, 15)
parser_options["twitter_search.passes"] = 1

# Exporter-specific options.
exporter_options = {}
9 changes: 4 additions & 5 deletions lib/parsers/twitter_mentions.py
@@ -21,8 +21,8 @@

def get_description():
return ("Collects snarks from your Twitter screen name and @mentions.\n"+
"This is much more reliable than a plain search,\n"+
"but it only works for your own Twitter account.\n"+
"This is much more reliable than a plain search, "+
"but it only works for your own account.\n"+
"On first use, it will prompt for authorization.")

def get_arginfo():
@@ -38,7 +38,7 @@ def get_arginfo():
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
"""Collects snarks from your Twitter screen name and @mentions.
This is much more reliable than a plain search, but it only
works for your own Twitter account.
works for your own account.
This parser adds non-standard attributes to snarks:
"user_url" and "msg_url", links to the user's twitter
@@ -102,7 +102,6 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
results = tweepy_func(**tweepy_func_args)
api_hp -= 1
if (not results):
print "No more results"
done = True
break
else:
@@ -124,7 +123,7 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu

snark["date"] = status.created_at

snark["user_url"] = "http://www.twitter.com/%s" % status.author.screen_name
snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(status.author.screen_name)
snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(status.author.screen_name), status.id)

if (until_date and snark["date"] > until_date):
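Aside (not part of the commit): the change above runs the screen name through common.asciify before it is interpolated into user_url, matching what msg_url already did. The real helper's implementation is not shown in this diff; the following is only a hypothetical stand-in illustrating the assumed behavior under Python 2.

# Hypothetical stand-in for lib.common.asciify -- an assumption, not the
# project's actual code. The guess is that it forces unicode text down to
# a plain ASCII str, so %-formatting into byte-string URLs cannot raise
# UnicodeEncodeError on non-ASCII screen names.
def asciify(text):
    if (isinstance(text, unicode)):
        return text.encode("ascii", "ignore")
    return text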
189 changes: 106 additions & 83 deletions lib/parsers/twitter_search.py
@@ -10,19 +10,22 @@
from lib import arginfo
from lib import common
from lib import global_config
from lib.subsystems import tweepy_backend


# Namespace for options.
ns = "twitter_search."

# Names of lib.subsystem modules that should be set up in advance.
required_subsystems = []
required_subsystems = ["tweepy_backend"]


def get_description():
return ("Collects snarks from a Twitter search.\n"+
"Finds tweets from an account and any @reply mentions of it.\n\n"+
"Note: Twitter's search API only reaches back a few days.")
"Finds tweets from any account and @reply mentions of it.\n\n"+
"Note: Twitter's search API only reaches back a few days "+
"and may be incomplete.\n"+
"On first use, it will prompt for authorization.")

def get_arginfo():
args = []
@@ -35,28 +38,20 @@ def get_arginfo():
args.append(arginfo.Arg(name="until_date", type=arginfo.DATETIME,
required=False, default=None, choices=None, multiple=False,
description="UTC date to limit dredging up new tweets."))
args.append(arginfo.Arg(name="passes", type=arginfo.INTEGER,
required=False, default=1, choices=None, multiple=False,
description="Search X times to fill omissions in results."))
return args

def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
"""Collects snarks from a Twitter search. Finds
tweets from an account and any @reply mentions of it.
tweets from any account and @reply mentions of it.
See: https://dev.twitter.com/docs/api/1/get/search
This parser adds non-standard attributes to snarks:
"user_url" and "msg_url", links to the user's twitter
page and to the specific tweet. Exporters might
disregard this info.
Any given search may be incomplete, but this parser can
make multiple passes to mitigate that. It's recommended
that you initially search and export to a temporary
pickled_snarks file; then parse THAT repeatedly to apply
adjustments and export to the final desired format.
Twitter's search API only reaches back a few days. :/
Twitter's search API only reaches back a few days
and may be incomplete. :/
:param src_path: Not used.
:param first_msg: If not None, ignore comments prior to one containing this substring.
@@ -67,8 +62,6 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
UTC Datetime to limit dredging up old tweets.
until_date (optional):
UTC Datetime to limit dredging up new tweets.
passes (optional):
Search X times to fill omissions in results.
:param keep_alive_func: Optional replacement to get an abort boolean.
:param sleep_func: Optional replacement to sleep N seconds.
:return: A List of snark dicts.
@@ -77,8 +70,6 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
if (sleep_func is None): sleep_func = global_config.nap

search_url = "http://search.twitter.com/search.json"

since_date = None
if (ns+"since_date" in options and options[ns+"since_date"]):
since_date = options[ns+"since_date"]
@@ -87,83 +78,115 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
if (ns+"until_date" in options and options[ns+"until_date"]):
until_date = options[ns+"until_date"]

passes_remaining = 0
if (ns+"passes" in options and options[ns+"passes"] > 1):
passes_remaining = options[ns+"passes"] - 1

missing_options = [o for o in ["reply_name"] if ((ns+o) not in options or not options[ns+o])]
if (len(missing_options) > 0):
logging.error("Required parser options weren't provided: %s." % ", ".join(missing_options))
raise common.ParserError("Parser failed.")

# List of pattern/replacement tuples to strip reply topic from comments.
reply_name_escaped = re.escape(options[ns+"reply_name"])
reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
(re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

query = urllib2.quote("@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"]))
if (since_date): query += urllib2.quote(" since:%s" % since_date.strftime("%Y-%m-%d"))
if (until_date): query += urllib2.quote(" until:%s" % until_date.strftime("%Y-%m-%d"))
original_url = "%s?q=%s&rpp=100&page=1&result_type=recent" % (search_url, query)
url = original_url
logging.debug("Url: %s" % url)

snarks = []
pass_result_count = 0

while (keep_alive_func() and url is not None):
notice_url = re.sub("[^?]+/search.json.*[?&](page=[0-9]+).*", "\g<1>", url)
logging.info("Parsing: %s" % notice_url)
sleep_func(1)

json_obj = None
try:
with contextlib.closing(urllib2.urlopen(url)) as server_response:
json_obj = json.loads(server_response.read())
except (urllib2.HTTPError) as err:
logging.error("Http status: %d" % err.code)
raise common.ParserError("Parser failed.")
except (urllib2.URLError) as err:
logging.error(str(err))
raise common.ParserError("Parser failed.")

if (not json_obj or "results" not in json_obj):
logging.error("Failed to parse search results.")
raise common.ParserError("Parser failed.")

pass_result_count += len(json_obj["results"])
for result in json_obj["results"]:
snark = {}
snark["user"] = "@%s" % result["from_user"]
snark["msg"] = result["text"]
for (reply_ptn, reply_rep) in reply_regexes:
snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

snark["date"] = datetime.strptime(result["created_at"] +" UTC", "%a, %d %b %Y %H:%M:%S +0000 %Z")

snark["user_url"] = "http://www.twitter.com/%s" % result["from_user"]
snark["msg_url"] = "http://twitter.com/#!/%s/status/%s" % (result["from_user"], result["id"])

snarks.append(snark)

if ("next_page" in json_obj):
url = search_url + json_obj["next_page"]
elif (passes_remaining > 0):
logging.info("Pass complete: %d results." % pass_result_count)
passes_remaining -= 1
url = original_url
pass_result_count = 0
else:
url = None

tweepy = tweepy_backend.get_tweepy()
tweepy_api = tweepy_backend.get_api()

try:
# List of pattern/replacement tuples to strip reply topic from comments.
reply_name_escaped = re.escape(options[ns+"reply_name"])
reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
(re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

rate_status, rate_parsed = tweepy_backend.rate_limit_status()
api_hp = rate_parsed["remaining_hits"]
limit_reset_date = rate_parsed["reset_time"]

search_args = {"rpp":100, "include_entities":"false", "result_type":"recent"}
search_args["q"] = "@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"])
if (since_date): search_args["since"] = since_date.strftime("%Y-%m-%d")
if (until_date): search_args["until"] = until_date.strftime("%Y-%m-%d")

searches = []
searches.append(("Search", tweepy_api.search, search_args, 1500))

for (search_type, tweepy_func, tweepy_func_args, search_cap) in searches:
done = False
query_count = 0
results_count = 0
last_max_id = None

while (keep_alive_func() and done is False and results_count < search_cap and api_hp > 0):
results = tweepy_func(**tweepy_func_args)
api_hp -= 1
if (not results):
done = True
break
else:
query_count += 1
results_count += len(results)
logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

last_id = None
for search_result in results:
if (last_max_id == search_result.id): continue
last_id = search_result.id

snark = {}
snark["user"] = "@%s" % common.asciify(search_result.from_user)
snark["msg"] = search_result.text
for (reply_ptn, reply_rep) in reply_regexes:
snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

snark["date"] = search_result.created_at

snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(search_result.from_user)
snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(search_result.from_user), search_result.id)

if (until_date and snark["date"] > until_date):
continue # This snark is too recent.

if (since_date and snark["date"] < since_date):
done = True # This snark is too early.
break

snarks.append(snark)

if (first_msg):
if (snark["msg"].find(first_msg) != -1):
done = True # Found the first comment.
break

if (last_id is not None):
# Dig deeper into the past on the next loop.
tweepy_func_args["max_id"] = last_id
last_max_id = last_id
else:
# Must've only gotten the "max_id" tweet again.
done = True
break

if (limit_reset_date is not None and datetime.utcnow() >= limit_reset_date):
rate_status, rate_parsed = tweepy_backend.rate_limit_status()
api_hp = rate_parsed["remaining_hits"]
limit_reset_date = rate_parsed["reset_time"]
logging.info("API limit has reset. Calls left: %d (Resets: %s)" % (api_hp, rate_parsed["reset_time_string"]))

if (done is False and api_hp <= 0):
logging.warning("Twitter API rate limit truncated results.")
break # No more searches.

rate_status, rate_parsed = tweepy_backend.rate_limit_status()
logging.info("Twitter API calls left: %d/%d." % (rate_parsed["remaining_hits"], rate_parsed["hourly_limit"]))
logging.info("API limits will reset: %s." % (rate_parsed["reset_time_string"]))
logging.info("Current Time: %s UTC" % datetime.utcnow().strftime(rate_parsed["new_date_format"]))

except (Exception) as err:
logging.exception("Parser failed.")
raise common.ParserError("Parser failed.")

snarks = sorted(snarks, key=lambda k: k["date"])

# Drop duplicates from multiple passes.
snarks = uniquify_list(snarks)

logging.info("Search complete: %d combined results." % len(snarks))

if (first_msg):
first_index = -1
for i in range(len(snarks)):
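For orientation, here is a minimal usage sketch (not from the commit) of how a host script might drive the reworked parser, using the option names shown in config.py above. It assumes the application has already set up the "tweepy_backend" subsystem named in required_subsystems and that lib.parsers.twitter_search is importable from the project root.

# Hypothetical driver for the reworked parser -- an illustration, not project code.
from datetime import datetime

from lib.parsers import twitter_search

options = {}
options["twitter_search.reply_name"] = "MockTM"
options["twitter_search.since_date"] = datetime(2012, 9, 13)
options["twitter_search.until_date"] = datetime(2012, 9, 15)

# src_path is unused by this parser; first_msg could hold a substring of the
# earliest comment to keep. Both are left as None here.
snarks = twitter_search.fetch_snarks(None, None, options=options)
for snark in snarks:
    print "%s  %s  %s" % (snark["date"], snark["user"], snark["msg"])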
1 change: 1 addition & 0 deletions readme.txt
@@ -53,6 +53,7 @@ Changes

3.52 - Made global fudge_time and ignore_users adjustable w/o reparsing.
Fixed glitchy volume control.
Made twitter_search parser use the Tweepy subsystem.
Changed 'message'/'msg' to 'comment' in descriptions.
3.51 - Added twitter_mentions parser.
Added threading for parsers and exporters.
1 change: 0 additions & 1 deletion share/example_config.py
@@ -86,7 +86,6 @@
parser_options["twitter_search.reply_name"] = "MockTM"
parser_options["twitter_search.since_date"] = datetime(2012, 9, 13)
parser_options["twitter_search.until_date"] = datetime(2012, 9, 15)
parser_options["twitter_search.passes"] = 1

# Exporter-specific options.
exporter_options = {}
