
Commit

made twitter_search parser use Tweepy subsystem
Vhati committed Oct 10, 2012
1 parent 92e3f7d commit 4ecb4e2
Showing 5 changed files with 111 additions and 90 deletions.
1 change: 0 additions & 1 deletion config.py
@@ -86,7 +86,6 @@
parser_options["twitter_search.reply_name"] = "MockTM"
parser_options["twitter_search.since_date"] = datetime(2012, 9, 13)
parser_options["twitter_search.until_date"] = datetime(2012, 9, 15)
parser_options["twitter_search.passes"] = 1

# Exporter-specific options.
exporter_options = {}
9 changes: 4 additions & 5 deletions lib/parsers/twitter_mentions.py
@@ -21,8 +21,8 @@

def get_description():
return ("Collects snarks from your Twitter screen name and @mentions.\n"+
"This is much more reliable than a plain search,\n"+
"but it only works for your own Twitter account.\n"+
"This is much more reliable than a plain search, "+
"but it only works for your own account.\n"+
"On first use, it will prompt for authorization.")

def get_arginfo():
@@ -38,7 +38,7 @@ def get_arginfo():
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
"""Collects snarks from your Twitter screen name and @mentions.
This is much more reliable than a plain search, but it only
works for your own Twitter account.
works for your own account.
This parser adds non-standard attributes to snarks:
"user_url" and "msg_url", links to the user's twitter
@@ -102,7 +102,6 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
results = tweepy_func(**tweepy_func_args)
api_hp -= 1
if (not results):
print "No more results"
done = True
break
else:
@@ -124,7 +123,7 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu

snark["date"] = status.created_at

snark["user_url"] = "http://www.twitter.com/%s" % status.author.screen_name
snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(status.author.screen_name)
snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(status.author.screen_name), status.id)

if (until_date and snark["date"] > until_date):
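Aside (not part of the commit): the change above runs the screen name through common.asciify before it is interpolated into user_url, matching what msg_url already did. The real helper's implementation is not shown in this diff; the following is only a hypothetical stand-in illustrating the assumed behavior under Python 2.

# Hypothetical stand-in for lib.common.asciify -- an assumption, not the
# project's actual code. The guess is that it forces unicode text down to
# a plain ASCII str, so %-formatting into byte-string URLs cannot raise
# UnicodeEncodeError on non-ASCII screen names.
def asciify(text):
    if (isinstance(text, unicode)):
        return text.encode("ascii", "ignore")
    return text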
189 changes: 106 additions & 83 deletions lib/parsers/twitter_search.py
@@ -10,19 +10,22 @@
from lib import arginfo
from lib import common
from lib import global_config
from lib.subsystems import tweepy_backend


# Namespace for options.
ns = "twitter_search."

# Names of lib.subsystem modules that should be set up in advance.
required_subsystems = []
required_subsystems = ["tweepy_backend"]


def get_description():
return ("Collects snarks from a Twitter search.\n"+
"Finds tweets from an account and any @reply mentions of it.\n\n"+
"Note: Twitter's search API only reaches back a few days.")
"Finds tweets from any account and @reply mentions of it.\n\n"+
"Note: Twitter's search API only reaches back a few days "+
"and may be incomplete.\n"+
"On first use, it will prompt for authorization.")

def get_arginfo():
args = []
@@ -35,28 +38,20 @@ def get_arginfo():
args.append(arginfo.Arg(name="until_date", type=arginfo.DATETIME,
required=False, default=None, choices=None, multiple=False,
description="UTC date to limit dredging up new tweets."))
args.append(arginfo.Arg(name="passes", type=arginfo.INTEGER,
required=False, default=1, choices=None, multiple=False,
description="Search X times to fill omissions in results."))
return args

def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
"""Collects snarks from a Twitter search. Finds
tweets from an account and any @reply mentions of it.
tweets from any account and @reply mentions of it.
See: https://dev.twitter.com/docs/api/1/get/search
This parser adds non-standard attributes to snarks:
"user_url" and "msg_url", links to the user's twitter
page and to the specific tweet. Exporters might
disregard this info.
Any given search may be incomplete, but this parser can
make multiple passes to mitigate that. It's recommended
that you initially search and export to a temporary
pickled_snarks file; then parse THAT repeatedly to apply
adjustments and export to the final desired format.
Twitter's search API only reaches back a few days. :/
Twitter's search API only reaches back a few days
and may be incomplete. :/
:param src_path: Not used.
:param first_msg: If not None, ignore comments prior to one containing this substring.
@@ -67,8 +62,6 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
UTC Datetime to limit dredging up old tweets.
until_date (optional):
UTC Datetime to limit dredging up new tweets.
passes (optional):
Search X times to fill omissions in results.
:param keep_alive_func: Optional replacement to get an abort boolean.
:param sleep_func: Optional replacement to sleep N seconds.
:return: A List of snark dicts.
@@ -77,8 +70,6 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
if (sleep_func is None): sleep_func = global_config.nap

search_url = "http://search.twitter.com/search.json"

since_date = None
if (ns+"since_date" in options and options[ns+"since_date"]):
since_date = options[ns+"since_date"]
@@ -87,83 +78,115 @@ def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_fu
if (ns+"until_date" in options and options[ns+"until_date"]):
until_date = options[ns+"until_date"]

passes_remaining = 0
if (ns+"passes" in options and options[ns+"passes"] > 1):
passes_remaining = options[ns+"passes"] - 1

missing_options = [o for o in ["reply_name"] if ((ns+o) not in options or not options[ns+o])]
if (len(missing_options) > 0):
logging.error("Required parser options weren't provided: %s." % ", ".join(missing_options))
raise common.ParserError("Parser failed.")

# List of pattern/replacement tuples to strip reply topic from comments.
reply_name_escaped = re.escape(options[ns+"reply_name"])
reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
(re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

query = urllib2.quote("@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"]))
if (since_date): query += urllib2.quote(" since:%s" % since_date.strftime("%Y-%m-%d"))
if (until_date): query += urllib2.quote(" until:%s" % until_date.strftime("%Y-%m-%d"))
original_url = "%s?q=%s&rpp=100&page=1&result_type=recent" % (search_url, query)
url = original_url
logging.debug("Url: %s" % url)

snarks = []
pass_result_count = 0

while (keep_alive_func() and url is not None):
notice_url = re.sub("[^?]+/search.json.*[?&](page=[0-9]+).*", "\g<1>", url)
logging.info("Parsing: %s" % notice_url)
sleep_func(1)

json_obj = None
try:
with contextlib.closing(urllib2.urlopen(url)) as server_response:
json_obj = json.loads(server_response.read())
except (urllib2.HTTPError) as err:
logging.error("Http status: %d" % err.code)
raise common.ParserError("Parser failed.")
except (urllib2.URLError) as err:
logging.error(str(err))
raise common.ParserError("Parser failed.")

if (not json_obj or "results" not in json_obj):
logging.error("Failed to parse search results.")
raise common.ParserError("Parser failed.")

pass_result_count += len(json_obj["results"])
for result in json_obj["results"]:
snark = {}
snark["user"] = "@%s" % result["from_user"]
snark["msg"] = result["text"]
for (reply_ptn, reply_rep) in reply_regexes:
snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

snark["date"] = datetime.strptime(result["created_at"] +" UTC", "%a, %d %b %Y %H:%M:%S +0000 %Z")

snark["user_url"] = "http://www.twitter.com/%s" % result["from_user"]
snark["msg_url"] = "http://twitter.com/#!/%s/status/%s" % (result["from_user"], result["id"])

snarks.append(snark)

if ("next_page" in json_obj):
url = search_url + json_obj["next_page"]
elif (passes_remaining > 0):
logging.info("Pass complete: %d results." % pass_result_count)
passes_remaining -= 1
url = original_url
pass_result_count = 0
else:
url = None

tweepy = tweepy_backend.get_tweepy()
tweepy_api = tweepy_backend.get_api()

try:
# List of pattern/replacement tuples to strip reply topic from comments.
reply_name_escaped = re.escape(options[ns+"reply_name"])
reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
(re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

rate_status, rate_parsed = tweepy_backend.rate_limit_status()
api_hp = rate_parsed["remaining_hits"]
limit_reset_date = rate_parsed["reset_time"]

search_args = {"rpp":100, "include_entities":"false", "result_type":"recent"}
search_args["q"] = "@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"])
if (since_date): search_args["since"] = since_date.strftime("%Y-%m-%d")
if (until_date): search_args["until"] = until_date.strftime("%Y-%m-%d")

searches = []
searches.append(("Search", tweepy_api.search, search_args, 1500))

for (search_type, tweepy_func, tweepy_func_args, search_cap) in searches:
done = False
query_count = 0
results_count = 0
last_max_id = None

while (keep_alive_func() and done is False and results_count < search_cap and api_hp > 0):
results = tweepy_func(**tweepy_func_args)
api_hp -= 1
if (not results):
done = True
break
else:
query_count += 1
results_count += len(results)
logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

last_id = None
for search_result in results:
if (last_max_id == search_result.id): continue
last_id = search_result.id

snark = {}
snark["user"] = "@%s" % common.asciify(search_result.from_user)
snark["msg"] = search_result.text
for (reply_ptn, reply_rep) in reply_regexes:
snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

snark["date"] = search_result.created_at

snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(search_result.from_user)
snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(search_result.from_user), search_result.id)

if (until_date and snark["date"] > until_date):
continue # This snark is too recent.

if (since_date and snark["date"] < since_date):
done = True # This snark is too early.
break

snarks.append(snark)

if (first_msg):
if (snark["msg"].find(first_msg) != -1):
done = True # Found the first comment.
break

if (last_id is not None):
# Dig deeper into the past on the next loop.
tweepy_func_args["max_id"] = last_id
last_max_id = last_id
else:
# Must've only gotten the "max_id" tweet again.
done = True
break

if (limit_reset_date is not None and datetime.utcnow() >= limit_reset_date):
rate_status, rate_parsed = tweepy_backend.rate_limit_status()
api_hp = rate_parsed["remaining_hits"]
limit_reset_date = rate_parsed["reset_time"]
logging.info("API limit has reset. Calls left: %d (Resets: %s)" % (api_hp, rate_parsed["reset_time_string"]))

if (done is False and api_hp <= 0):
logging.warning("Twitter API rate limit truncated results.")
break # No more searches.

rate_status, rate_parsed = tweepy_backend.rate_limit_status()
logging.info("Twitter API calls left: %d/%d." % (rate_parsed["remaining_hits"], rate_parsed["hourly_limit"]))
logging.info("API limits will reset: %s." % (rate_parsed["reset_time_string"]))
logging.info("Current Time: %s UTC" % datetime.utcnow().strftime(rate_parsed["new_date_format"]))

except (Exception) as err:
logging.exception("Parser failed.")
raise common.ParserError("Parser failed.")

snarks = sorted(snarks, key=lambda k: k["date"])

# Drop duplicates from multiple passes.
snarks = uniquify_list(snarks)

logging.info("Search complete: %d combined results." % len(snarks))

if (first_msg):
first_index = -1
for i in range(len(snarks)):
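For orientation, here is a minimal usage sketch (not from the commit) of how a host script might drive the reworked parser, using the option names shown in config.py above. It assumes the application has already set up the "tweepy_backend" subsystem named in required_subsystems and that lib.parsers.twitter_search is importable from the project root.

# Hypothetical driver for the reworked parser -- an illustration, not project code.
from datetime import datetime

from lib.parsers import twitter_search

options = {}
options["twitter_search.reply_name"] = "MockTM"
options["twitter_search.since_date"] = datetime(2012, 9, 13)
options["twitter_search.until_date"] = datetime(2012, 9, 15)

# src_path is unused by this parser; first_msg could hold a substring of the
# earliest comment to keep. Both are left as None here.
snarks = twitter_search.fetch_snarks(None, None, options=options)
for snark in snarks:
    print "%s  %s  %s" % (snark["date"], snark["user"], snark["msg"])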
1 change: 1 addition & 0 deletions readme.txt
@@ -53,6 +53,7 @@ Changes

3.52 - Made global fudge_time and ignore_users adjustable w/o reparsing.
Fixed glitchy volume control.
Made twitter_search parser use the Tweepy subsystem.
Changed 'message'/'msg' to 'comment' in descriptions.
3.51 - Added twitter_mentions parser.
Added threading for parsers and exporters.
1 change: 0 additions & 1 deletion share/example_config.py
@@ -86,7 +86,6 @@
parser_options["twitter_search.reply_name"] = "MockTM"
parser_options["twitter_search.since_date"] = datetime(2012, 9, 13)
parser_options["twitter_search.until_date"] = datetime(2012, 9, 15)
parser_options["twitter_search.passes"] = 1

# Exporter-specific options.
exporter_options = {}
