In [108]:
import pandas as pd
import json
import os
import tqdm
from urllib.request import urlopen
from datetime import datetime
from bs4 import BeautifulSoup
import GetOldTweets3 
import parser

In [109]:
def getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, useragent=None, debug=False):
    url = "https://twitter.com/i/search/timeline?"

    if not tweetCriteria.topTweets:
        url += "f=tweets&"

    url += ("vertical=news&q=%s&src=typd&%s"
            "&include_available_features=1&include_entities=1&max_position=%s"
            "&reset_error_state=false")

    urlGetData = ''
    
    # url + query search, since, since, until, lang    
    urlGetData += tweetCriteria.querySearch
    urlGetData += ' since:' + tweetCriteria.since
    urlGetData += ' until:' + tweetCriteria.until
    urlLang = 'l=' + tweetCriteria.lang + '&'
    
    # url 설정
    url = url % (urllib.parse.quote(urlGetData.strip()), urlLang, urllib.parse.quote(refreshCursor))
    useragent = useragent or TweetManager.user_agents[0]

    headers = [
        ('Host', "twitter.com"),
        ('User-Agent', useragent),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "en-US,en;q=0.5"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', url),
        ('Connection', "keep-alive")
    ]

    if proxy:
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy, 'https': proxy}), urllib.request.HTTPCookieProcessor(cookieJar))
    else:
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
    opener.addheaders = headers        
    
    time.sleep(1) # HTTP Request 429 에러 방지.

    ##################### 에러 핸들링 수정 #####################
    
    # TimeOut 에러
    try:
        response = opener.open(url)
        jsonResponse = response.read()
    except TimeoutError as e:
        if debug: # 디버그 옵션
            print("에러 url 주소:", url)            
        print("Timeout error")
        print("30초 정지")
        time.sleep(30)

        # 한 번 더 시도
        try:
            response = opener.open(url)
            jsonResponse = response.read()
        except TimeoutError as e:
            print("Timeout error again. 패스.")
            pass

    # UrlParse 에러: 다시 시도 X
    except Exception as e:
        if debug: # 디버그 옵션
            print("에러 url 주소:", url)
            
        print("HTTP 요청 오류", str(e))        
        print("브라우저 오픈: https://twitter.com/search?q=%s&src=typd" % urllib.parse.quote(urlGetData))
        print("30초 정지")
        
        pass

    # Json 데이터 오류
    try:
        s_json = jsonResponse.decode()
    except:
        print("올바르지 못한 응답")
        if debug: # 디버그 옵션
            print("에러 url 주소:", url)
        pass
    else:
        try:
            dataJson = json.loads(s_json)
        except: # json 데이터 파싱 오류
            if debug: # 디버그 옵션
                print("에러 url 주소:", url)                
            print("JSON: %s" % s_json)
        pass

    return dataJson

In [110]:
dateRange = pd.date_range(start='20120601', end='20120701', freq='MS')

# setUntil : 마지막 날짜 배제되므로 주의.
def set_crawl_date(start_date, freq=1):    
    end_date = start_date + datetime.timedelta(days=freq)
    
    # timestamp to string format
    start_date = start_date.strftime("%Y-%m-%d")
    end_date = end_date.strftime("%Y-%m-%d")
    
    # check
    print("트윗 수집 날짜 설정: {0}부터 {1}까지".format(start_date, end_date)) 


In [111]:
def crawl_tweets(start_date, end_date, 박근혜, lang='ko', debug=True):    
    '''
    query: 검색할 트윗 검색어
    lang: 검색할 트윗 언어
    debug: 설정 시 에러 url 표시
    '''
    
    print("========== 트윗 수집 시작: {0} ~ {1} ==========".format(start_date, end_date))
    start_time = time.time()
    tweet_criteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                                .setSince(start_date)\
                                                .setUntil(end_date)\
                                                .setLang(lang)
    tweets = got.manager.TweetManager.getTweets(tweet_criteria, debug=debug)
    
    elapsed_time = time.time()-start_time
    
    print("수집 완료 : {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
    print("총 수집 트윗 개수 : {0}".format(len(tweets)))
    
    return tweets

In [112]:
def get_results(tweet_data):
    results = []
    for tweet in tqdm(tweet_data):
        results.append({'url': tweet.permalink,
                        'date': tweet.date,
                        'text': tweet.text,
                        'user': tweet.username,
                        'mentions': tweet.mentions,
                        'retweets': tweet.retweets,
                        'favorites': tweet.favorites,
                        'hashtags': tweet.hashtags})
    return results

In [113]:
def save_tweets(tweet_lists, base_file_dir="tweets"):
    
    if not os.path.exists(base_file_dir):
        os.makedirs(base_file_dir)
        
    with open(f"{base_file_dir}/tweets_{crawl_start}_{crawl_end}.csv", "a", -1, encoding="utf-8") as f:    
        writer = csv.writer(f)
        writer.writerow(['url', 'date', 'text', 'user', 'mentions', 'retweets', 'favorites', 'hashtags'])        
        for tweet_list in tqdm(tweet_lists):
            writer.writerow(list(tweet_list.values()))
            
    return 

In [114]:
dateRange = pd.date_range(start='20120601', end='20120701', freq='MS')

for date in dateRange:
    crawl_start, crawl_end = set_crawl_date(date) # freq 변경 가능
    tweet_results = crawl_tweets(crawl_start, crawl_end, query='SomeString')
    tweet_results_lists = get_results(tweet_results)
    save_tweets(tweet_results_lists)

AttributeError: type object 'datetime.datetime' has no attribute 'timedelta'