In [1]:
import tweepy
import pandas as pd
import os
import datetime
import json
import boto3
import io
import time

In [3]:
def upload_raw_data(key,data,s3_client,keyword,date,bucket='santal'):
    """Serialize records as JSON Lines and store them as one S3 object.

    Each element of ``data`` is written as a JSON document on its own line.
    When ``key`` is ``'tweets'`` every record is tagged with the search
    ``keyword``; the tag is applied to a copy so the caller's dictionaries
    are not modified.

    Parameters
    ----------
    key : str
        Record category; used as the first path segment of the object key.
    data : iterable of dict
        JSON-serializable records to upload.
    s3_client : object
        Client exposing ``put_object(Bucket=..., Key=..., Body=...)``.
    keyword : str
        Search keyword; used as the second path segment of the object key.
    date : datetime.datetime or datetime.date
        Day the data belongs to; rendered as ``YYYY-MM-DD`` in the object key.
    bucket : str, optional
        Destination bucket name. Defaults to ``'santal'``.

    Returns
    -------
    The value returned by ``s3_client.put_object``.
    """
    tag_keyword = key == 'tweets'

    lines = []
    for item in data:
        # Build a tagged copy instead of writing into the caller's dict.
        record = {**item, 'keyword': keyword} if tag_keyword else item
        lines.append(json.dumps(record) + '\n')

    # datetime objects are reduced to their calendar day; plain dates pass through.
    day = date.date() if isinstance(date, datetime.datetime) else date

    return s3_client.put_object(
        Bucket=bucket,
        Key=f'{key}/{keyword}/{day}.json',
        Body=''.join(lines),
    )

In [4]:
def search_and_upload(keyword,tweepy_client,s3_client,max_page = 100,upload = False):
    """Page through recent-search results for ``keyword`` from the last day.

    Tweets, expanded users and expanded places are accumulated across pages
    (up to 100 results per page). The page number is printed after every page
    that has a successor, and the number of collected tweets is printed once
    the last page has been reached.

    Parameters
    ----------
    keyword : str
        Search term; retweets and replies are excluded by the query.
    tweepy_client : object
        Client exposing ``search_recent_tweets``. Responses are read as
        dictionaries, so the client is expected to be built with
        ``return_type=dict``.
    s3_client : object
        S3 client forwarded to ``upload_raw_data``; only used when
        ``upload`` is true.
    max_page : int, optional
        Maximum number of pages to request. Defaults to 100.
    upload : bool, optional
        When true, every non-empty collection is written to S3 through
        ``upload_raw_data``. Defaults to ``False`` (search only).

    Returns
    -------
    dict or None
        The response of the last page requested, or ``None`` when no request
        was made (``max_page <= 0``).
    """
    tweet_fields = ['created_at','public_metrics','possibly_sensitive','text','id','geo']
    expansions = ['author_id','geo.place_id']
    place_fields = ['id','country','full_name','contained_within','place_type','geo']
    user_fields = ['description','id','location','name','verified','verified_type']

    date = datetime.datetime.today() - datetime.timedelta(days = 1)
    query = f"{keyword} -is:retweet -is:reply lang:EN"

    tweets = []
    users = []
    places = []

    pages = 0
    next_token = None
    response = None  # stays None if the loop body never runs

    while pages < max_page:
        response = tweepy_client.search_recent_tweets(query=query,start_time = date,
                                 next_token = next_token,
                                 tweet_fields = tweet_fields,
                                 place_fields = place_fields,
                                 user_fields = user_fields,
                                 expansions=expansions,
                                 max_results=100)

        # 'data' and 'includes' (and the keys inside it) are missing when a
        # page has nothing to report, so read them defensively.
        tweets += response.get('data') or []
        includes = response.get('includes') or {}
        users += includes.get('users') or []
        places += includes.get('places') or []

        pages += 1
        next_token = (response.get('meta') or {}).get('next_token')
        if next_token is None:
            # Last page: report how many tweets were actually collected.
            print(len(tweets))
            break
        print(pages)

        # Pause between requests only when another page will be fetched.
        if pages < max_page:
            time.sleep(3)

    if upload:
        for key, records in (('tweets', tweets), ('users', users), ('places', places)):
            if records:  # skip empty collections rather than writing empty objects
                upload_raw_data(key,records,s3_client,keyword,date)

    return response

In [7]:
# Twitter API client: bearer token comes from the environment and
# responses are requested as plain dictionaries.
bearer_token = os.environ['TWEEPY_BEARER_TOKEN']
tweepy_client = tweepy.Client(
    bearer_token=bearer_token,
    return_type=dict,
    wait_on_rate_limit=True,
)

# S3 client: access key and secret are read from the environment.
aws_key = os.environ['AWS_ACCESS_KEY']
aws_secret = os.environ['AWS_ACCESS_SECRET']
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret,
)

# Single-page trial run for the keyword 'test'; the returned response is
# the cell's displayed value.
search_and_upload('test',tweepy_client,s3_client,max_page=1)

1


{'data': [{'text': 'Bernie Sanders: Nikki Haley’s demand for mental tests is ageist and ‘absurd’ Bernie is afraid he can’t pass the test along with many other liberal dems with mental disorders!',
   'author_id': '1306583233279164416',
   'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0,
    'impression_count': 0},
   'id': '1627490375492603904',
   'edit_history_tweet_ids': ['1627490375492603904'],
   'possibly_sensitive': False,
   'created_at': '2023-02-20T02:08:37.000Z'},
  {'text': '✰ math commissions,algebra genmath,integral calculus ,geometry ,trigonometry ,statistics , proving , solving , timed exam , assignment , seatwork , activities , test , quizzes, science , physics,chemistry , biology, earth science, rush lf client',
   'author_id': '1494191459641475074',
   'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0,
    'impression_count': 0},
   'id': '1627490366126714880',