In [13]:
import json
import csv
import tweepy
import re
import os

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# English stop words from the NLTK corpus, held in a set for membership tests
stop_words = {word for word in stopwords.words('english')}

In [16]:
# Twitter API credentials.
# Read from the environment so that secrets never have to be typed into (and
# accidentally committed with) the notebook. Each value falls back to an
# empty string when the corresponding variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

In [17]:
def _clean_tweet_text(raw_text):
    """Reduce raw tweet text to a space-separated string of content words.

    Steps, in order: flatten newlines, tokenize, drop stop words, strip digit
    characters, strip punctuation, and drop one-character words.

    Args:
        raw_text: The tweet text as a ``str``.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    # Flatten line breaks so the tweet is treated as one line of text.
    flattened = raw_text.replace('\n', ' ')

    # Compare in lowercase: the NLTK stop-word entries are lowercase, so a
    # case-sensitive test would let capitalized forms such as "The" through.
    kept_tokens = [token for token in word_tokenize(flattened)
                   if token.lower() not in stop_words]
    joined = ' '.join(kept_tokens)

    # Remove digit characters one by one (so "abc123" becomes "abc").
    without_digits = ''.join(ch for ch in joined if not ch.isdigit())

    # Remove everything that is neither a word character nor whitespace.
    without_punct = re.sub(r'[^\w\s]', '', without_digits)

    # Drop leftovers that are a single character long.
    return ' '.join(word for word in without_punct.split() if len(word) > 1)


def search_for_phrase(consumer_key, consumer_secret, access_token, access_token_secret, phrase,
                      max_tweets=150):
    """Search Twitter for ``phrase`` and append the cleaned results to a CSV.

    Rows are appended to ``<phrase>.csv`` in the current working directory;
    a header row is written first when the file is empty. Retweets are
    excluded via the ``-filter:retweets`` query operator and only tweets
    tagged as English are requested.

    Args:
        consumer_key: Twitter API consumer key.
        consumer_secret: Twitter API consumer secret.
        access_token: Twitter API access token.
        access_token_secret: Twitter API access token secret.
        phrase: Search term; also used as the base name of the output file.
        max_tweets: Upper bound on the number of tweets fetched (default 150).
    """
    # create authentication (accessing Twitter)
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # initialize Tweepy API
    api = tweepy.API(auth)

    # Tweepy 4 renamed API.search to API.search_tweets; use whichever exists.
    search_method = getattr(api, 'search_tweets', None) or api.search

    # set filename (based on the phrase we're searching for)
    filename = phrase + '.csv'
    query = phrase + ' -filter:retweets'

    # newline='' is what the csv module expects (avoids blank rows on Windows);
    # an explicit encoding keeps the output independent of the platform default.
    with open(filename, 'a', newline='', encoding='utf-8') as file:

        writer = csv.writer(file)

        # write header row to csv if the file is empty
        if os.path.getsize(filename) == 0:
            writer.writerow(['tweet_id', 'timestamp', 'user_id', 'tweet_text'])

        # for each tweet matching our phrase, gather the relevant info
        cursor = tweepy.Cursor(search_method, q=query, lang="en", tweet_mode='extended')
        for tweet in cursor.items(max_tweets):
            writer.writerow([
                tweet.id,
                tweet.created_at.isoformat(),
                tweet.user.id,
                # Write the str itself: on Python 3 an encoded bytes object
                # would land in the file as its repr, e.g. b'...'.
                _clean_tweet_text(tweet.full_text),
            ])

In [18]:
if __name__ == '__main__':

    # Search terms to collect tweets for.
    phrases = [
        'altcoin',
        'bitcoin',
        'coindesk',
        'cryptocurrency',
        'gold',
        'aapl',
        'goog',
        'yhoo',
    ]

    # Run one search per term, reusing the same credentials each time.
    for ph in phrases:
        search_for_phrase(
            consumer_key,
            consumer_secret,
            access_token,
            access_token_secret,
            ph,
        )