In [1]:
#importing the required libraries
import os
import time
from ast import literal_eval
from datetime import datetime
import pandas as pd
import snscrape.modules.twitter as sntwitter # the magic
import warnings
warnings.filterwarnings('ignore')

In [2]:
#setting up the directory
#root_dir = os.path.abspath(os.path.join(os.path.dirname("."), '.'))
#output: /Users/sbp

In [3]:
#Extractor Module
class ExtractTweets:
    """
    Day-by-day keyword scraper for historical tweets, built on snscrape's
    ``TwitterSearchScraper`` (referenced through the module alias ``sntwitter``).

    Scraped rows accumulate in ``self.tweets_df`` and are periodically written to a
    ``save_<date>_local.csv`` checkpoint in the current working directory.
    """

    # Column layout shared by the in-memory frame and the CSV checkpoints.
    # ('hastags' is spelled this way on purpose: existing CSVs use that header.)
    _COLUMNS = ['date', 'tweet', 'lang', 'retweetCount', 'likeCount', 'replyCount',
                'username', 'user_followersCount', 'user_friendsCount', 'verifiedStatus',
                'tweet_url', 'hastags', 'chr_count', 'topic']

    def __init__(self,
                 minTweetCountPerDay=10,
                 minRetweetCount=0,
                 minLikeCount=0,
                 minFollowersCount=0,
                 VerifiedStatus=None,
                 saveBufferDuration=3600):
        """
        Store the scraping thresholds and create the empty result frame.

        :params:
        minTweetCountPerDay - int; despite the name this is an upper bound: at most this many
            search results are inspected per (query, day) pair
        minRetweetCount - int threshold compared against tweet.retweetCount
        minLikeCount - int threshold compared against tweet.likeCount
        minFollowersCount - int threshold compared against tweet.user.followersCount
        VerifiedStatus - bool or None, compared against tweet.user.verified
        saveBufferDuration - seconds between two intermediate checkpoint saves
        """
        # Wall-clock start (second precision), used only for the final "TOTAL TIME TAKEN" report.
        self.start_timer = datetime.now().replace(microsecond=0)
        self.min_tweet_count_perDay = minTweetCountPerDay
        self.minRetweetCount = minRetweetCount
        self.minLikeCount = minLikeCount
        self.minFollowersCount = minFollowersCount
        self.VerifiedStatus = VerifiedStatus
        self.tweets_df = pd.DataFrame(columns=self._COLUMNS)
        self.save_buffer_duration = saveBufferDuration
        return

    def save_copy(self):
        """
        Write the tweets collected so far to ``./save_<latest date>_local.csv``.

        The new checkpoint is written first; only afterwards are older
        ``save_*_local.csv`` files in the current directory removed, so a failed
        write never destroys the previous checkpoint and unrelated CSV files are
        left alone.

        :return:
        pandas dataframe (fresh 0..n-1 index) with 'date' formatted as 'YYYY-MM-DD' strings.
        """
        data = self.tweets_df.reset_index(drop=True)
        data['date'] = data['date'].apply(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d'))
        checkpoint = "save_{}_local.csv".format(data['date'].max())
        data.to_csv(os.path.join(".", checkpoint), index=False)
        for filename in os.listdir("."):
            is_old_checkpoint = filename.startswith("save_") and filename.endswith("_local.csv")
            if is_old_checkpoint and filename != checkpoint:
                os.remove(filename)
        return data

    def _passes_filters(self, tweet):
        """Return True when the tweet satisfies at least one of the configured thresholds."""
        # NOTE(review): the criteria are OR-ed, so with a threshold of 0 every tweet passes and a
        # tweet only needs to meet ONE threshold. If all thresholds were meant to apply together
        # this should be an AND (with VerifiedStatus=None meaning "don't care") - confirm intent
        # before changing, since previously collected datasets rely on the current behaviour.
        return (tweet.likeCount >= self.minLikeCount
                or tweet.retweetCount >= self.minRetweetCount
                or tweet.user.followersCount >= self.minFollowersCount
                or (tweet.user.verified
                    and isinstance(tweet.user.verified, bool)
                    and tweet.user.verified == self.VerifiedStatus))

    @staticmethod
    def _tweet_to_row(tweet, topic):
        """Flatten one scraped tweet into a list ordered like ``_COLUMNS``."""
        return [
            tweet.date,
            tweet.content,
            tweet.lang,
            tweet.retweetCount,
            tweet.likeCount,
            tweet.replyCount,
            tweet.user.username,
            tweet.user.followersCount,
            tweet.user.friendsCount,
            tweet.user.verified,
            tweet.url,
            tweet.hashtags,
            len(str(tweet.content).strip()),
            topic]

    def _store_rows(self, rows):
        """Append one day's rows to ``self.tweets_df`` (no-op when nothing was kept)."""
        if not rows:
            return
        day_df = pd.DataFrame(rows, columns=self._COLUMNS)
        if self.tweets_df.empty:
            # Avoid concatenating with an empty frame (deprecated dtype-inference path in pandas).
            self.tweets_df = day_df
        else:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
            self.tweets_df = pd.concat([self.tweets_df, day_df])

    def getTweets(self, start_date, end_date, keywords):
        """
        Extracts historical twitter data, one search per topic per day.

        :params:
        start_date - str in "YYYY-MM-DD" format (inclusive)
        end_date - str in "YYYY-MM-DD" format (inclusive); if it is earlier than
            start_date nothing is scraped
        keywords - list (or tuple) of topics; each topic is either a str or a
            tuple/list of str that is joined with spaces into one search query,
            e.g. [('recession',), ('football', 'worldcup', 'fifa'), ('war', 'ukraine')]
            e.g. ['recession']

        :return:
        pandas dataframe (as returned by save_copy) with features as:
         date: tweet date formatted as 'YYYY-MM-DD'
         tweet: tweet content
         lang: tweet.lang as reported by the scraper
         retweetCount: tweet retweeted count
         likeCount: tweet like count
         replyCount: number of replies to original tweet
         username: user who tweeted
         user_followersCount: number of followers user has
         user_friendsCount: number of friends user has
         verifiedStatus: tweet.user.verified as reported by the scraper
         tweet_url: link of original tweet
         hastags: tweet.hashtags as reported by the scraper
         chr_count: length of the tweet content after stripping surrounding whitespace
         topic: search query (joined keywords) that produced the tweet

        :raises:
        TypeError if keywords is neither a list nor a tuple.
        """

        if not isinstance(keywords, (list, tuple)):
            raise TypeError("Incorrect Input Format! Please pass a list")

        one_day = pd.to_timedelta(1, unit='d')
        first_day = pd.to_datetime(start_date, format='%Y-%m-%d')
        # end_date is inclusive, so iteration stops at the day after it.
        stop_day = pd.to_datetime(end_date, format='%Y-%m-%d') + one_day

        for topic in keywords:
            # Monotonic clock for the checkpoint buffer: unaffected by wall-clock changes and,
            # unlike timedelta.seconds, it does not wrap around after 24 hours.
            last_save = time.monotonic()
            if isinstance(topic, (tuple, list)):
                topic = " ".join(topic)
            search_query = topic
            print("search_query:", search_query)

            date = first_day
            # '<' (not '!=') so a reversed date range ends immediately instead of looping forever.
            while date < stop_day:
                nxt_date = date + one_day
                content = '{} since:{} until:{}'.format(search_query, date.strftime('%Y-%m-%d'), nxt_date.strftime('%Y-%m-%d'))
                print(content)

                # check for save buffer duration (set to 1 Hr by default)
                if time.monotonic() - last_save >= self.save_buffer_duration:
                    self.save_copy()
                    # reset buffer
                    last_save = time.monotonic()

                lst_tweets = []
                for counter, tweet in enumerate(sntwitter.TwitterSearchScraper(content).get_items()):
                    # The cap counts inspected results, not accepted ones.
                    if counter + 1 > self.min_tweet_count_perDay:
                        break
                    if self._passes_filters(tweet):

                        # ----------------------------------------------------------------
                        # Potential custom preprocessing module here:
                        # 1. Simple and short: https://www.kaggle.com/code/zenbird01/pranjalpathak-semantic-clustering-v1-0/notebook
                        # 2. Advanced: ./NLP_basics_preprocessing_vectorization_similarity.ipynb
                        # 3. Best: Check github - https://github.com/pranzell/NLP_Tools
                        # ----------------------------------------------------------------

                        lst_tweets.append(self._tweet_to_row(tweet, topic))

                self._store_rows(lst_tweets)
                date = nxt_date

        # total_seconds() keeps whole days; .seconds would silently drop them.
        elapsed = (datetime.now().replace(microsecond=0) - self.start_timer).total_seconds()
        print("\n\nTOTAL TIME TAKEN {} minutes".format(elapsed / 60.0))
        return self.save_copy()


    def preprocess_shortText(self, text_col):
        """Placeholder for text preprocessing; not implemented (currently returns None)."""
        # refer to Preprocessing ipynb file
        # https://github.com/pranzell/NLP_Tools
        pass

In [21]:
# Configuration: constructor arguments for ExtractTweets and the scrape window.
minTweetCountPerDay = 50     # cap on search results inspected per query per day
minRetweetCount = 100        # retweet-count threshold
minLikeCount = 100           # like-count threshold
minFollowersCount = 200      # follower-count threshold
VerifiedStatus = None        # bool or None
saveBufferDuration = 3600    # seconds between intermediate checkpoint saves

# Scrape window, both in "YYYY-MM-DD" format.
start_date = "2022-06-01"
end_date = "2023-01-31"

# Keyword presets: a list whose items are either a single str or a tuple of str
# (see the docstring of `getTweets()`).
#keywords = [('veganism'), ('vegan','govegan','veganfortheanimals','animalsrights','veganism')]
#keywords = [('non veganism'), ('antivegan','exvegan','yestomeat','meatlover','antiveganism')]


In [22]:
#Execution
# `keywords` must be defined before the run: it was previously only present in
# commented-out lines, so a fresh kernel failed with NameError. The value below
# matches the queries shown in this cell's recorded output.
keywords = ['non veganism', ('antivegan', 'exvegan', 'yestomeat', 'meatlover', 'antiveganism')]

et = ExtractTweets(minTweetCountPerDay, minRetweetCount, minLikeCount, minFollowersCount, VerifiedStatus, saveBufferDuration)
twitter_data = et.getTweets(start_date, end_date, keywords)

search_query: non veganism
non veganism since:2022-06-01 until:2022-06-02
non veganism since:2022-06-02 until:2022-06-03
non veganism since:2022-06-03 until:2022-06-04
non veganism since:2022-06-04 until:2022-06-05
non veganism since:2022-06-05 until:2022-06-06
non veganism since:2022-06-06 until:2022-06-07
non veganism since:2022-06-07 until:2022-06-08
non veganism since:2022-06-08 until:2022-06-09
non veganism since:2022-06-09 until:2022-06-10
non veganism since:2022-06-10 until:2022-06-11
non veganism since:2022-06-11 until:2022-06-12
non veganism since:2022-06-12 until:2022-06-13
non veganism since:2022-06-13 until:2022-06-14
non veganism since:2022-06-14 until:2022-06-15
non veganism since:2022-06-15 until:2022-06-16
non veganism since:2022-06-16 until:2022-06-17
non veganism since:2022-06-17 until:2022-06-18
non veganism since:2022-06-18 until:2022-06-19
non veganism since:2022-06-19 until:2022-06-20
non veganism since:2022-06-20 until:2022-06-21
non veganism since:2022-06-21 unt

non veganism since:2022-11-22 until:2022-11-23
non veganism since:2022-11-23 until:2022-11-24
non veganism since:2022-11-24 until:2022-11-25
non veganism since:2022-11-25 until:2022-11-26
non veganism since:2022-11-26 until:2022-11-27
non veganism since:2022-11-27 until:2022-11-28
non veganism since:2022-11-28 until:2022-11-29
non veganism since:2022-11-29 until:2022-11-30
non veganism since:2022-11-30 until:2022-12-01
non veganism since:2022-12-01 until:2022-12-02
non veganism since:2022-12-02 until:2022-12-03
non veganism since:2022-12-03 until:2022-12-04
non veganism since:2022-12-04 until:2022-12-05
non veganism since:2022-12-05 until:2022-12-06
non veganism since:2022-12-06 until:2022-12-07
non veganism since:2022-12-07 until:2022-12-08
non veganism since:2022-12-08 until:2022-12-09
non veganism since:2022-12-09 until:2022-12-10
non veganism since:2022-12-10 until:2022-12-11
non veganism since:2022-12-11 until:2022-12-12
non veganism since:2022-12-12 until:2022-12-13
non veganism 

antivegan exvegan yestomeat meatlover antiveganism since:2022-07-28 until:2022-07-29
antivegan exvegan yestomeat meatlover antiveganism since:2022-07-29 until:2022-07-30
antivegan exvegan yestomeat meatlover antiveganism since:2022-07-30 until:2022-07-31
antivegan exvegan yestomeat meatlover antiveganism since:2022-07-31 until:2022-08-01
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-01 until:2022-08-02
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-02 until:2022-08-03
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-03 until:2022-08-04
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-04 until:2022-08-05
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-05 until:2022-08-06
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-06 until:2022-08-07
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-07 until:2022-08-08
antivegan exvegan yestomeat meatlover antiveganism since:2022-08-

antivegan exvegan yestomeat meatlover antiveganism since:2022-11-02 until:2022-11-03
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-03 until:2022-11-04
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-04 until:2022-11-05
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-05 until:2022-11-06
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-06 until:2022-11-07
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-07 until:2022-11-08
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-08 until:2022-11-09
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-09 until:2022-11-10
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-10 until:2022-11-11
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-11 until:2022-11-12
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-12 until:2022-11-13
antivegan exvegan yestomeat meatlover antiveganism since:2022-11-

In [23]:
# Quick sanity check of the scrape result: (rows, columns), then the first three rows.
n_rows, n_cols = twitter_data.shape
print((n_rows, n_cols))
twitter_data.head(3)

(711, 14)


Unnamed: 0,date,tweet,lang,retweetCount,likeCount,replyCount,username,user_followersCount,user_friendsCount,verifiedStatus,tweet_url,hastags,chr_count,topic
0,2022-06-01,@TheFun96593011 @S_Catsgotmyback @Son_of_Space...,en,0,2,0,ukwondering,2203,2013,False,https://twitter.com/ukwondering/status/1532067...,,430,non veganism
1,2022-06-01,@TheFun96593011 @S_Catsgotmyback @Son_of_Space...,en,0,1,2,ukwondering,2203,2013,False,https://twitter.com/ukwondering/status/1532067...,,340,non veganism
2,2022-06-01,@TheFun96593011 @S_Catsgotmyback @Son_of_Space...,en,0,10,1,ukwondering,2203,2013,False,https://twitter.com/ukwondering/status/1532066...,,354,non veganism


In [24]:
# Export the final dataset to the current user's Downloads folder. The path is derived
# from the home directory instead of being hard-coded to one machine's user account.
export_path = os.path.join(os.path.expanduser("~"), "Downloads", "vegan1_sanchana.csv")
os.makedirs(os.path.dirname(export_path), exist_ok=True)
twitter_data.to_csv(export_path, index=False)

In [None]:
def read_copy(path=""):
    """
    Load a checkpoint CSV (file name ending in 'local.csv') from a directory.

    :params:
    path - directory to search; an empty string means the current working directory

    :return:
    pandas dataframe with the 'hastags' column parsed back from its string form
    into Python objects (missing values become None), or None when the directory
    contains no matching file. If several files match, the one whose name sorts
    last is used.
    """
    # os.listdir("") raises FileNotFoundError, so map the empty default to ".".
    directory = path or "."
    checkpoints = sorted(f for f in os.listdir(directory) if f.endswith('local.csv'))
    if not checkpoints:
        return None

    # Join with the directory: a bare file name only resolves when `path` is the cwd.
    df = pd.read_csv(os.path.join(directory, checkpoints[-1]), lineterminator='\n')

    missing_markers = {'none', 'nan', 'np.nan', 'null', ''}
    # literal_eval only evaluates Python literals (e.g. "['a', 'b']" -> ['a', 'b']).
    df['hastags'] = df['hastags'].apply(
        lambda x: None if str(x).strip().lower() in missing_markers else literal_eval(x))
    return df