THIS NOTEBOOK IS THE MAIN PIPELINE: IT EXTRACTS STREAMING DATA FROM TWITTER, PRE-PROCESSES IT, AND STORES IT IN MYSQL.

In [13]:
#SETTING UP

import credentials              #IMPORTING CREDENTIALS.PY - IMPORTS API/ACCESS_TOKEN KEYS
import settings                 #IMPORT SETTINGS
import re
import tweepy
import pandas as pd
from textblob import TextBlob
import mysql.connector
import emoji

In [14]:
#AUTHENTICATING TWITTER API
# Twitter API v2 client, built from the bearer token plus the OAuth 1.0a user keys.
client = tweepy.Client(
    bearer_token=credentials.BEARER_TOKEN,
    consumer_key=credentials.API_KEY,
    consumer_secret=credentials.API_SECRET_KEY,
    access_token=credentials.ACCESS_TOKEN,
    access_token_secret=credentials.ACCESS_TOKEN_SECRET,
)
# Twitter API v1.1 handle; it is only used below to verify the credentials.
auth = tweepy.OAuth1UserHandler(
    consumer_key=credentials.API_KEY,
    consumer_secret=credentials.API_SECRET_KEY,
    access_token=credentials.ACCESS_TOKEN,
    access_token_secret=credentials.ACCESS_TOKEN_SECRET,
)
api = tweepy.API(auth, wait_on_rate_limit=True)

#VERIFIES CREDENTIALS FOR AUTHENTICATION
# Catch Exception (not a bare except) so Ctrl-C / kernel interrupts still propagate,
# and show the reason so a failure can actually be diagnosed.
try:
    api.verify_credentials()
except Exception as exc:
    print('Failed Authentication')
    print(f'Reason: {exc!r}')
else:
    print('Succesfull Authentication')

Succesfull Authentication


In [15]:
# STORE DATA IN MYSQL, CONNECTING TO SQLDATABASE
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd=credentials.MYSQLPASSWORD,
    database="twitterdb",
    auth_plugin='mysql_native_password',
    charset = 'utf8'
)

if mydb.is_connected():
    # CHECK TO SEE IF TABLE EXISTS. IF NOT, CREATES ONE.
    mycursor = mydb.cursor()
    try:
        # Restrict the lookup to the schema this connection is using; without the
        # table_schema filter a same-named table in another database changes the
        # count and sends us into a CREATE TABLE that fails.
        # The table name is bound as a parameter instead of being formatted in.
        mycursor.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE()
              AND table_name = %s
            """,
            (settings.TABLE_NAME,),
        )
        table_missing = mycursor.fetchone()[0] == 0
        if table_missing:
            # Identifiers and column definitions cannot be bound as parameters;
            # both values come from the local settings module.
            mycursor.execute(
                "CREATE TABLE {} ({})".format(
                    settings.TABLE_NAME, settings.TABLE_ATTRIBUTES
                )
            )
            mydb.commit()
    finally:
        # Release the cursor even if one of the statements above raised.
        mycursor.close()

In [80]:
#FUNCTIONS

#PRE-PROCESSING
def clean_tweet(tweet, max_length=255):
    """Strip noise from a tweet's text before sentiment scoring and storage.

    Steps, in order: newlines become spaces, URLs (``http://``, ``https://``
    and bare ``www.``) are removed, ``@mentions`` are removed, emojis are
    removed, surrounding whitespace is stripped, and the result is truncated.

    Args:
        tweet: Raw tweet text.
        max_length: Maximum number of characters to keep. Defaults to 255,
            the length this notebook stores per tweet.

    Returns:
        The cleaned text, at most ``max_length`` characters long.
    """
    # REMOVE "\n" IN TWEETS SO THE TEXT IS A SINGLE LINE
    tweet = tweet.replace('\n', ' ')
    # REMOVE LINKS (both http:// and https://; the scheme separator is required
    # so ordinary words that merely start with "http" are left alone)
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'www\.\S+', '', tweet)
    # REMOVE MENTIONS (plus one trailing whitespace character, if any)
    tweet = re.sub(r'@\S+\s?', '', tweet)
    # REMOVE EMOJIS (replace_emoji already returns a string)
    tweet = emoji.replace_emoji(tweet, replace='')

    # TWEETS CAN BE UP TO 4,000 CHARACTERS FOR SOME ACCOUNTS; ONLY max_length ARE KEPT.
    return tweet.strip()[:max_length]


#SQL PUSH TO TABLES, FUNCTION INSERTS DATA INTO TABLE USING DATA IN THE 'STRUCT' DICTIONARY
def push_results_to_tables(table_name, struct, conn):
    """Insert one row into ``table_name`` from the ``struct`` dictionary.

    Args:
        table_name: Name of the target table. It is interpolated into the
            statement (identifiers cannot be bound), so it must come from a
            trusted source.
        struct: Mapping of column name -> value for the row to insert.
        conn: Open DB-API connection whose driver uses ``%s`` placeholders
            (as mysql.connector does).

    Raises:
        ValueError: If ``struct`` is empty.
    """
    if not struct:
        raise ValueError("struct must contain at least one column/value pair")

    columns = ', '.join(map(str, struct))
    placeholders = ', '.join(['%s'] * len(struct))
    insert_SQL = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"

    cursor = conn.cursor()
    try:
        # Values are bound by the driver, so quotes in tweet text can neither
        # break the statement nor inject SQL.
        cursor.execute(insert_SQL, tuple(struct.values()))
        # commit() lives on the connection, not on the cursor.
        conn.commit()
    finally:
        cursor.close()

In [66]:
class MyStream(tweepy.StreamingClient):
    """Streaming client that cleans, sentiment-scores and stores matching tweets.

    Each accepted tweet is cleaned with ``clean_tweet``, scored with TextBlob,
    printed, and inserted into ``settings.TABLE_NAME`` through the module-level
    ``mydb`` connection.
    """

    # DISPLAYS "CONNECTED" ONCE STREAM IS CONNECTED
    def on_connect(self):
        print("Connected")

    def on_tweet(self, tweet):
        """Store a tweet if it is an original English tweet with no attachments.

        Tweets that reference another tweet (``referenced_tweets`` is set),
        are not in English, or carry attachments are skipped.

        Returns:
            True when the tweet is skipped, otherwise None.
        """
        # AVOID TWEETS THAT REFERENCE OTHER TWEETS, NON-ENGLISH TWEETS AND
        # TWEETS WITH ATTACHMENTS. The fields are None when absent, so a
        # non-None value is what marks a tweet for skipping.
        if tweet.referenced_tweets is not None:
            return True

        if tweet.lang != "en":
            return True

        if tweet.attachments is not None:
            return True

        # EXTRACTING ATTRIBUTES FROM TWEETS
        id_str = tweet.id
        created_at = tweet.created_at
        text = clean_tweet(tweet.text)  # PRE-PROCESSING
        sentiment = TextBlob(text).sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity
        struct = {'id_str': id_str,
                  'created_at': created_at,
                  'text': text,
                  'polarity': polarity,
                  'subjectivity': subjectivity}
        print(struct)

        if mydb.is_connected():
            mycursor = mydb.cursor()
            try:
                sql = "INSERT INTO {} (id_str, created_at, text, polarity, subjectivity) VALUES (%s, %s, %s, %s, %s)".format(settings.TABLE_NAME)
                val = (id_str, created_at, text, polarity, subjectivity)
                mycursor.execute(sql, val)
                mydb.commit()
            finally:
                # Release the cursor even if the insert fails.
                mycursor.close()

    ## TWITTER HAS RATE LIMITS, STOP DATA SCRAPING AFTER THRESHOLD.
    def on_request_error(self, status_code):
        """Handle a non-200 HTTP status from the stream request.

        This is the hook tweepy's StreamingClient invokes for HTTP errors; its
        return value is not used, so the stream is stopped explicitly with
        ``disconnect()`` when a rate-limit status (420 or 429) is received.
        """
        print(status_code)
        if status_code in (420, 429):
            self.disconnect()

    def on_error(self, status_code):
        """Legacy-style error hook, kept so existing direct callers keep working.

        Returns False for status 420; otherwise prints the status code.
        """
        if status_code == 420:
            return False
        print(status_code)
         

In [None]:
stream = MyStream(bearer_token=credentials.BEARER_TOKEN)

# CLEARS RULESET BEFORE STREAMING DATA
# get_rules().data is None when no rules are registered, so fall back to an
# empty list instead of iterating over None; delete all rule ids in one call.
existing_rules = stream.get_rules().data or []
if existing_rules:
    stream.delete_rules([rule.id for rule in existing_rules])

# ADDING RULES TO RULESET TO STREAM SPECIFIC DATA
# Keywords and exclusions must live in ONE rule: separate rules are OR'ed
# together, and a rule made only of negations is not accepted by the API.
# Further exclusions that can be appended if needed: -wallets -gm -mfer
stream.add_rules(tweepy.StreamRule(
    "($ETH OR #ETH OR ETH OR Ethereum) -is:retweet "
    "-giveaway -nfts -nft -#nft -#nfts -airdrop -#whale -presale -whitelist -#whitelist"
))

#START STREAM
stream.filter(
    expansions=["author_id"],
    tweet_fields=["created_at", "referenced_tweets", "lang", "attachments"],
)
