<a href="https://colab.research.google.com/github/Sofia-1234567/Forensic-Research-Project-2021/blob/main/Tweets_Collection_for_FRP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Mount the user's Google Drive into the Colab filesystem; later cells read the Twitter properties file from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#The Twitter properties file contains the Twitter accesstoken, accesstokensecret, apikey and apisecretkey.
#Listing the path confirms the file is reachable on the mounted Drive before a later cell parses it.
!ls '/content/drive/My Drive/Colab Notebooks/twitterproperties.txt'

'/content/drive/My Drive/Colab Notebooks/twitterproperties.txt'


In [3]:
#Tweepy is a Python library that allows us to access Twitter API. 
!pip install tweepy
import tweepy as tw
print(tw.__version__) #this will print the ver. of tweepy that has been imported

#We will use the pandas library (a library for data manipulation and analysis) to build a dataframe for the collected tweets
import pandas as pd

3.10.0


In [4]:
#We will need the configparser package for reading the properties file from the Google Drive.
#NOTE(review): configparser ships with the Python 3 standard library, so this install is probably redundant - confirm before removing it.
!pip install ConfigParser

Collecting ConfigParser
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405c4a230ca06c8ec9c1379f813caa52e/configparser-5.0.2-py3-none-any.whl
Installing collected packages: ConfigParser
Successfully installed ConfigParser-5.0.2


In [5]:
#In the properties file, the tokens and secret keys that will allow us to access the Twitter API are under a section called 'twitter'
import pandas as pd
import configparser

properties_path = '/content/drive/My Drive/Colab Notebooks/twitterproperties.txt'

config = configparser.RawConfigParser()
#config.read() silently skips files it cannot open and returns the list of files it actually parsed.
#Checking that list makes a missing/unmounted file fail here with a clear message,
#instead of surfacing later as a NoSectionError when the credentials are looked up.
loaded_files = config.read(properties_path)
if not loaded_files:
    raise FileNotFoundError(
        "Could not read the Twitter properties file: " + properties_path
    )

#Only the section names are printed, never the secret values themselves.
print(config.sections())

['twitter']


In [6]:
#Pull the four Twitter credentials out of the 'twitter' section of the properties file.
#The option names in the file are identical to the variable names, so they are read in one pass and unpacked in order.
accesstoken, accesstokensecret, apikey, apisecretkey = (
    config.get('twitter', option_name)
    for option_name in ('accesstoken', 'accesstokensecret', 'apikey', 'apisecretkey')
)

In [7]:
#Build the tweepy OAuth handler from the API key pair, then attach the user's access token pair to it.
auth = tw.OAuthHandler(apikey, apisecretkey)
auth.set_access_token(accesstoken, accesstokensecret)

#Create the API client from the auth handler.
#wait_on_rate_limit=True is meant to make tweepy pause when Twitter's rate-limit threshold is reached
#and resume once the limit resets (about 15 minutes, per the original author's note) rather than failing mid-collection.
api = tw.API(auth, wait_on_rate_limit=True)


In [8]:
#Search query: tweets matching any of the covid keywords, excluding retweets, and restricted to tweets with an image (filter:twimg).
#The query is written as two adjacent string literals which Python joins with NO separator, so the second
#literal must start with a space. Without it the query contained the single token "corona-filter:retweets",
#which both mangled the last keyword and meant the retweet exclusion was never applied.
search_words = (
    'covid OR covid-19 OR coronavirus OR covid19 OR corona'
    ' -filter:retweets AND filter:twimg'
)

#specify date we want the tweets from
#NOTE(review): Twitter's standard search endpoint is understood to cover only roughly the most recent week,
#so this date may have no practical effect - confirm against the Twitter API documentation.
date_since = "2020-01-01"

#upper bound on how many tweets the cursor will yield in total
max_tweets = 1000

#tells the tweepy cursor we want to perform an API search of search_words, with the language of the tweets being English,
#requesting the untruncated text (tweet_mode='extended') and the entities (needed for the image URL).
#The cursor is lazy: no request is made until it is iterated in a later cell.
tweets = tw.Cursor(api.search,
                   q=search_words,
                   lang="en",
                   since=date_since,
                   tweet_mode='extended',
                   include_entities=True).items(max_tweets)

In [9]:
#the items that will be extracted per tweet are: unique tweet identifier (as an integer), the full text of the tweet,
#the date the tweet was created, and the URL of the first image attached to the tweet.
#Tweets whose entities carry no 'media' entry are skipped: indexing entities['media'] directly raises KeyError
#on such a tweet, which would abort the whole (rate-limited, slow) collection and lose everything gathered so far.
tweet_details = []
for tweet in tweets:
    media_items = tweet.entities.get('media') or []
    if not media_items:
        continue
    tweet_details.append(
        [tweet.id, tweet.full_text, tweet.created_at, media_items[0]['media_url_https']]
    )

In [10]:
#Turn the collected rows into a dataframe, one row per tweet.
#Each row in tweet_details has four fields, which become the four named columns below.
tweet_df = pd.DataFrame(
    tweet_details,
    columns=[
        'tweet_id',
        'tweet_txt',
        'created_at',
        'tweet_img_URL',
    ],
)

#Raise the per-column display width to 400 characters so tweet text is shown rather than cut short;
#anything longer than that is still abbreviated with a "..." placeholder in the rendered output.
pd.set_option('max_colwidth', 400)

In [11]:
#filters the dataframe to only include rows whose image URL contains the text "pbs.twimg.com/media",
#i.e. this will filter out entries linking to GIFs and video thumbnails.
#regex=False makes this a literal substring match; by default str.contains treats the pattern as a regular
#expression, in which each "." would match any character.
is_photo_url = tweet_df['tweet_img_URL'].str.contains("pbs.twimg.com/media", regex=False)

#.copy() gives an independent dataframe, so assigning to its columns later does not operate on a filtered
#slice of the original (which triggers pandas' SettingWithCopyWarning).
tweet_df = tweet_df[is_photo_url].copy()

#this will show us the first 5 rows/entries in the dataframe
tweet_df.head(5)

Unnamed: 0,tweet_id,tweet_txt,created_at,tweet_img_URL
0,1379267554548535301,"Covid-19 Update!\n\nAt Blush, we are still committed to everyone's safety.\n\nWearing face masks and practicing social distancing while in the salon are still required. \n\nWe sure do hope that we can see your smiling faces soon!!😀\n\nThank you for your understanding &amp; cooperation. 💗 https://t.co/2GSa2s2SRF",2021-04-06 02:59:45,https://pbs.twimg.com/media/EyQkh7fXEAAoI_w.jpg
1,1379267544536743939,"Fox Business host Kennedy on Monday spoke with Mediaite’s own Colby Hall about the backlash CBS News and 60 Minutes faced for their story over the weekend hitting Florida Gov. Ron DeSantis over vaccine distribution.\n“Ah, the lefty media. #60Minutes\n\nhttps://t.co/mOGwSVyMJN https://t.co/XArBmZNP92",2021-04-06 02:59:42,https://pbs.twimg.com/media/EyQkhacWEAEpFqz.png
2,1379267534042578944,"We can help you achieve your goals, even from home!\n\nLearn more: https://t.co/JYUtfNf6Qp\n\n#fitnessgoals #homegym #goals https://t.co/R93osjQi8V",2021-04-06 02:59:40,https://pbs.twimg.com/media/EyQkg0tWEAExi-s.jpg
3,1379267533874806785,#CCPVirus_Joke\n\nThanks Salty Cracker @SaltyCracker9 #MAGA\n \n#CoronaVirus #WuhanVirus #COVID19 \n\nqt-covid19-jokes-072 #BorderObserver https://t.co/7YGFi9Z7Uw,2021-04-06 02:59:40,https://pbs.twimg.com/media/EyQkg0-WQAIyUJS.jpg
4,1379267528057356288,"This is not a #CCPVirus_Joke, but funny\n\nIs your #meth contaminated with #coronavirus? This Florida police dept. will test it for free \nMar 4, 2020\nhttps://t.co/l4UyCZxKdO https://t.co/4Jg5G6ymgL\n\n#methamphetamine #COVID19 #CCPVirus\n\nqt-covid19-jokes-052 #BorderObserver",2021-04-06 02:59:38,https://pbs.twimg.com/media/ESRjgacXcAIzfFn.jpg


In [12]:
#Show the dimensions of the dataframe as (no. of rows, no. of columns).
tweet_df.shape

(896, 4)

In [13]:
#dropping duplicate values in column "tweet_txt"

#first, let's do some simple cleaning. Here, @ mentions are replaced with "@user", and http/https links are replaced with nothing (i.e. they are removed).
import re

#Raw strings are used for the patterns: in a normal string literal "\w" is an invalid escape sequence,
#which Python warns about. Compiling once avoids re-parsing the patterns for every tweet.
_MENTION_PATTERN = re.compile(r"@[\w]*")
_LINK_PATTERN = re.compile(r"https?://[A-Za-z0-9./]*")

def clean_tweets(Tweet):
    """Return the tweet text with user mentions anonymised and links removed.

    Parameters
    ----------
    Tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every "@" followed by word characters replaced by "@user",
        and every http:// or https:// link (letters, digits, "." and "/" after the scheme) deleted.
        Any other characters, including whitespace around a removed link, are left as they were.
    """
    Tweet = _MENTION_PATTERN.sub("@user", Tweet)
    Tweet = _LINK_PATTERN.sub("", Tweet)
    return Tweet

#Clean the text first, so that tweets differing only in their mentions or links end up with identical text.
#.assign builds a new dataframe instead of writing into a column of a possibly filtered slice.
tweet_df = tweet_df.assign(tweet_txt=tweet_df['tweet_txt'].apply(clean_tweets))

#drop duplicate rows and keep the first instance
#drop_duplicates returns a NEW dataframe and leaves the original untouched, so the result has to be assigned back;
#calling it without keeping the result removes nothing.
tweet_df = tweet_df.drop_duplicates(subset="tweet_txt", keep='first')

tweet_df.head(5)

Unnamed: 0,tweet_id,tweet_txt,created_at,tweet_img_URL
0,1379267554548535301,"Covid-19 Update!\n\nAt Blush, we are still committed to everyone's safety.\n\nWearing face masks and practicing social distancing while in the salon are still required. \n\nWe sure do hope that we can see your smiling faces soon!!😀\n\nThank you for your understanding &amp; cooperation. 💗",2021-04-06 02:59:45,https://pbs.twimg.com/media/EyQkh7fXEAAoI_w.jpg
1,1379267544536743939,"Fox Business host Kennedy on Monday spoke with Mediaite’s own Colby Hall about the backlash CBS News and 60 Minutes faced for their story over the weekend hitting Florida Gov. Ron DeSantis over vaccine distribution.\n“Ah, the lefty media. #60Minutes\n\n",2021-04-06 02:59:42,https://pbs.twimg.com/media/EyQkhacWEAEpFqz.png
2,1379267534042578944,"We can help you achieve your goals, even from home!\n\nLearn more: \n\n#fitnessgoals #homegym #goals",2021-04-06 02:59:40,https://pbs.twimg.com/media/EyQkg0tWEAExi-s.jpg
3,1379267533874806785,#CCPVirus_Joke\n\nThanks Salty Cracker @user #MAGA\n \n#CoronaVirus #WuhanVirus #COVID19 \n\nqt-covid19-jokes-072 #BorderObserver,2021-04-06 02:59:40,https://pbs.twimg.com/media/EyQkg0-WQAIyUJS.jpg
4,1379267528057356288,"This is not a #CCPVirus_Joke, but funny\n\nIs your #meth contaminated with #coronavirus? This Florida police dept. will test it for free \nMar 4, 2020\n \n\n#methamphetamine #COVID19 #CCPVirus\n\nqt-covid19-jokes-052 #BorderObserver",2021-04-06 02:59:38,https://pbs.twimg.com/media/ESRjgacXcAIzfFn.jpg


In [14]:
#Check whether any duplicates were removed, i.e. whether the number of rows decreased compared with the earlier shape.
tweet_df.shape

(896, 4)

In [15]:
#Write the dataframe to an .xlsx file in the current working directory, leaving out the row-index column.
#An existing file with the same name is replaced, so change the file name before re-running if the earlier export should be kept.
tweet_df.to_excel(
    excel_writer='covid-4.xlsx',
    index=False,
)