<a href="https://colab.research.google.com/github/Sofia-1234567/Forensic-Research-Project-2021/blob/main/Tweets_Collection_for_FRP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Mount Google Drive inside the Colab runtime at /content/drive so that files stored on the Drive can be read by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#The Twitter properties file contains the Twitter accesstoken, accesstokensecret, apikey and apisecretkey.
#Listing it here is a quick check that the Drive is mounted and the file exists at the expected path.
!ls '/content/drive/My Drive/Colab Notebooks/twitterproperties.txt'

In [None]:
#Tweepy is a Python library that allows us to access Twitter API. 
!pip install tweepy
import tweepy as tw
print(tw.__version__) #this will print the ver. of tweepy that has been imported

#We will use the pandas library (a library for data manipulation and analysis) to build a dataframe for the collected tweets
import pandas as pd

In [None]:
#We will need the configparser package for reading the properties file from the Google Drive
!pip install ConfigParser

In [None]:
#In the properties file, the tokens and secret keys that will allow us to access the Twitter API are under a section called 'twitter'
#(section names are case-sensitive, and the credentials are looked up under 'twitter' in lower case)
import pandas as pd
import configparser

#location of the properties file on the mounted Google Drive
PROPERTIES_PATH = '/content/drive/My Drive/Colab Notebooks/twitterproperties.txt'

config = configparser.RawConfigParser()

#config.read() silently skips a file it cannot open and returns the list of files it actually parsed,
#so check that list and stop here with a clear message instead of hitting a confusing NoSectionError later on
if not config.read(PROPERTIES_PATH):
    raise FileNotFoundError('Twitter properties file not found or unreadable: ' + PROPERTIES_PATH)

#show which sections were loaded (section names only, the secret values are not printed)
print(config.sections())

In [None]:
#Read the four credentials from the 'twitter' section of the parsed properties file
#and bind each one to the variable that carries the same name as its option.
def _twitter_setting(option):
    """Return the value stored under `option` in the 'twitter' section of `config`."""
    return config.get('twitter', option)

accesstoken = _twitter_setting('accesstoken')
accesstokensecret = _twitter_setting('accesstokensecret')
apikey = _twitter_setting('apikey')
apisecretkey = _twitter_setting('apisecretkey')

In [None]:
#Build the tweepy OAuth handler from the API key and API secret key, then attach the access token and access token secret to it
auth = tw.OAuthHandler(apikey, apisecretkey)
auth.set_access_token(accesstoken, accesstokensecret)

#Create the API client from the auth handler.
#wait_on_rate_limit=True asks tweepy to pause and wait when the request threshold is reached, instead of failing with a rate-limit error;
#collection then resumes once the limit resets (presumably a 15-minute window for this endpoint - confirm against the Twitter API documentation)
api = tw.API(auth, wait_on_rate_limit=True)


In [None]:
#words and phrases to search for in one tweet: put each phrase inside "" and combine them with OR / AND, e.g. '"phrase one" OR "phrase two"'
search_terms = ''

#query operators that are always applied: filter out retweets, filter on username (from:user), and keep only tweets that include an image
search_filters = '-filter:retweets AND from:user AND filter:twimg'

#join the two parts with a single space. Writing them as two adjacent string literals glues them together with no separator,
#so the last search term would run straight into "-filter:retweets". With no search terms the query is just the filters.
search_words = ' '.join(part for part in (search_terms.strip(), search_filters) if part)

#specify date we want the tweets from
#NOTE(review): the standard search endpoint is documented as covering only roughly the last 7 days of tweets, so a date this old probably has no effect - confirm
date_since = "2020-01-01"

#maximum number of tweets (items) to pull
MAX_TWEETS = 1000

#tweepy 4.0 renamed API.search to API.search_tweets; use whichever one the installed version provides so this cell runs on both
search_endpoint = getattr(api, 'search_tweets', None) or api.search

#tells the tweepy cursor we want to perform an API search of search_words, with language of the tweets being english, and to pull at most MAX_TWEETS tweets (items)
tweets = tw.Cursor(search_endpoint,
                   q=search_words,
                   lang="en",
                   since=date_since,
                   tweet_mode='extended',   #extended mode is what exposes the text of each tweet as full_text
                   include_entities=True    #ask for the entities (which hold any attached media) with each tweet
                   ).items(MAX_TWEETS)

In [None]:
#the items that will be extracted per tweet are: unique tweet identifier (as an integer), the screen name of the author, the full text of the tweet,
#the date the tweet was created, and the URL of the first media item attached to the tweet
#a tweet without attached media has no 'media' key in its entities, and indexing it would raise a KeyError, so such tweets are skipped
tweet_details = []
for tweet in tweets:
    media = tweet.entities.get('media')
    if not media:
        continue
    tweet_details.append([tweet.id, tweet.user.screen_name, tweet.full_text, tweet.created_at, media[0]['media_url']])

In [None]:
#creating a dataframe of tweet_details
#column headings are: tweet_id, user, tweet_txt, created_at, tweet_img_URL (one heading per value stored for each tweet, in the same order)
tweet_columns = ['tweet_id', 'user', 'tweet_txt', 'created_at', 'tweet_img_URL']
tweet_df = pd.DataFrame(data=tweet_details, columns=tweet_columns)

#sets the maximum width in characters of a column. When the column overflows, a "..." placeholder is embedded in the output.
#the full option name is used because an abbreviated name is resolved by pattern matching and can stop working if pandas adds a similarly named option
pd.set_option('display.max_colwidth', 400)

In [None]:
#filters the dataframe to only include rows whose image URL contains the text "pbs.twimg.com/media"; the aim is to filter out entries linking to GIFs and video thumbnails
#regex=False matches the text literally (read as a regular expression, each "." would match any character)
#na=False treats a missing URL as "no match", so such a row is dropped instead of raising an error
is_photo_url = tweet_df['tweet_img_URL'].str.contains("pbs.twimg.com/media", regex=False, na=False)

#.copy() makes the filtered result an independent dataframe, so assigning to one of its columns later does not raise SettingWithCopyWarning
tweet_df = tweet_df[is_photo_url].copy()

#this will show us the first 5 rows/entries in the dataframe
tweet_df.head(5)

In [None]:
#this gives the dimensions of the dataframe as a tuple: (no. of rows, no. of columns)
tweet_df.shape

In [None]:
#dropping duplicate values in column "tweet_txt"

#first, lets do some simple cleaning. Here, @ mentions are replaced with "@user", and https links are replaced with nothing (i.e. they are removed).
import re
def clean_tweets(Tweet):
    """Normalise the text of a tweet so that near-identical tweets compare equal.

    Every @mention is replaced with the placeholder "@user" and every
    http/https link is removed.

    Parameters
    ----------
    Tweet : str
        The raw text of one tweet.

    Returns
    -------
    str
        The text with mentions anonymised and links stripped.
    """
    #raw strings keep the regex escapes intact (a plain "\w" is an invalid string escape)
    #"\w+" needs at least one character after the "@", so a lone "@" is not turned into "@user"
    Tweet = re.sub(r"@\w+", "@user", Tweet)
    #"\S+" removes the whole link up to the next whitespace, including characters such as "-", "_", "?" and "="
    #that would otherwise be left behind as fragments; punctuation glued to the end of a link is removed with it
    Tweet = re.sub(r"https?://\S+", "", Tweet)
    return Tweet

#clean the text of every tweet. assign() returns a new dataframe instead of writing into a column of a filtered dataframe,
#which avoids pandas' SettingWithCopyWarning; the function is passed directly, no lambda wrapper is needed
tweet_df = tweet_df.assign(tweet_txt=tweet_df['tweet_txt'].map(clean_tweets))

#drop rows whose cleaned text duplicates an earlier row and keep the first instance
tweet_df = tweet_df.drop_duplicates(subset=['tweet_txt'], keep='first')

#show the first 16 rows/entries of the de-duplicated dataframe
tweet_df.head(16)

In [None]:
#let's see if any duplicates were removed, i.e. did the no. of rows decrease compared with the shape shown before de-duplication?
tweet_df.shape

In [None]:
#saves the dataframe to .xlsx file without a column for row indexing
#careful not to overwrite an existing file!
#tweet_df.to_excel('BLM-6-neutral.xlsx', index=False)  