## STEP 1: PYTHON PACKAGES INSTALLATION

Install the following python packages that will help you to collect data from twiter.com 

In [None]:
#!pip install tweepy 

In [None]:
# !pip install unidecode

## STEP 2: IMPORT IMPORTANT PACKAGES 

In [1]:
#import dependencies
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
from unidecode import unidecode
import time
import datetime
from tqdm import tqdm 
import pandas as pd  
import numpy as np 

## STEP 3:AUTHENTICATING TO Twitter’s API

(a) First, apply for a developer account to access the API. The Standard APIs are sufficient for this tutorial. They’re free, but have some limitations that we’ll learn to work around in this tutorial.

Click here to apply: [apply for developer account to acces the API](https://developer.twitter.com/en/apply-for-access)

(b) Once your developer account is setup, create an app that will make use of the API by clicking on your username in the top right corner to open the drop down menu, and clicking “Apps” as shown below. Then select “Create an app” and fill out the form. 

(c) Now that you have created a developer account and an app, you should have a set of keys to connect to the Twitter API. Specifically, you’ll have an
- API key
- API secret key
- Access token
- Access token secret

These could be inserted directly into your code to connect to the Twitter API, as shown below.

In [2]:
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

## STEP 4:  CONNECT TO TWITTER API USING THE SECRETS

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

## STEP 5: DEFINE A FUNCTION THAT WILL TAKE OUR SEARCH QUERY

In [4]:
def tweetSearch(query, limit):
    """
    This function will search a query provided in the twitter and,
    retun a list of all tweets that have a query. 
    """

    # Create a blank variable
    tweets = []

    # Iterate through Twitter using Tweepy to find our query with our defined limit
    for page in tweepy.Cursor(
        api.search, q=query, count=limit, tweet_mode="extended"
    ).pages(limit):
        for tweet in page:
            tweets.append(tweet)

    # return tweets
    return tweets

## STEP 6: CREATE A FUNCTION TO SAVE TWEETS INTO A DATAFRAME

In [5]:
def tweets_to_data_frame(tweets):
    """
    This function will receive tweets and collect specific data from it such as place, tweet's text,likes 
    retweets and save them into a pandas data frame.
    
    This function will return a pandas data frame that contains data from twitter.
    """
    df = pd.DataFrame(data=[tweet.full_text.encode('utf-8') for tweet in tweets], columns=["Tweets"])

    df["id"] = np.array([tweet.id for tweet in tweets])
    df["lens"] = np.array([len(tweet.full_text) for tweet in tweets])
    df["date"] = np.array([tweet.created_at for tweet in tweets])
    df["place"] = np.array([tweet.place for tweet in tweets])
    df["coordinateS"] = np.array([tweet.coordinates for tweet in tweets])
    df["lang"] = np.array([tweet.lang for tweet in tweets])
    df["source"] = np.array([tweet.source for tweet in tweets])
    df["likes"] = np.array([tweet.favorite_count for tweet in tweets])
    df["retweets"] = np.array([tweet.retweet_count for tweet in tweets])

    return df

## STEP 7: ADD TWITTER HASHTAGS RELATED TO GENDER BASED VIOLENCE

In [6]:
# add hashtags in the following list
hashtags = [
'#GBV',
'#sexism',
'#rape'    
]

## STEP 8: RUN BOTH FUNCTIONS TO COLLECT DATA FROM TWITTER RELATED TO THE HASHTAGS LISTED ABOVE

In [7]:
total_tweets = 0

"""
The following for loop will collect a tweets that have the hashtags
 mentioned in the list and save the tweets into csv file
"""

for n in tqdm(hashtags):
    # first we fetch all tweets that have specific hashtag
    hash_tweets = tweetSearch(query=n,limit=7000)
    total_tweets += int(len(hash_tweets))
    
    # second we convert our tweets into datarame
    df = tweets_to_data_frame(hash_tweets)
    
    #third we save the dataframe into csv file
    df.to_csv("{}_tweets.csv".format(n))

100%|██████████| 3/3 [05:30<00:00, 110.23s/it]


In [8]:
# show total number of tweets collected
print("total_tweets: {}".format(total_tweets))

total_tweets: 9158


In [18]:
data_rape = pd.read_csv('#rape_tweets.csv')
data_rape

Unnamed: 0.1,Unnamed: 0,Tweets,id,lens,date,place,coordinateS,lang,source,likes,retweets
0,0,"b'RT @KaceyKells: KELLCEY ""This story not only...",1389684729218093058,140,2021-05-04 20:53:53,,,en,Twitter Web App,0,10284
1,1,b'Had Twitter for like a month and still dk ho...,1389684562792484868,67,2021-05-04 20:53:13,,,en,Twitter for iPhone,0,0
2,2,b'RT @PixelProject: #AFRICA: Abuse of women in...,1389684068271468551,132,2021-05-04 20:51:15,,,en,Twitter for Android,0,2
3,3,b'RT @CDCInjury: 97% of males who survived #ra...,1389683952424833027,140,2021-05-04 20:50:47,,,en,Twitter for iPhone,0,2
4,4,b'RT @CDCInjury: 97% of males who survived #ra...,1389683668688515072,140,2021-05-04 20:49:40,,,en,Twitter Web App,0,2
...,...,...,...,...,...,...,...,...,...,...,...
4952,4952,"b'RT @eliasamare: 6) The #SexualViolence, #rap...",1386847674482102272,144,2021-04-27 01:00:26,,,en,Twitter for iPhone,0,220
4953,4953,"b""RT @eliasamare: 8) Again, here's Muluberhan ...",1386847625199058946,140,2021-04-27 01:00:14,,,en,Twitter for iPhone,0,232
4954,4954,b'RT @FeministLibya: In a football match today...,1386847010288975875,139,2021-04-27 00:57:48,,,en,Twitter Web App,0,4
4955,4955,b'.\xe2\x80\x98A Tigrayan womb should never gi...,1386846185474854912,275,2021-04-27 00:54:31,,,en,Twitter Web App,0,0


In [19]:
data_rape.isnull().sum()

Unnamed: 0        0
Tweets            0
id                0
lens              0
date              0
place          4928
coordinateS    4955
lang              0
source            0
likes             0
retweets          0
dtype: int64

In [21]:
data_sexism = pd.read_csv('#sexism_tweets.csv')
data_sexism

Unnamed: 0.1,Unnamed: 0,Tweets,id,lens,date,place,coordinateS,lang,source,likes,retweets
0,0,b'Death to #religion\nDeath to #homophobia\nDe...,1389681824906063877,169,2021-05-04 20:42:20,,,en,Twitter for Android,0,0
1,1,b'RT @DingaBelle: @_snowbunting @Flomoll \xf0\...,1389680927031369734,140,2021-05-04 20:38:46,,,en,Twitter for Android,0,3
2,2,b'RT @DingaBelle: @_snowbunting @Flomoll \xf0\...,1389676992648318980,140,2021-05-04 20:23:08,,,en,Twitter for iPhone,0,3
3,3,"b""Finally watched #SexyLamp after years of try...",1389676944782934017,229,2021-05-04 20:22:57,,,en,Twitter for Android,1,0
4,4,b'\xf0\x9f\x98\xb1\xf0\x9f\x98\xb1\xf0\x9f\x98...,1389675769522184199,302,2021-05-04 20:18:17,,,en,Twitter for Android,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1140,1140,b'Have you ever given thought to the various f...,1386855800992198668,280,2021-04-27 01:32:44,,,en,Twitter Web App,0,0
1141,1141,b'RT @SkaKeller: Very good that @vonderleyen p...,1386855523027492865,140,2021-04-27 01:31:37,,,en,Twitter for Android,0,22
1142,1142,b'How #Sexism \n\nIs #Coded Into the #Tech Ind...,1386854623391125507,289,2021-04-27 01:28:03,,,en,Buffer,24,16
1143,1143,"b'RT @ESWNtweets: ""While there were significan...",1386853519538331655,140,2021-04-27 01:23:40,,,en,Twitter for Android,0,8


In [22]:
data_sexism.isnull().sum()

Unnamed: 0        0
Tweets            0
id                0
lens              0
date              0
place          1134
coordinateS    1145
lang              0
source            0
likes             0
retweets          0
dtype: int64

In [23]:
data_GBV = pd.read_csv('#GBV_tweets.csv')

In [24]:
data_GBV

Unnamed: 0.1,Unnamed: 0,Tweets,id,lens,date,place,coordinateS,lang,source,likes,retweets
0,0,b'Can\xe2\x80\x99t wait for you kids to hear t...,1389683945143427076,136,2021-05-04 20:50:46,,,en,Twitter for iPhone,0,0
1,1,b'RT @CedawPT: Women journalists can be target...,1389683664150319113,148,2021-05-04 20:49:39,,,en,Twitter for Android,0,18
2,2,"b'RT @mobkimarkus7: Save the Girl Child, suppo...",1389681203838656516,140,2021-05-04 20:39:52,,,en,Twitter for Android,0,5
3,3,b'RT @CedawPT: Women journalists can be target...,1389681007369011209,148,2021-05-04 20:39:05,,,en,Twitter for Android,0,18
4,4,b'Important study about how how construction w...,1389680772123086849,114,2021-05-04 20:38:09,,,en,Twitter for Android,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3051,3051,b'RT @GBV_CHILE: Seccion #MTCero #GBV y @Carab...,1386885052034912260,140,2021-04-27 03:28:58,,,es,Twitter for Android,0,2
3052,3052,b'Seccion #MTCero #GBV y @Carabdechile Com. #E...,1386882452837257218,297,2021-04-27 03:18:38,,,es,Twitter for Android,2,2
3053,3053,"b'RT @JackieRange: ""If you go into our factori...",1386871901834731521,140,2021-04-27 02:36:42,,,en,Twitter for iPhone,0,2
3054,3054,b'RT @LesMichif: Only THREE days left to regis...,1386868002197426177,139,2021-04-27 02:21:13,,,en,Twitter for iPhone,0,5


In [25]:
data_GBV.isnull().sum()

Unnamed: 0        0
Tweets            0
id                0
lens              0
date              0
place          3036
coordinateS    3056
lang              0
source            0
likes             0
retweets          0
dtype: int64

In [28]:
data_GBV['lang'].value_counts()

en     2921
es       57
und      38
de       29
ne        3
nl        2
ar        1
in        1
it        1
da        1
el        1
tl        1
Name: lang, dtype: int64

In [30]:
d = data_GBV['Tweets']
d

0       b'Can\xe2\x80\x99t wait for you kids to hear t...
1       b'RT @CedawPT: Women journalists can be target...
2       b'RT @mobkimarkus7: Save the Girl Child, suppo...
3       b'RT @CedawPT: Women journalists can be target...
4       b'Important study about how how construction w...
                              ...                        
3051    b'RT @GBV_CHILE: Seccion #MTCero #GBV y @Carab...
3052    b'Seccion #MTCero #GBV y @Carabdechile Com. #E...
3053    b'RT @JackieRange: "If you go into our factori...
3054    b'RT @LesMichif: Only THREE days left to regis...
3055    b'RT @MzeeBwanika: Man beats wife and breaks h...
Name: Tweets, Length: 3056, dtype: object