## EXTRACTING DATA FROM TWITTER

In [3]:
import os
import time
import tweepy
import pandas as pd

# Defining access credentials
consumer_key = os.getenv("TWITTER_CONSUMER_KEY")
consumer_secret = os.getenv("TWITTER_CONSUMER_SECRET")
access_token = os.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret = os.getenv("TWITTER_ACCES_TOKEN_SECRET")

# Authenticate with the Twitter API.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create a Twitter object
api = tweepy.API(auth)

def prettytime(t:int):
    """Dada una cantidad t de segundos, lo devuelve en formato hh:mm:ss"""
    hh, mm = divmod(t, 3600)
    mm, ss = divmod(mm, 60)
    hh, mm, ss = map(lambda x: str(int(x)), [hh,mm,ss])
    return f'{hh.zfill(2)}:{mm.zfill(2)}:{ss.zfill(2)}'

def extract_tweets(query, geocode):
    # set query
    # set geocode
    buffer, tweets = [None], [None, None]
    while len(tweets) > 1:
        last = buffer.pop()
        if last is None:
            tweets = []
            max_id_ = None
        else:
            max_id_ = last[0]
        tweets = api.search_tweets(q=query, 
                                   geocode=geocode, 
                                   lang='es', 
                                   result_type='recent', 
                                   count=100, 
                                   max_id=max_id_)
        buffer.extend([tweet.id, tweet.text] for tweet in tweets)
        time.sleep(5)
    return pd.DataFrame(data=buffer, columns=['id', 'tweet'])

keywords = {
    "abuse": ['perra', 'cállate', 'sumisa', 'mujer', 'puta', 'tonta', 'florero', 'rubia', 'nenaza'],
    "hate":['mujer', 'feminismo', 'feminazi', 'daño', 'hembrismo', 'igualdad', 'todas'],
    "profanities":['estupida', 'guarra', 'zorra', 'madre', 'golfa', 'odio'],
    "violent": ['follar', 'acoso', 'sexual', 'viola', 'teta', 'niña', 'sodomizar'],
    "sexually-explicit": ['culo', 'mereces', 'polla', 'perra', 'falda', 'coño', 'gorda', 'lengua']
}

mexican_corpus = pd.DataFrame()
for typ, kwords in keywords.items():
    t0 = time.time()
    query = " OR ".join(kwords)
    data = extract_tweets(query, geocode='20.659698,-103.349609,100km')
    t1 = time.time()
    data['supposed'] = [typ] * len(data)
    mexican_corpus = pd.concat([mexican_corpus, data], axis=0)
    print(f"{len(data)} tweets extracted for '{typ}' keywords in {prettytime(t1-t0)}")
mexican_corpus

4254 tweets extracted for 'abuse' keywords in 00:04:31
6286 tweets extracted for 'hate' keywords in 00:06:20
5044 tweets extracted for 'profanities' keywords in 00:04:58
1467 tweets extracted for 'violent' keywords in 00:01:42
1989 tweets extracted for 'sexually-explicit' keywords in 00:02:50


Unnamed: 0,id,tweet,supposed
0,1651746942828048384,@danii_naranjoo @Chivas Nmms nooo cállate que ...,abuse
1,1651746630435995649,@rondinellicrist @rhdelfutbol @ArchivoFutbolAR...,abuse
2,1651746167938506752,me siento bien tonta al no saber manejar carro😔,abuse
3,1651745379342340098,@Richs__Es @elnocturno @Crash_Valentine Chinga...,abuse
4,1651745060692840449,@callodehacha Este wey muerto de hambre que le...,abuse
...,...,...,...
1984,1648214723454795776,Me vale que se me haga chicharrón el culo,sexually-explicit
1985,1648213181460881408,Hoy alguien que quiero mucho me hizo feliz\nMe...,sexually-explicit
1986,1648212831727239170,tengo miedo. Ya estaba dormida como toda una v...,sexually-explicit
1987,1648211083159650305,Wey una gran amiga de la uni tiene una sección...,sexually-explicit


In [5]:
mexican_corpus.to_csv("../data/extracted/mexican_corpus.tsv", sep="\t", index=False)