# Sampling

In [1]:
%matplotlib inline

import os
import csv
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from datetime import datetime

os.chdir('../..')

from src.utils.db_manager import *
from src.utils.data_wrangler import *
from src.analyzer.data_analyzer import *

plt.style.use('seaborn')

In [276]:
pd.set_option('display.max_colwidth', 280)

## Internas 2017

In [2]:
dbm = DBManager(collection='tweets', db_name='internas17')

### Seleccionar tweets

Abajo se seleccionan tweets que cumplan con las siguientes condiciones: son relevantes, relacionados a la ANR, que contengan al menos 1 hashtag y no RTs.

In [280]:
filter_query = {
    'relevante': 1,
    'flag.partido_politico.anr': {'$gt': 0}, 
    'tweet_obj.type': {'$ne': 'retweet'},
    'tweet_obj.entities.hashtags.0': {'$exists': 1}
}
fields_to_retrieve = {
    '_id': 0,
    'tweet_obj.id_str': 1,
    'tweet_py_datetime': 1,
    'tweet_obj.user.screen_name': 1,
    'tweet_obj.complete_text': 1,
    'tweet_obj.type': 1,
    'tweet_obj.retweet_count': 1,
    'tweet_obj.favorite_count': 1,
    'tweet_obj.entities.hashtags': 1
}
data = dbm.get_tweets_reduced(filter_query, fields_to_retrieve)
tweets_df = pd.DataFrame(data)

In [281]:
print('Se seleccionaron {:,} tweets'.format(tweets_df.shape[0]))

Se seleccionaron 8,601 tweets


#### Renombrar columnas

In [282]:
tweets_df = tweets_df.rename(columns={
    'tweet_obj_id_str': 'id', 
    'tweet_py_datetime': 'datetime',
    'tweet_obj_user': 'screen_name',
    'tweet_obj_complete_text': 'text',
    'tweet_obj_type': 'type',
    'tweet_obj_retweet_count': 'retweet_count',
    'tweet_obj_favorite_count': 'favorite_count',
    'tweet_obj_entities': 'hashtags'
})

#### Procesar columna screen_name

In [283]:
tweets_df['screen_name'] = tweets_df['screen_name'].apply(lambda x: x['screen_name'])

#### Procesar columna hashtags

In [284]:
def process_hashtags(h_dict):
    try:
        hashtags_list = h_dict['hashtags']
        hashtags = []
        for hashtag in hashtags_list:
            hashtags.append(hashtag['text'])
        return ','.join(hashtags)
    except:
        print(h_dict)

In [285]:
tweets_df['hashtags'] = tweets_df['hashtags'].apply(process_hashtags)

#### Reordenar columnas

In [286]:
new_order = [3,7,5,6,0,4,2,1]
tweets_df = tweets_df[tweets_df.columns[new_order]]

#### Desplegar muestra

In [287]:
tweets_df.head()

Unnamed: 0,id,datetime,type,screen_name,text,retweet_count,favorite_count,hashtags
0,937410405462368257,12/03/17 16:57:11,reply,RosaSanchezCru5,@caraocruzpy @SantiPenap #SantiPresidente #GanaLista2,0,1,"SantiPresidente,GanaLista2"
1,937406834473930752,12/03/17 16:43:00,reply,ceciliagales23,@SantiPenap LA DUPLA GANADORA Santi-Gneiting #JuntosHagamosM√°s,0,0,JuntosHagamosM√°s
2,937480342826553346,12/03/17 21:35:05,reply,Rocio_La_Negra,@caraocruzpy @jtorresromero @clariarias @marianpinedapy @Unicanal @SantiPenap A no aflojar #JuntosHacemosMas,0,0,JuntosHacemosMas
3,936562871902851072,12/01/17 08:49:23,reply,carlosave16,@SantiPenap #HonorColorado #SantiPresidente #GanaLista2,0,0,"HonorColorado,SantiPresidente,GanaLista2"
4,936634021349593089,12/01/17 13:32:07,original,PrensaSanti2018,#CaminandoJuntos @SantiPenap visit√≥ esta ma√±ana casa por casa a los vecinos del Barrio Tacumb√∫ de Asunci√≥n.\nüìç¬°Esta‚Ä¶ https://t.co/BnD556vK3k,12,48,CaminandoJuntos


### Filtrar tweets

Filtrar tweets cuyos hashtags no correspondan a los hashtagas de campa√±a

In [289]:
import unicodedata
###
# Remove non-ascii characters and put hashtag to lower case
###
def process_hashtag(hashtag):
    hashtag = unicodedata.normalize('NFD', hashtag).encode('ascii', 'ignore').decode()
    return hashtag.lower()

In [290]:
campaign_hashtags = pd.read_csv('data/hashtags_internas.csv')
campaign_hashtags['hashtag_campaign'] = campaign_hashtags['hashtag_campaign'].apply(process_hashtag)
hashtags_list = list(campaign_hashtags['hashtag_campaign'])

In [291]:
def contain_campaign_hashtags(hashtags_str):
    hashtags = hashtags_str.split(',')
    found_campaign_ht = False
    for hashtag in hashtags:
        hashtag = process_hashtag(hashtag)
        if hashtag in hashtags_list:
            found_campaign_ht = True
            break
    return found_campaign_ht

Crear una columna auxiliar para indicar los tweets que contienen los hashtags de campa√±a

In [292]:
tweets_df['contain_campaign_hashtags'] = tweets_df['hashtags'].apply(contain_campaign_hashtags)

In [293]:
tweets_df = tweets_df[tweets_df['contain_campaign_hashtags']==True]

Eliminar columna auxiliar

In [294]:
tweets_df = tweets_df.drop(['contain_campaign_hashtags'], axis=1)

In [295]:
tweets_df.shape

(3036, 8)

### Elegir muestra aleatoria

El tama√±o de la muestra fue seleccionado considerando un marjen de error de 5%, un nivel de confianza de 95%, y la cantidad de filas en `tweets_df`. Para el c√°lculo, se utiliz√≥ la siguiente [calculadora](https://www.surveymonkey.com/mp/sample-size-calculator/).

In [296]:
SAMPLE_SIZE=368

In [270]:
try:
    annotated_sample = pd.read_csv('datasets/dataframes_internas/tweets_anotados_internas.csv', dtype={'id': object})
    # remove rows with na ids
    annotated_sample = annotated_sample[annotated_sample['id'].notna()]
except:
    annotated_sample = None

In [301]:
if annotated_sample is None:
    sample_df = tweets_df.sample(n=SAMPLE_SIZE, random_state=1)
else:
    aux_tweets_df = tweets_df[~tweets_df['id'].isin(annotated_sample['id'])]
    sample_df = tweets_df.sample(n=(SAMPLE_SIZE-annotated_sample.shape[0]), random_state=1)

In [306]:
sample_df = sample_df.drop(['hashtags'], axis=1)

In [307]:
print('Tama√±o sample: {} registros'.format(sample_df.shape[0]))

Tama√±o sample: 203 registros


In [308]:
sample_df.head()

Unnamed: 0,id,datetime,type,screen_name,text,retweet_count,favorite_count
7,937349483406512129,12/03/17 12:55:06,original,victorbogadopy,Agradezco a Eduardo Palacios vicepresidente de la Seccional 11 y a todo el equipo de la #Lista2 #SantiPresidente‚Ä¶ https://t.co/PmQHwsUox2,2,8
1929,941026222515146753,12/13/17 16:25:09,original,PraxedesReynoso,El 17 de diciembre vot√° por un #ParaguayDeLaGente! con @MaritoAbdo\n#Vot√°Lista3,0,0
127,937144015258087426,12/02/17 23:18:39,reply,MilciadesJoseG,@SantiPenap #SantiPresidente #EstoyConSanti #JuntosHagamosM√°s,0,0
1374,937992603768971264,12/05/17 07:30:38,reply,sebastiansosa80,@SantiPenap 12 D√≠as para la victoria! #SantiPe√±aPresidente #JuntosHagamosM√°s,0,0
382,938699956852084736,12/07/17 06:21:24,original,PrensaSanti2018,üì∫ @SantiPenap estar√° hoy en @CadaDiaSNT con @yolandapark1 y @TrocheCarlos por el @sntcanal9 \n#SigamosConversando https://t.co/3fVTd7WCLl,13,39


### Exportar muestra

In [309]:
if annotated_sample is None:
    sample_df.to_csv('datasets/dataframes_internas/sample_internas.csv', index=False)
else:
    sample_df.to_csv('datasets/dataframes_internas/sample_internas_{}.csv'.format(sample_df.shape[0]), index=False)

---

## Generales 2018

In [310]:
dbm = DBManager(collection='tweets', db_name='generales2018')

### Seleccionar tweets

Abajo se seleccionan tweets que cumplan con las siguientes condiciones: son relevantes, que contengan al menos 1 hashtag y no RTs.

In [333]:
filter_query = {
    'relevante': 1,
    'tweet_obj.type': {'$ne': 'retweet'},
    'tweet_obj.entities.hashtags.0': {'$exists': 1}
}
fields_to_retrieve = {
    '_id': 0,
    'tweet_obj.id_str': 1,
    'tweet_py_date': 1,
    'tweet_obj.user.screen_name': 1,
    'tweet_obj.complete_text': 1,
    'tweet_obj.type': 1,
    'tweet_obj.entities.hashtags': 1
}
data = dbm.get_tweets_reduced(filter_query, fields_to_retrieve)
tweets_df = pd.DataFrame(data)

In [321]:
print('Se seleccionaron {:,} tweets'.format(tweets_df.shape[0]))

Se seleccionaron 13,192 tweets


#### Renombrar columnas

In [335]:
tweets_df = tweets_df.rename(columns={
    'tweet_obj_id_str': 'id', 
    'tweet_py_date': 'date',
    'tweet_obj_user': 'screen_name',
    'tweet_obj_complete_text': 'text',
    'tweet_obj_type': 'type',
    'tweet_obj_entities': 'hashtags'
})

#### Procesar columna screen_name

In [337]:
tweets_df['screen_name'] = tweets_df['screen_name'].apply(lambda x: x['screen_name'])

#### Procesar columna hashtags

In [338]:
def process_hashtags(h_dict):
    try:
        hashtags_list = h_dict['hashtags']
        hashtags = []
        for hashtag in hashtags_list:
            hashtags.append(hashtag['text'])
        return ','.join(hashtags)
    except:
        print(h_dict)

In [339]:
tweets_df['hashtags'] = tweets_df['hashtags'].apply(process_hashtags)

#### Reordenar columnas

In [341]:
new_order = [2,5,3,4,0,1]
tweets_df = tweets_df[tweets_df.columns[new_order]]

#### Desplegar muestra

In [342]:
tweets_df.head()

Unnamed: 0,id,date,type,screen_name,text,hashtags
0,977568574146523144,03/24/18,original,leitorubin,"¬°Ya nos encontramos en el estadio Cerrito en Coronel Oviedo! Con la #Lista5 de la @AlianzaGanar, los ovetenses saben que #ElCambioEsPosible üáµüáæ https://t.co/tehKjxBhuF","Lista5,ElCambioEsPosible"
1,977557663469973504,03/24/18,original,SantacruzFISA,"#Marito no quiere debatir con los j√≥venes, @EfrainAlegre s√≠ propone educaci√≥n p√∫blica de calidad, 22 de Abril, vot√° LISTA5‚É£üñê @AlianzaGanar https://t.co/ORRBZF4rZe",Marito
2,977363949921406976,03/23/18,quote,info_paraguay,@FrenteGuasuPY @lugo_py @InfoLugo18 @SixtoPereira @HugoRicher_ @carlosfilizzola @esperanza_py @luipaciello40 @mercedescanese #Lista40 #FrenteGuasu #ElEquipoDelPueblo @EfrainAlegre @AlianzaGanar #Lista5 https://t.co/yLi3qdiyb9,"Lista40,FrenteGuasu,ElEquipoDelPueblo,Lista5"
3,977288363513466880,03/23/18,original,fran_rat,Yo eleg√≠ al candidato ganador con propuestas inclusivas para el pueblo y por sobre todo para la juventud üí™üí™üí™üëè @EfrainAlegre\n@AlianzaGanar #Lista5\nhttps://t.co/kkEJfnNvaJ,Lista5
4,977201300453588992,03/23/18,original,pdpprogresista,#Autoblindaje @Horacio_Cartes @JuanAfara @AllianaPedro @MaritoAbdo @ANRParaguay @ANRParaguay @PLRAOficial @bancada_b @EfrainAlegre @leitorubin @AlianzaGanar @patriaquerida @DipAvanzaPais @FrenteGuasuPY @avancemos_pais @ABCCardinal @UHPolitica @5diasPY @AQEpy https://t.co/6JrT...,Autoblindaje


### Filtrar tweets

Filtrar tweets cuyos hashtags no correspondan a los hashtags de campa√±a

In [343]:
import unicodedata
###
# Remove non-ascii characters and put hashtag to lower case
###
def process_hashtag(hashtag):
    hashtag = unicodedata.normalize('NFD', hashtag).encode('ascii', 'ignore').decode()
    return hashtag.lower()

In [345]:
campaign_hashtags = pd.read_csv('data/hashtags_generales.csv')
campaign_hashtags['hashtag_campaign'] = campaign_hashtags['hashtag_campaign'].apply(process_hashtag)
hashtags_list = list(campaign_hashtags['hashtag_campaign'])

In [344]:
def contain_campaign_hashtags(hashtags_str):
    hashtags = hashtags_str.split(',')
    found_campaign_ht = False
    for hashtag in hashtags:
        hashtag = process_hashtag(hashtag)
        if hashtag in hashtags_list:
            found_campaign_ht = True
            break
    return found_campaign_ht

Crear una columna auxiliar para indicar los tweets que contienen los hashtags de campa√±a

In [346]:
tweets_df['contain_campaign_hashtags'] = tweets_df['hashtags'].apply(contain_campaign_hashtags)

In [347]:
tweets_df = tweets_df[tweets_df['contain_campaign_hashtags']==True]

Eliminar columna auxiliar

In [348]:
tweets_df = tweets_df.drop(['contain_campaign_hashtags'], axis=1)

In [349]:
tweets_df.shape

(6332, 6)

### Elegir muestra aleatoria

El tama√±o de la muestra fue seleccionado considerando un marjen de error de 5%, un nivel de confianza de 95%, y la cantidad de filas en `tweets_df`. Para el c√°lculo, se utiliz√≥ la siguiente [calculadora](https://www.surveymonkey.com/mp/sample-size-calculator/).

In [350]:
SAMPLE_SIZE=374

In [355]:
try:
    annotated_sample = pd.read_csv('datasets/dataframes_generales/tweets_anotados_generales.csv', 
                                   dtype={'id': object})
    # remove rows with na ids
    annotated_sample = annotated_sample[annotated_sample['id'].notna()]
except:
    annotated_sample = None

In [356]:
if annotated_sample is None:
    sample_df = tweets_df.sample(n=SAMPLE_SIZE, random_state=1)
else:
    aux_tweets_df = tweets_df[~tweets_df['id'].isin(annotated_sample['id'])]
    sample_df = tweets_df.sample(n=(SAMPLE_SIZE-annotated_sample.shape[0]), random_state=1)

In [357]:
sample_df = sample_df.drop(['hashtags'], axis=1)

In [358]:
print('Tama√±o sample: {} registros'.format(sample_df.shape[0]))

Tama√±o sample: 171 registros


In [359]:
sample_df.head()

Unnamed: 0,id,date,type,screen_name,text
1883,976851277413003266,03/22/18,original,juliethaguirr_M,#ParaguaySeMueve #MaritoDeLaGente .@5diasPY .@paraguaycom .@Unicanal .@RadioLibrePy .@Py2018 .@ArnoldoWiens .@MarioVarelaGob .@SenadoresPy .@julioullon https://t.co/FOk61Urotg
11654,987832841559044096,04/21/18,original,HernanPereyraOk,#MaritoDeLaGente #El22GanaMarito #ParaguaySeMueve \nhttps://t.co/cj20kGZMYb v√≠a @YouTube
5184,984084377436082184,04/11/18,original,RkMhf2SJBrN5N7o,‚òÖ #ParaguaySeMueve con @MaritoAbdo‚òÖ @HernandariasPy @ANRParaguay @AllianaPedro @Carmen2016A @LuisCastiglioni @zacariasirun @ANR_Redes‚òÖ\nMasivo apoyo a la Lista 1 en acto de cierre de campa√±a en Asunci√≥n https://t.co/ccekI502Ce
241,977752274717040641,03/24/18,original,elciraram0s,@Maritoabdo con las mujeres guerreras de Paraguay #ParaguaySeMueve #MaritoDeLaGente .@CardinalAM .@News_CC .@rnpy920am .@ParaguayTVHD .@ParaguayTVHD .@780AM https://t.co/4pSlUoEgAW
1840,976926110540881921,03/22/18,original,EDITHVILLA3,#ParaguaySeMueve con #MaritoDeLaGente https://t.co/Kk2C7G9xmf


### Exportar muestra

In [360]:
if annotated_sample is None:
    sample_df.to_csv('datasets/dataframes_generales/sample_generales.csv', index=False)
else:
    sample_df.to_csv('datasets/dataframes_generales/sample_generales_{}.csv'.format(sample_df.shape[0]), index=False)