In [1]:
#import libraries pandas and regex
import pandas as pd
import re


In [2]:
#read csv file
df=pd.read_csv("cleaned_Spanish_tweets_rumor_ivermectin.csv")

In [3]:
# column is messy; goal: clean it up and find the average of all four coordinates to get to a point coordinate
df.coordinates

0      [[[-70.6243463, -33.5101485], [-70.5780912, -3...
1      [[[-98.289206, 18.83765], [-98.019327, 18.8376...
2      [[[-98.289206, 18.83765], [-98.019327, 18.8376...
3      [[[-75.729870997496, 4.96440200361095], [-75.3...
4      [[[-66.8972931, 10.4268198], [-66.8206344, 10....
                             ...                        
869    [[[-68.0339966, 10.1125402], [-67.9569168, 10....
870    [[[-68.0339966, 10.1125402], [-67.9569168, 10....
871    [[[-62.33948, -38.789397], [-62.17072, -38.789...
872    [[[-84.3100508, 33.776522], [-84.262886, 33.77...
873    [[[-99.1919955, 19.357102], [-99.1309649, 19.3...
Name: coordinates, Length: 874, dtype: object

In [4]:
#extract first element of the coordinate string
df["coord1"]=df.coordinates.str.extract(r'(-\d+.\d+, -?\d+.\d+)')

In [5]:
# second can be extract like this
# df["coord2"]=df.coordinates.str.extract(r'( \[-\d+.\d+, -?\d+.\d+)').rename(columns={0:"coord2"})

In [6]:
# but to find the third element, we would extract 2nd and third together first and then remove one by one. Otherwise,
# it wouldn't be easy to extract the third element because its structure matches two other columns.
df['coord_2plus_3']=df.coordinates.str.extract(r'(-\d+.\d+, -?\d+.\d+\], \[-\d+.\d+, -?\d+.\d+\])')

In [7]:
# create a column with second element
df['coord2']=df.coord_2plus_3.str.extract(r'(-\d+.\d+, -?\d+.\d+\])')

In [8]:
# do the same for the third
df["coord3"]=df.coord_2plus_3.str.extract(r'(\[-\d+.\d+, -?\d+.\d+\])')

In [9]:
# because of the unique structure of the fourth, we would extract it like this.
df['coord4']=df.coordinates.str.extract(r'(-\d+.\d+, -?\d+.\d+\]])')
# .rename(columns={0:"coord2"})

In [10]:
#delete coord_2plus_3 column because it was a dummy column so that we can extract element third in the string.
del df["coord_2plus_3"]

In [11]:
#before we start splitting the columns let us remove characters
df['coord4']=df.coord4.str.replace("]]", "")
df['coord3']=df.coord3.str.replace('[', '').str.replace(']', '')
df['coord2']=df.coord2.str.replace(']', '')

In [12]:
# coordinates mean that we have lat and lang, but we need separate average that means we have to divide the columns into two
df['coord_1y']=df.coord1.str.extract(r'(-\d+.\d+)').astype(float)
df['coord_2y']=df.coord2.str.extract(r'(-\d+.\d+)').astype(float)
df['coord_3y']=df.coord3.str.extract(r'(-\d+.\d+)').astype(float)
df['coord_4y']=df.coord4.str.extract(r'(-\d+.\d+)').astype(float)

In [13]:
#extract second coordinate and create a column for each
df["coord_1x"]=df.coord1.str.extract(r'(, -?\d+.\d+)')
df["coord_1x"]= df.coord_1x.str.replace(", ", "").astype(float)

df["coord_2x"]=df.coord2.str.extract(r'(, -?\d+.\d+)')
df["coord_2x"]= df.coord_2x.str.replace(", ", "").astype(float)

df["coord_3x"]=df.coord3.str.extract(r'(, -?\d+.\d+)')
df["coord_3x"]= df.coord_3x.str.replace(", ", "").astype(float)

df["coord_4x"]=df.coord4.str.extract(r'(, -?\d+.\d+)')
df["coord_4x"]= df.coord_4x.str.replace(", ", "").astype(float)

In [14]:
#check datatypes before we run averages
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874 entries, 0 to 873
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   time          874 non-null    object 
 1   tweet_id      874 non-null    int64  
 2   tweet_text    874 non-null    object 
 3   user_id       874 non-null    int64  
 4   location      736 non-null    object 
 5   followers     874 non-null    int64  
 6   retweets      874 non-null    int64  
 7   country_code  873 non-null    object 
 8   coordinates   873 non-null    object 
 9   coord1        873 non-null    object 
 10  coord2        873 non-null    object 
 11  coord3        873 non-null    object 
 12  coord4        873 non-null    object 
 13  coord_1y      873 non-null    float64
 14  coord_2y      873 non-null    float64
 15  coord_3y      873 non-null    float64
 16  coord_4y      873 non-null    float64
 17  coord_1x      873 non-null    float64
 18  coord_2x      873 non-null    

In [15]:
#average y coordinates and x coordinates
# check whether averages are close enough to original values, if yes then we are good.
df["avg_y_coordinate"] = (df.coord_1y+df.coord_2y+df.coord_3y+df.coord_4y)/4
df["avg_x_coordinate"] = (df.coord_1x+df.coord_2x+df.coord_3x+df.coord_4x)/4

In [16]:
df.head()

Unnamed: 0,time,tweet_id,tweet_text,user_id,location,followers,retweets,country_code,coordinates,coord1,...,coord_1y,coord_2y,coord_3y,coord_4y,coord_1x,coord_2x,coord_3x,coord_4x,avg_y_coordinate,avg_x_coordinate
0,Fri Apr 03 23:45:00 +0000 2020,1246222199633203200,ivermectina aún en estudio httpstcojkwkleuzy6,888938190,"Santiago, Metropolitana de Santiago",134,0,CL,"[[[-70.6243463, -33.5101485], [-70.5780912, -3...","-70.6243463, -33.5101485",...,-70.624346,-70.624346,-70.578091,-70.624346,-33.510148,-33.510148,-33.510148,-33.469745,-70.612783,-33.500048
1,Sat Apr 04 01:02:17 +0000 2020,1246241648893464600,gisellemontes18 dearsensseman busca esto iverm...,1243701399797616600,"Puebla, México",0,0,MX,"[[[-98.289206, 18.83765], [-98.019327, 18.8376...","-98.289206, 18.83765",...,-98.289206,-98.289206,-98.019327,-98.289206,18.83765,18.83765,18.83765,19.226809,-98.221736,18.93494
2,Sat Apr 04 01:00:12 +0000 2020,1246241124458602500,debryanshow itsgrecia busquen ivermectin según...,1243701399797616600,"Puebla, México",0,0,MX,"[[[-98.289206, 18.83765], [-98.019327, 18.8376...","-98.289206, 18.83765",...,-98.289206,-98.289206,-98.019327,-98.289206,18.83765,18.83765,18.83765,19.226809,-98.221736,18.93494
3,Sat Apr 04 04:07:49 +0000 2020,1246288336530157600,ivermectina el fármaco que acaba con el corona...,246853720,Bogota - Manizales,736,0,CO,"[[[-75.729870997496, 4.96440200361095], [-75.3...","-75.729870997496, 4.96440200361095",...,-75.729871,-75.729871,-75.390699,-75.729871,4.964402,4.964402,4.964402,5.149466,-75.645078,5.010668
4,Sat Apr 04 11:47:09 +0000 2020,1246403934559907800,🔴atención descubren un fármaco que podría aca...,161171006,+584143227648 Venezuela.,8457,38,VE,"[[[-66.8972931, 10.4268198], [-66.8206344, 10....","-66.8972931, 10.4268198",...,-66.897293,-66.897293,-66.820634,-66.897293,10.42682,10.42682,10.42682,10.489041,-66.878128,10.442375


In [17]:
df.keys()

Index(['time', 'tweet_id', 'tweet_text', 'user_id', 'location', 'followers',
       'retweets', 'country_code', 'coordinates', 'coord1', 'coord2', 'coord3',
       'coord4', 'coord_1y', 'coord_2y', 'coord_3y', 'coord_4y', 'coord_1x',
       'coord_2x', 'coord_3x', 'coord_4x', 'avg_y_coordinate',
       'avg_x_coordinate'],
      dtype='object')

In [18]:
# remove some columns that we don't need for visualization
cleaned_df=df[['time', 'tweet_id', 'tweet_text', 'user_id', 'location', 'followers',
       'retweets', 'country_code', 'avg_y_coordinate',
       'avg_x_coordinate']]

In [19]:
cleaned_df.head()

Unnamed: 0,time,tweet_id,tweet_text,user_id,location,followers,retweets,country_code,avg_y_coordinate,avg_x_coordinate
0,Fri Apr 03 23:45:00 +0000 2020,1246222199633203200,ivermectina aún en estudio httpstcojkwkleuzy6,888938190,"Santiago, Metropolitana de Santiago",134,0,CL,-70.612783,-33.500048
1,Sat Apr 04 01:02:17 +0000 2020,1246241648893464600,gisellemontes18 dearsensseman busca esto iverm...,1243701399797616600,"Puebla, México",0,0,MX,-98.221736,18.93494
2,Sat Apr 04 01:00:12 +0000 2020,1246241124458602500,debryanshow itsgrecia busquen ivermectin según...,1243701399797616600,"Puebla, México",0,0,MX,-98.221736,18.93494
3,Sat Apr 04 04:07:49 +0000 2020,1246288336530157600,ivermectina el fármaco que acaba con el corona...,246853720,Bogota - Manizales,736,0,CO,-75.645078,5.010668
4,Sat Apr 04 11:47:09 +0000 2020,1246403934559907800,🔴atención descubren un fármaco que podría aca...,161171006,+584143227648 Venezuela.,8457,38,VE,-66.878128,10.442375


In [20]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874 entries, 0 to 873
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              874 non-null    object 
 1   tweet_id          874 non-null    int64  
 2   tweet_text        874 non-null    object 
 3   user_id           874 non-null    int64  
 4   location          736 non-null    object 
 5   followers         874 non-null    int64  
 6   retweets          874 non-null    int64  
 7   country_code      873 non-null    object 
 8   avg_y_coordinate  873 non-null    float64
 9   avg_x_coordinate  873 non-null    float64
dtypes: float64(2), int64(4), object(4)
memory usage: 68.4+ KB
