# Construcción del dataset

En la carpeta `assets` se encuentran dos archivos CSV que contienen información de tweets en inglés con su respectiva clasificación de sentimiento. El objetivo de este notebook es construir un dataset que contenga la información de ambos archivos. Se seleccionarán de cada archivo 15000 tweets positivos y 15000 tweets negativos. Se traducirán los tweets al español y se almacenarán en un archivo CSV.

In [36]:
# Install the dependencies listed in ../requirements.txt (one directory above the working directory).
# The %pip magic (rather than !pip) installs into the environment of the running kernel.
%pip install -r '../requirements.txt'

Note: you may need to restart the kernel to use updated packages.


In [37]:
# Importación de librerías
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

#### Dataset 1

In [38]:
# Load dataset 1. Column names are supplied explicitly through `names`,
# so the first line of the file is read as data rather than as a header.
header = ['id', 'entity', 'sentiment', 'text']
dataset1_no_procesado = pd.read_csv(
    './assets/dataset1-no-procesado.csv',
    sep=',',
    encoding='utf-8',
    names=header,
)
# Summary statistics of the numeric columns
print(dataset1_no_procesado.describe())
# Preview of the first rows
dataset1_no_procesado.head()

                 id
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000


Unnamed: 0,id,entity,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [39]:
# Clean dataset 1 in a single pipeline:
#   1. drop rows with a missing value in any column
#   2. keep only the first occurrence of each tweet text
#   3. discard empty-string texts
#   4. keep only the Positive / Negative classes
dataset1_procesado = (
    dataset1_no_procesado
    .dropna()
    .drop_duplicates(subset='text')
    .loc[lambda df: df['text'] != '']
    .loc[lambda df: df['sentiment'].isin(['Positive', 'Negative'])]
)
# Summary statistics of the numeric columns after cleaning
print(dataset1_procesado.describe())

                 id
count  40233.000000
mean    6568.930057
std     3673.562298
min        1.000000
25%     3301.000000
50%     6688.000000
75%     9592.000000
max    13198.000000


In [40]:
# Keep only the tweet text and its label, in that order
dataset1_procesado = dataset1_procesado.loc[:, ['text', 'sentiment']]
dataset1_procesado.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


#### Dataset 2


In [41]:
# Load dataset 2 (column names come from the file's own header row).
# Decode as Latin-1 instead of 'unicode_escape': that codec also decodes from Latin-1
# but additionally interprets backslash sequences as Python string escapes, which
# rewrites literal backslashes inside tweets (e.g. "\n", "\t") and raises on
# malformed escapes. Plain Latin-1 keeps every byte of the text as written.
dataset2_no_procesado = pd.read_csv('./assets/dataset2-no-procesado.csv', delimiter=',', encoding='latin-1')
# Summary statistics of the numeric columns
print(dataset2_no_procesado.describe())
# Preview of the first rows
dataset2_no_procesado.head()

       Population -2020  Land Area (Km²)  Density (P/Km²)
count      2.748100e+04     2.748100e+04     27481.000000
mean       4.018497e+07     6.621730e+05       357.686583
std        1.504946e+08     1.807425e+06      2013.750702
min        8.010000e+02     0.000000e+00         2.000000
25%        1.968001e+06     2.281000e+04        35.000000
50%        8.655535e+06     1.118900e+05        89.000000
75%        2.843594e+07     5.279700e+05       214.000000
max        1.439324e+09     1.637687e+07     26337.000000


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [42]:
# Clean dataset 2 in a single pipeline:
#   1. drop rows with a missing value in any column
#   2. discard empty-string values of 'selected_text'
#   3. keep only the first occurrence of each 'selected_text'
#   4. keep only the positive / negative classes
dataset2_procesado = (
    dataset2_no_procesado
    .dropna()
    .loc[lambda df: df['selected_text'] != '']
    .drop_duplicates(subset='selected_text')
    .loc[lambda df: df['sentiment'].isin(['positive', 'negative'])]
)
# Summary statistics of the numeric columns after cleaning
print(dataset2_procesado.describe())

       Population -2020  Land Area (Km²)  Density (P/Km²)
count      1.134900e+04     1.134900e+04     11349.000000
mean       4.013332e+07     6.596023e+05       345.328047
std        1.523600e+08     1.797945e+06      1935.935979
min        8.010000e+02     0.000000e+00         2.000000
25%        1.968001e+06     2.281000e+04        36.000000
50%        8.655535e+06     1.118900e+05        87.000000
75%        2.769102e+07     5.108900e+05       214.000000
max        1.439324e+09     1.637687e+07     26337.000000


In [43]:
# Keep the selected fragment and its label, renaming the fragment column to 'text'
# so both sources share the same schema. For this source, 'text' therefore holds
# the values of 'selected_text', not the raw file's full 'text' column.
dataset2_procesado = (
    dataset2_procesado[['selected_text', 'sentiment']]
    .rename(columns={'selected_text': 'text'})
)
dataset2_procesado.head()

Unnamed: 0,text,sentiment
1,Sooo SAD,negative
2,bullying me,negative
3,leave me alone,negative
4,"Sons of ****,",negative
6,fun,positive


# Juntar los dos datasets en un solo archivo CSV

In [44]:
# Stack both cleaned sources into one frame.
# ignore_index=True rebuilds a 0..n-1 index: each source keeps its own row labels
# after cleaning, so a plain concat would leave duplicate labels and make
# label-based selection and alignment ambiguous.
# NOTE(review): duplicates were removed per source only; the same text may still
# appear in both sources. Confirm whether a combined drop_duplicates is wanted.
dataset = pd.concat([dataset1_procesado, dataset2_procesado], ignore_index=True)
# Normalise class labels to lower case ('Positive'/'Negative' vs 'positive'/'negative')
dataset['sentiment'] = dataset['sentiment'].str.lower()
# Class balance of the combined dataset
print(dataset['sentiment'].value_counts())

sentiment
negative    27012
positive    24570
Name: count, dtype: int64


In [45]:
# Split the full combined dataset (no down-sampling is applied here) into
# train / test / validation = 80% / 10% / 10%. Both splits are stratified by
# sentiment and use a fixed random_state so the partition is reproducible.
dataset_train, dataset_other = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset['sentiment'])
# Split the 20% hold-out evenly into test and validation
dataset_test, dataset_validation = train_test_split(dataset_other, test_size=0.5, random_state=42, stratify=dataset_other['sentiment'])

# Create the output folder first: to_csv fails if the target directory is missing
output_dir = Path('./dist')
output_dir.mkdir(parents=True, exist_ok=True)
dataset_train.to_csv(output_dir / 'dataset-english-tweets-train.csv', index=False, encoding='utf-8')
dataset_test.to_csv(output_dir / 'dataset-english-tweets-test.csv', index=False, encoding='utf-8')
dataset_validation.to_csv(output_dir / 'dataset-english-tweets-validation.csv', index=False, encoding='utf-8')

In [46]:
# Row count of the full dataset and of each partition
for label, frame in (
    ('Total registros:', dataset),
    ('Train registros:', dataset_train),
    ('Test registros:', dataset_test),
    ('Validation registros:', dataset_validation),
):
    print(label, len(frame))

Total registros: 51582
Train registros: 41265
Test registros: 5158
Validation registros: 5159


In [47]:
# Class distribution within each partition
for label, frame in (
    ('Train:', dataset_train),
    ('Test:', dataset_test),
    ('Validation:', dataset_validation),
):
    print(label, frame['sentiment'].value_counts())

Train: sentiment
negative    21609
positive    19656
Name: count, dtype: int64
Test: sentiment
negative    2701
positive    2457
Name: count, dtype: int64
Validation: sentiment
negative    2702
positive    2457
Name: count, dtype: int64
