# Construcción del dataset

En la carpeta `assets` se encuentran dos archivos CSV que contienen información de tweets en Inglés con su respectiva clasificación de sentimiento. El objetivo de este notebook es construir un dataset que contenga la información de ambos archivos. Se seleccionaran de cada archivo 2000 tweets positivos, 2000 tweets negativos y 2000 neutral. Se traducirán los tweets al español y se almacenarán en un archivo CSV.

In [69]:
# Instalación de librerías
%pip install -r '../requirements.txt'

Collecting scikit-learn==1.5.1 (from -r ../requirements.txt (line 2))
  Downloading scikit_learn-1.5.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn==1.5.1->-r ../requirements.txt (line 2))
  Downloading scipy-1.14.0-cp310-cp310-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn==1.5.1->-r ../requirements.txt (line 2))
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn==1.5.1->-r ../requirements.txt (line 2))
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-macosx_12_0_arm64.whl (11.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m:

In [70]:
# Importación de librerías
import pandas as pd
from sklearn.model_selection import train_test_split

#### Dataset 1

In [71]:
# Lectura dataset 1
header = ['id', 'entity', 'sentiment', 'text']
dataset1_no_procesado = pd.read_csv('./assets/dataset1-no-procesado.csv', delimiter=',', encoding='utf-8', names=header)
# Descrición dataset
print(dataset1_no_procesado.describe())
# Visualización primeros registros
dataset1_no_procesado.head()

                 id
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000


Unnamed: 0,id,entity,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [72]:
# Limpieza dataset 1
dataset1_procesado = dataset1_no_procesado.dropna()
dataset1_procesado = dataset1_procesado[dataset1_procesado['text'] != '']
dataset1_procesado = dataset1_procesado[(dataset1_procesado['sentiment'] == 'Positive') | 
                                        (dataset1_procesado['sentiment'] == 'Negative') | 
                                        (dataset1_procesado['sentiment'] == 'Neutral')]
# Descrición dataset
print(dataset1_procesado.describe())

                 id
count  61121.000000
mean    6539.707776
std     3756.407613
min        1.000000
25%     3265.000000
50%     6755.000000
75%     9696.000000
max    13198.000000


In [73]:
# Selección de columnas
dataset1_procesado = dataset1_procesado[['text', 'sentiment']]
dataset1_procesado.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [74]:
# Selección aleatoria de 2000 para cada clase
dataset1_procesado = dataset1_procesado.groupby('sentiment').apply(lambda x: x.sample(n=2000)).reset_index(drop=True)
# Descripción dataset
print(dataset1_procesado['sentiment'].value_counts())
print(dataset1_procesado['text'].head())

sentiment
Negative    2000
Neutral     2000
Positive    2000
Name: count, dtype: int64
0    you care I’m not buying this crap until the PS...
1    I just witnessed John Stockton do a 2 handed 1...
2    @amazonIN and @rblbank i frustrated due to mis...
3    RhandlerR RhandlerR RhandlerR RhandlerR Rhandl...
4    org RhandlerR I swear if this happens one more...
Name: text, dtype: object


  dataset1_procesado = dataset1_procesado.groupby('sentiment').apply(lambda x: x.sample(n=2000)).reset_index(drop=True)


##### Dataset 2


In [75]:
# Lectura dataset 2
dataset2_no_procesado = pd.read_csv('./assets/dataset2-no-procesado.csv', delimiter=',', encoding='unicode_escape')
# Descrición dataset
print(dataset2_no_procesado.describe())
# Visualización primeros registros
dataset2_no_procesado.head()

       Population -2020  Land Area (Km²)  Density (P/Km²)
count      2.748100e+04     2.748100e+04     27481.000000
mean       4.018497e+07     6.621730e+05       357.686583
std        1.504946e+08     1.807425e+06      2013.750702
min        8.010000e+02     0.000000e+00         2.000000
25%        1.968001e+06     2.281000e+04        35.000000
50%        8.655535e+06     1.118900e+05        89.000000
75%        2.843594e+07     5.279700e+05       214.000000
max        1.439324e+09     1.637687e+07     26337.000000


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [76]:
# Limpieza dateset 2
dataset2_procesado = dataset2_no_procesado.dropna()
dataset2_procesado = dataset2_procesado[dataset2_procesado['selected_text'] != '']
dataset2_procesado = dataset2_procesado[(dataset2_procesado['sentiment'] == 'positive') | 
                                        (dataset2_procesado['sentiment'] == 'negative') | 
                                        (dataset2_procesado['sentiment'] == 'neutral')]
# Descrición dataset
print(dataset1_procesado.describe())

        text sentiment
count   6000      6000
unique  5842         3
top           Negative
freq      18      2000


In [77]:
# Selección de columnas
dataset2_procesado = dataset2_procesado[['selected_text', 'sentiment']]
# Renombrar columnas
dataset2_procesado.columns = ['text', 'sentiment']
dataset2_procesado.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD,negative
2,bullying me,negative
3,leave me alone,negative
4,"Sons of ****,",negative


In [78]:
# Selección aleatoria de 2000 para cada clase
dataset2_procesado = dataset2_procesado.groupby('sentiment').apply(lambda x: x.sample(n=2000)).reset_index(drop=True)
# Descripción dataset
print(dataset2_procesado['sentiment'].value_counts())
print(dataset2_procesado['text'].head())

sentiment
negative    2000
neutral     2000
positive    2000
Name: count, dtype: int64
0                                      spammer
1    Test discovery just missed being in Py3.1
2                                unfortunately
3                                         ****
4                        Not making good time,
Name: text, dtype: object


  dataset2_procesado = dataset2_procesado.groupby('sentiment').apply(lambda x: x.sample(n=2000)).reset_index(drop=True)


# Juntar los dos datasets en un solo archivo CSV

In [79]:
# Unión de datasets
dataset = pd.concat([dataset1_procesado, dataset2_procesado])
# Dejar clases en minúsculas
dataset['sentiment'] = dataset['sentiment'].str.lower()
# Descripción dataset
print(dataset['sentiment'].value_counts())

sentiment
negative    4000
neutral     4000
positive    4000
Name: count, dtype: int64


In [80]:
# Exportación dataset
# 10000 -> train
# 2000 -> test
dataset_train, dataset_test = train_test_split(dataset, test_size=0.2, random_state=42)
dataset_train.to_csv('./dist/dataset-spanish-tweets-train.csv', index=False, encoding='utf-8')
dataset_test.to_csv('./dist/dataset-spanish-tweets-test.csv', index=False, encoding='utf-8')