In [None]:
# NOTE(review): `pd` is only imported in a later cell, so on a fresh kernel this
# line raises NameError. The output recorded for this cell is a Drive-mount
# message, not a DataFrame, so this code probably does not match what was
# actually executed — confirm and replace or remove.
pd.read_csv('tweet.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Tweet Preprocessing

## Librerías & Data Load

In [None]:
import pandas as pd
import numpy as np
import re
from html import unescape

# Load the raw disaster-tweet dataset (UTF-8 CSV) from the mounted Drive folder.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/tfm_DataScience/dataset/disaster_dataset.csv',
    encoding='utf-8',
)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74346 entries, 0 to 74345
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   InformationType        74346 non-null  object
 1   event                  74346 non-null  object
 2   TweetID                73613 non-null  object
 3   TweetText              74346 non-null  object
 4   location               74338 non-null  object
 5   year                   74346 non-null  object
 6   Informativeness_label  74346 non-null  object
dtypes: object(7)
memory usage: 4.0+ MB


## Análisis exploratorio (EDA)

### InformationType

In [None]:
# Non-ASCII tweets: InformationType counts among tweets containing at least one non-ASCII character
df.loc[~df['TweetText'].map(str.isascii), 'InformationType'].value_counts()

Unnamed: 0_level_0,count
InformationType,Unnamed: 1_level_1
not related or not informative,5612
other useful information,4332
donations and volunteering,2462
affected individuals,1676
sympathy and support,1247
infrastructure and utilities damage,994
caution and advice,655


In [None]:
# ASCII-only tweets: InformationType counts among tweets made up exclusively of ASCII characters
df.loc[df['TweetText'].map(str.isascii), 'InformationType'].value_counts()

Unnamed: 0_level_0,count
InformationType,Unnamed: 1_level_1
not related or not informative,20173
other useful information,14545
donations and volunteering,6463
affected individuals,6333
sympathy and support,3773
infrastructure and utilities damage,3565
caution and advice,2516


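La categoría *not related or not informative* es la más numerosa en ambos grupos (≈33 % de los tweets no ASCII y ≈35 % de los exclusivamente ASCII). Las categorías con contenido relevante tienen más tweets en términos absolutos en el grupo ASCII, que es unas 3,4 veces mayor, pero su peso relativo es similar en ambos grupos.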

### Informativeness_label

In [None]:
# Non-ASCII tweets: Informativeness_label counts among tweets containing at least one non-ASCII character
df.loc[~df['TweetText'].map(str.isascii), 'Informativeness_label'].value_counts()

Unnamed: 0_level_0,count
Informativeness_label,Unnamed: 1_level_1
informative,10119
not related or not informative,6859


In [None]:
# ASCII-only tweets: Informativeness_label counts among tweets made up exclusively of ASCII characters
df.loc[df['TweetText'].map(str.isascii), 'Informativeness_label'].value_counts()

Unnamed: 0_level_0,count
Informativeness_label,Unnamed: 1_level_1
informative,33422
not related or not informative,23946


Aunque la diferencia es pequeña, los tweets no ASCII presentan una proporción ligeramente mayor de mensajes informativos (≈59,6 % frente a ≈58,3 % en los tweets exclusivamente ASCII).

### Event

In [None]:
# Non-ASCII tweets: event counts among tweets containing at least one non-ASCII character
df.loc[~df['TweetText'].map(str.isascii), 'event'].value_counts()

Unnamed: 0_level_0,count
event,Unnamed: 1_level_1
hurricane,7139
earthquake,4741
floods,1812
landslides,939
traffic crash,646
wildfires,646
terrorism,349
building collapse,277
haze,175
explosion,121


In [None]:
# ASCII-only tweets: event counts among tweets made up exclusively of ASCII characters
df.loc[df['TweetText'].map(str.isascii), 'event'].value_counts()

Unnamed: 0_level_0,count
event,Unnamed: 1_level_1
hurricane,23721
earthquake,15799
floods,6870
wildfires,2974
traffic crash,1739
landslides,1659
terrorism,1628
meteor,828
explosion,786
building collapse,668


**Hurricane** y **earthquake** dominan en ambos grupos (ASCII y no ASCII), aunque los tweets exclusivamente ASCII presentan un mayor volumen absoluto.

## Preprocesado

In [None]:
def preprocess_tweets(texts: pd.Series) -> pd.Series:
    """Return a cleaned, lower-cased version of each raw tweet.

    Steps, in order:
      1. Drop URLs, retweet markers ("RT @user"), mentions and hashtags.
      2. Drop every non-ASCII character.
      3. Drop underscores (each together with the character right after it)
         and collapse runs of spaces.
      4. Replace the HTML entities ``&amp;`` / ``&lt;`` / ``&gt;``.
      5. Put a space between word characters and adjacent punctuation.
      6. Collapse tokens made of one chunk repeated three or more times
         (e.g. "hahaha" -> "ha") and drop tokens that have a digit after at
         least two leading word characters (e.g. "covid19", "2015").
      7. Lower-case, strip and collapse runs of spaces again.

    Args:
        texts: Series of tweet strings; every element is assumed to be a str.

    Returns:
        A new Series with the same index. ``texts`` itself is not modified.
    """
    cleaned = texts

    # Hashtags must be removed here, while '#' is still glued to its tag: the
    # punctuation-separation step below rewrites "#tag" as "# tag", after which
    # r'#\S+' can no longer match it. Doing it before the non-ASCII strip also
    # removes hashtags written in non-ASCII scripts as a whole.
    for pattern in (r'http\S+', r'(RT|rt)[ ]*@[\S]+', r'@\S+', r'#\S+'):
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    # Drop every non-ASCII character.
    cleaned = cleaned.map(
        lambda text: text.encode('ascii', errors='ignore').decode('ascii')
    )

    ordered_substitutions = [
        # Underscores (plus the following non-space character) and repeated spaces
        (r'_[\S]?', ''),
        (r'[ ]{2,}', ' '),
        # HTML entities; must run before punctuation is split off from words
        (r'&amp;', 'and'),
        (r'&lt;', '<'),
        (r'&gt;', '>'),
        # Separate words from adjacent punctuation, in both directions
        (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),
        (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
        # Tokens that are one chunk repeated 3+ times keep a single chunk
        (r'\b(\w+)(\1{2,})\b', r'\1'),
        # Tokens with a digit after at least two leading word characters
        (r'\b\w{2,}\d+\w*\b', ''),
    ]
    for pattern, replacement in ordered_substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # Lower-case, trim the ends, then normalise the spaces left by the removals.
    cleaned = cleaned.str.lower().str.strip()
    return cleaned.str.replace(r'[ ]{2,}', ' ', regex=True)


# Cleaned text column; the raw `TweetText` column is left untouched.
df['ProcessedText'] = preprocess_tweets(df['TweetText'])

# Number of whitespace-separated tokens in each cleaned tweet.
df['ProcessedText_length'] = df['ProcessedText'].str.split().str.len()

# Distribution of token counts.
length_counts = df['ProcessedText_length'].value_counts()
print(length_counts)


ProcessedText_length
13    4087
14    4048
12    3978
15    3954
16    3798
17    3742
11    3704
18    3698
20    3584
19    3569
10    3428
21    3395
22    3246
9     3111
23    2894
8     2738
24    2349
7     2181
25    2119
26    1701
6     1648
27    1386
5     1262
28    1072
4      921
29     782
30     600
31     417
32     292
33     217
34     128
35      97
36      62
3       40
37      34
38      16
39      13
42      11
40       6
2        4
44       3
41       3
47       3
48       2
43       2
45       1
Name: count, dtype: int64


In [None]:
# Preview the first ten cleaned tweets.
print(df['ProcessedText'].iloc[:10])

0                happy saint patric ' s daaaaaaaaaaaay
2    " 8 quote and rt this for free follows a # kca...
3    as organizations and first responders take adv...
4    ps 63 : 3 - 4 ur love lord s better than life ...
5    i miss u so much after busy trip in us now ur ...
6    its a humble request plz tel your fan # prayfo...
7    hope my dad is going to be ok because the typh...
8    condolences go out 2 u guys . we all r wid u g...
9    i wnt 2 knw abt my frnd , mahek agarwal . she ...
Name: ProcessedText, dtype: object


## Data Save

In [None]:
# Write the frame, including the processed columns, to CSV without the index column.
ruta_nuevo_dataset = '/content/disaster_preprocessed.csv'
df.to_csv(path_or_buf=ruta_nuevo_dataset, index=False)