### Preprocesamiento

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import regex as re


import os
def get_hashtags_text(x):
    """Return the ``text`` of every hashtag in a tweet ``entities`` mapping.

    Returns ``None`` when *x* is missing (NaN/None), has no ``'hashtags'``
    key, or its hashtag list is empty.
    """
    if pd.isna(x) or 'hashtags' not in x:
        return None

    entries = x['hashtags']
    if not entries:
        return None

    return [entry['text'] for entry in entries]

def get_urls(x):
    """Tell whether a tweet ``entities`` mapping carries at least one URL.

    Missing values (NaN/None), a missing ``'urls'`` key and an empty URL
    list all yield ``False``.
    """
    if pd.isna(x):
        return False
    return bool('urls' in x and x['urls'])


def get_user_location(x):
    """Return the ``'location'`` entry of a tweet ``user`` mapping.

    Yields ``None`` when *x* is missing (NaN/None) or has no ``'location'``
    key; otherwise the stored value is returned unchanged.
    """
    has_location = pd.notna(x) and 'location' in x
    return x['location'] if has_location else None


def _has_media(extended_entities):
    """Return True when an ``extended_entities`` value lists at least one media item."""
    try:
        return 'media' in extended_entities and len(extended_entities['media']) > 0
    except (KeyError, TypeError):
        # Missing values (NaN/None) or unexpected shapes count as "no media".
        return False


def load_json(file_path):
    """Load a JSON-lines file of tweets into a flat DataFrame.

    Each non-blank line of *file_path* is parsed as one JSON object. When an
    object carries a non-null ``'retweeted_status'``, that nested object is
    used in its place.

    Args:
        file_path: Path to a UTF-8 file with one JSON object per line.

    Returns:
        A DataFrame with the columns ``id``, ``lang``, ``text``,
        ``favorite_count``, ``retweet_count``, ``possibly_sensitive``,
        ``created_at``, ``is_quote_status``, ``entities_hashtags_text``,
        ``entities_urls``, ``user_location``, ``reply_count``,
        ``quote_count`` and ``has_image``. If anything fails, the error is
        printed and whatever columns were built so far are returned
        (possibly an empty DataFrame); no exception is propagated.
    """
    df_new = pd.DataFrame()

    try:
        # Read the file line by line and collect the parsed records.
        datos_json = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # A blank (e.g. trailing) line must not discard the whole file.
                if not line.strip():
                    continue
                data = json.loads(line)
                if 'retweeted_status' in data and data['retweeted_status'] is not None:
                    data = data['retweeted_status']
                datos_json.append(data)

        df = pd.DataFrame(datos_json)

        # Columns copied verbatim, in output order.
        for column in ('id', 'lang', 'text', 'favorite_count', 'retweet_count',
                       'possibly_sensitive', 'created_at', 'is_quote_status'):
            df_new[column] = df[column]

        # Columns derived from nested mappings.
        df_new['entities_hashtags_text'] = df['entities'].apply(get_hashtags_text)
        df_new['entities_urls'] = df['entities'].apply(get_urls)
        df_new['user_location'] = df['user'].apply(get_user_location)

        # Fall back to 0 when 'reply_count' / 'quote_count' are absent.
        df_new['reply_count'] = df.get('reply_count', 0)
        df_new['quote_count'] = df.get('quote_count', 0)

        # Flag rows whose 'extended_entities' lists at least one media item.
        if 'extended_entities' in df.columns:
            df_new['has_image'] = [_has_media(value) for value in df['extended_entities']]
        else:
            df_new['has_image'] = [False] * len(df)

    except Exception as e:
        # Best effort: report the problem and return what was built so far.
        print(f'Se produjo una excepción: {e}')

    return df_new


def load_jsons_from_folder(folder_path):
    """Load every ``*.json`` file in a folder into one DataFrame.

    Files are processed in sorted name order so that the row order (and the
    first-seen order of ``json_name`` values) does not depend on the
    filesystem's directory listing order.

    Args:
        folder_path: Directory containing the JSON-lines files.

    Returns:
        The concatenation of ``load_json`` results with an extra
        ``json_name`` column holding the file name without the
        ``'_final_data.json'`` suffix. An empty DataFrame is returned when
        no file could be loaded. Errors are printed, not raised; frames
        loaded before the error are still returned.
    """
    frames = []

    try:
        # Sorted for a deterministic processing order.
        json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

        for file_name in json_files:
            df = load_json(os.path.join(folder_path, file_name))

            # Tag each row with its source file, minus the "_final_data.json" suffix.
            df['json_name'] = file_name.replace('_final_data.json', '')
            frames.append(df)

    except Exception as e:
        print(f'Se produjo una excepción: {e}')

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def read_tsv_folder(folder_path):
    """Read every ``*.tsv`` file in a folder into one combined DataFrame.

    Files are read in sorted name order so the resulting row order is
    deterministic, and they are concatenated in a single step.

    Args:
        folder_path: Directory containing tab-separated files.

    Returns:
        The row-wise concatenation of all TSV files with a fresh integer
        index, or an empty DataFrame when the folder holds no ``.tsv`` file.
    """
    tsv_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".tsv"))

    frames = [
        pd.read_csv(os.path.join(folder_path, name), sep="\t")
        for name in tsv_names
    ]

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Load the tweets (JSON) and the annotation tables (TSV).
df_json = load_jsons_from_folder("RDATA/CrisisMMD_v2.0/json")
df_tsv = read_tsv_folder("RDATA/CrisisMMD_v2.0/annotations")


# Left-join the text annotations onto the tweets by tweet id; tweets without
# an annotation row keep NaN in the annotation columns.
df = pd.merge(df_json, df_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
                    left_on='id', right_on='tweet_id', how='left')
df = df.drop(columns='tweet_id')  # redundant with 'id' after the merge

# df_con_nan keeps every merged row; df keeps only rows with a 'text_human' value.
df_con_nan = df.copy()
# Copy the filtered frame so later column assignments on df do not write into
# a slice of the merged frame (SettingWithCopyWarning).
df = df[df['text_human'].notna()].copy()

import pandas as pd
from datetime import datetime as dt
# Parse the 'created_at' strings (e.g. "Tue Oct 10 07:14:22 +0000 2017") into
# timezone-aware timestamps stored in a new 'fecha' column on both frames.
df_con_nan["fecha"] = pd.to_datetime(df_con_nan['created_at'], format='%a %b %d %H:%M:%S %z %Y')
# assign() builds a new frame, so this is safe even if df is a filtered slice.
df = df.assign(fecha=pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y'))

# Keep only tweets dated 2017 or later. Copy the result so that columns added
# to df_con_nan afterwards are not written into a slice.
df_con_nan = df_con_nan[df_con_nan['fecha'] >= '2017'].copy()


import pytz
from datetime import datetime

def foo(d):
    """Convert a date such as ``"Oct 10, 2017"`` to ISO form ``"2017-10-10"``.

    *d* must match the ``"%b %d, %Y"`` pattern; ``datetime.strptime`` raises
    ``ValueError`` otherwise.
    """
    return datetime.strptime(d, "%b %d, %Y").strftime("%Y-%m-%d")

def convertir_fecha(fecha):
    """Reformat a ``"%a %b %d %H:%M:%S %z %Y"`` timestamp as ``"YYYY-MM-DD HH:MM:SS"``.

    The UTC offset is parsed but not written to the output, and no timezone
    conversion is applied.
    """
    parsed = datetime.strptime(fecha, "%a %b %d %H:%M:%S %z %Y")
    return f"{parsed:%Y-%m-%d %H:%M:%S}"

#print(foo("Oct 10, 2017"))
l = ["Oct 10, 2017", "Aug 26, 2017", "Sep 6, 2017", "Sep 20, 2017", "Nov 13, 2017", "Sep 20, 2017"]
l1 = ["10-10-2017", "26-08-2017", "06-09-2017", "20-09-2017", "13-11-2017", "20-09-2017", "20-05-2017"]

fechas = list(map(foo, l))

# Reference date (day-month-year) for each disaster, keyed by json_name so the
# result does not depend on the order in which the names happen to appear in
# the DataFrame.
# NOTE(review): the pairing below assumes l1 was written in alphabetical
# json_name order (the order a sorted directory listing yields) — confirm.
fecha_referencia = dict(zip(
    ["california_wildfires", "hurricane_harvey", "hurricane_irma", "hurricane_maria",
     "iraq_iran_earthquake", "mexico_earthquake", "srilanka_floods"],
    l1,
))

# Fail loudly instead of silently attaching the wrong date to a disaster.
sin_fecha = set(df_con_nan['json_name'].unique()) - set(fecha_referencia)
if sin_fecha:
    raise ValueError(f'Desastres sin fecha de referencia: {sorted(sin_fecha)}')

creado = pd.to_datetime(df_con_nan["created_at"], format="%a %b %d %H:%M:%S %z %Y")
inicio = pd.to_datetime(df_con_nan["json_name"].map(fecha_referencia), format="%d-%m-%Y", utc=True)

# "tdd": whole days between the disaster's reference date (midnight UTC) and
# the tweet's creation time; negative for tweets posted before that date.
df_con_nan = df_con_nan.assign(tdd=(creado - inicio).dt.days)

# Drop tweets posted 5 or more days before the reference date.
df_con_nan = df_con_nan[df_con_nan["tdd"] > -5].copy()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Show (rows, columns) of df_con_nan and then of df, one per line.
print(df_con_nan.shape, df.shape, sep="\n")



# Per-disaster impact figures, keyed by the same names used in 'json_name'.
# NOTE(review): "srilanka_floods" holds 203 in every table below, which looks
# like a placeholder copied from the death count — verify.

# Deaths.
muertes = {
    "mexico_earthquake": 369,
    "hurricane_harvey": 107,  # 39 missing
    "hurricane_irma": 134,  # 77 missing
    "iraq_iran_earthquake": 630,
    "hurricane_maria": 3059,  # 35 missing, not included
    "srilanka_floods": 203,
    "california_wildfires": 44,  # 24 missing
}

# Injured.
# NOTE(review): the harvey/irma/maria values (39, 77, 35) equal the missing
# counts noted in `muertes` — confirm these are really injury figures.
heridos = {
    "mexico_earthquake": 7631,
    "hurricane_harvey": 39,
    "hurricane_irma": 77,
    "iraq_iran_earthquake": 9388,
    "hurricane_maria": 35,
    "srilanka_floods": 203,
    "california_wildfires": 46,
}

# Homes.
viviendas = {
    "mexico_earthquake": 150000,
    "hurricane_harvey": 246900,  # across several US states
    "hurricane_irma": 423554,  # summed across Cuba, PR and Florida
    "iraq_iran_earthquake": 70000,
    "hurricane_maria": 77643,  # estimated between this value and 200,000
    "srilanka_floods": 203,
    "california_wildfires": 18000,
}

# Economic loss, in millions of dollars.
perdida_economica = {
    "mexico_earthquake": 8000,
    "hurricane_harvey": 125000,
    "hurricane_irma": 64760,
    "iraq_iran_earthquake": 6726,
    "hurricane_maria": 90000,
    "srilanka_floods": 203,
    "california_wildfires": 18000,
}

# People affected.
afectados = {
    "mexico_earthquake": 12000000,
    "hurricane_harvey": 13300000,
    "hurricane_irma": 16800000,
    "iraq_iran_earthquake": 1800000,
    "hurricane_maria": 3300000,
    "srilanka_floods": 203,
    "california_wildfires": 400000,  # not known exactly
}

# TODO: compute a magnitude from the figures above.
magnitud = {}

(17544, 21)
(15477, 20)
