In [19]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import regex as re
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [78]:

def get_hashtags_text(x):
    """Extract the text of every hashtag from a tweet ``entities`` value.

    Parameters
    ----------
    x : mapping or missing value
        A value exposing a ``'hashtags'`` key whose items each carry a
        ``'text'`` key (``load_json`` applies this to the ``entities`` column),
        or a missing value such as ``None``/``NaN``.

    Returns
    -------
    list or None
        The ``'text'`` of each hashtag, in order. ``None`` when ``x`` is
        missing, has no ``'hashtags'`` key, or the hashtag list is empty.
    """
    # Missing value or no 'hashtags' key at all: nothing to extract.
    if not pd.notna(x) or 'hashtags' not in x:
        return None

    tags = x['hashtags']
    # An empty (falsy) hashtag collection is reported as None, not [].
    if not tags:
        return None

    return [tag['text'] for tag in tags]

def get_urls(x):
    """Tell whether a tweet ``entities`` value contains at least one URL.

    Parameters
    ----------
    x : mapping or missing value
        A value that may expose a ``'urls'`` key (``load_json`` applies this
        to the ``entities`` column), or a missing value such as ``None``/``NaN``.

    Returns
    -------
    bool
        ``True`` only when ``x`` is not missing, has a ``'urls'`` key, and the
        value under that key is truthy (non-empty); ``False`` otherwise.
    """
    has_urls = pd.notna(x) and 'urls' in x and x['urls']
    # Collapse the short-circuit result (which may be a list) into a real bool.
    return True if has_urls else False


def get_user_location(x):
    """Return the ``'location'`` entry of a tweet ``user`` value.

    Parameters
    ----------
    x : mapping or missing value
        A value that may expose a ``'location'`` key (``load_json`` applies
        this to the ``user`` column), or a missing value such as
        ``None``/``NaN``.

    Returns
    -------
    object or None
        Whatever is stored under ``'location'`` (returned as-is, even when it
        is empty or ``None``); ``None`` when ``x`` is missing or has no
        ``'location'`` key.
    """
    if not pd.notna(x) or 'location' not in x:
        return None
    return x['location']


def _has_media(extended_entities):
    """Return True when an ``extended_entities`` value lists at least one media item.

    Anything that is not a container with a non-empty ``'media'`` entry
    (including ``NaN``/``None`` for tweets without extended entities) yields
    ``False``.
    """
    try:
        return 'media' in extended_entities and len(extended_entities['media']) > 0
    except (KeyError, TypeError):
        # NaN/None or a value that cannot be searched or measured: no media.
        return False


def load_json(file_path):
    """Load a JSON-lines tweet file into a flat DataFrame of selected fields.

    Each non-blank line of ``file_path`` is parsed as one JSON document. When
    a document has a non-null ``'retweeted_status'``, that nested document is
    used instead of the outer one.

    Parameters
    ----------
    file_path : str or path-like
        Path of a UTF-8 text file with one JSON document per line. Blank
        lines (for example a trailing empty line) are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``id``, ``lang``, ``text``, ``favorite_count``,
        ``retweet_count``, ``possibly_sensitive``, ``created_at``,
        ``is_quote_status``, ``entities_hashtags_text``, ``entities_urls``,
        ``user_location``, ``reply_count``, ``quote_count``, ``has_image``.
        ``reply_count``/``quote_count`` are 0 when the source has no such
        column. ``has_image`` is True when ``extended_entities`` lists at
        least one media item (any media entry counts, not only images).

    Notes
    -----
    This is best-effort: any exception is reported with ``print`` and the
    DataFrame built so far (possibly empty or missing columns) is returned
    instead of raising. Callers that need a hard failure should check the
    result's columns.
    """
    df_new = pd.DataFrame()

    # Columns copied unchanged from the raw tweet records.
    passthrough_columns = (
        'id',
        'lang',
        'text',
        'favorite_count',
        'retweet_count',
        'possibly_sensitive',
        'created_at',
        'is_quote_status',
    )

    try:
        # Read the file line by line and collect the parsed documents.
        records = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # A blank line is not a JSON document; without this guard a
                    # single empty line would abort the whole load.
                    continue
                data = json.loads(line)
                if 'retweeted_status' in data and data['retweeted_status'] is not None:
                    # For retweets, keep the original tweet rather than the wrapper.
                    data = data['retweeted_status']
                records.append(data)

        df = pd.DataFrame(records)

        for column in passthrough_columns:
            df_new[column] = df[column]

        df_new['entities_hashtags_text'] = df['entities'].apply(get_hashtags_text)
        df_new['entities_urls'] = df['entities'].apply(get_urls)
        df_new['user_location'] = df['user'].apply(get_user_location)

        # Use 0 for 'reply_count' and 'quote_count' when the columns do not exist.
        df_new['reply_count'] = df.get('reply_count', 0)
        df_new['quote_count'] = df.get('quote_count', 0)

        # Flag tweets whose extended entities carry at least one media item.
        if 'extended_entities' in df.columns:
            df_new['has_image'] = [_has_media(value) for value in df['extended_entities']]
        else:
            df_new['has_image'] = [False] * len(df)

    except Exception as e:
        print(f'Se produjo una excepción: {e}')

    # Return the resulting DataFrame (partial if an error was reported above).
    return df_new

def get_magnitud(df, weights=None):
    """Compute a weighted average of the ``text_human`` category proportions.

    The share of rows falling in each ``text_human`` category is computed
    (missing values are ignored by ``value_counts``), categories absent from
    ``df`` count as 0, and the shares are averaged with the per-category
    weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text_human`` column of category labels.
    weights : mapping of str to float, optional
        Weight for each category. Defaults to the built-in table below
        (which sums to 1, so the result is then the plain weighted sum).

    Returns
    -------
    numpy.float64
        The weighted average rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``df['text_human']`` contains a label that has no weight. Weights
        are matched by category name, so an unexpected label cannot silently
        shift the weights onto the wrong categories.
    """
    if weights is None:
        weights = {
            "affected_individuals": 0.2,
            "infrastructure_and_utility_damage": 0.2,
            "injured_or_dead_people": 0.3,
            "missing_or_found_people": 0.05,
            "not_humanitarian": 0,
            "other_relevant_information": 0.05,
            "rescue_volunteering_or_donation_effort": 0.05,
            "vehicle_damage": 0.15,
        }
    category_weights = pd.Series(weights, dtype=float)

    proportions = df["text_human"].value_counts(normalize=True)

    unknown = proportions.index.difference(category_weights.index)
    if len(unknown) > 0:
        raise ValueError(f"Unexpected text_human categories: {list(unknown)}")

    # Align by label (not by position or by the Series name, which differs
    # across pandas versions); categories not present get a share of 0.
    proportions = proportions.reindex(category_weights.index, fill_value=0.0)

    return round(
        np.average(a=proportions.to_numpy(), weights=category_weights.to_numpy()),
        3,
    )

### Magnitud Mexico

In [96]:
# Mexico earthquake: tweet records (JSON lines) and their annotations (TSV).
mexico_json = load_json("RDATA/CrisisMMD_v2.0/json/mexico_earthquake_final_data.json")
mexico_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/mexico_earthquake_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
mexico = (
    mexico_json
    .merge(
        mexico_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(mexico)
print(magnitud)


0.073


### Magnitud Iraq-Iran

In [97]:
# Iraq-Iran earthquake: tweet records (JSON lines) and their annotations (TSV).
iraq_json = load_json("RDATA/CrisisMMD_v2.0/json/iraq_iran_earthquake_final_data.json")
iraq_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/iraq_iran_earthquake_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
iraq = (
    iraq_json
    .merge(
        iraq_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(iraq)
print(magnitud)

0.107


### Magnitud California

In [98]:
# California wildfires: tweet records (JSON lines) and their annotations (TSV).
california_json = load_json("RDATA/CrisisMMD_v2.0/json/california_wildfires_final_data.json")
california_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/california_wildfires_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
california = (
    california_json
    .merge(
        california_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(california)
print(magnitud)

0.078


### Magnitud Harvey

In [99]:
# Hurricane Harvey: tweet records (JSON lines) and their annotations (TSV).
harvey_json = load_json("RDATA/CrisisMMD_v2.0/json/hurricane_harvey_final_data.json")
harvey_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/hurricane_harvey_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
harvey = (
    harvey_json
    .merge(
        harvey_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(harvey)
print(magnitud)

0.06


### Magnitud Irma

In [100]:
# Hurricane Irma: tweet records (JSON lines) and their annotations (TSV).
irma_json = load_json("RDATA/CrisisMMD_v2.0/json/hurricane_irma_final_data.json")
irma_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/hurricane_irma_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
irma = (
    irma_json
    .merge(
        irma_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(irma)
print(magnitud)

0.06


### Magnitud Maria

In [101]:
# Hurricane Maria: tweet records (JSON lines) and their annotations (TSV).
maria_json = load_json("RDATA/CrisisMMD_v2.0/json/hurricane_maria_final_data.json")
maria_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/hurricane_maria_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
maria = (
    maria_json
    .merge(
        maria_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(maria)
print(magnitud)

0.046


### Magnitud Sri Lanka

In [102]:
# Sri Lanka floods: tweet records (JSON lines) and their annotations (TSV).
srilanka_json = load_json("RDATA/CrisisMMD_v2.0/json/srilanka_floods_final_data.json")
srilanka_tsv = pd.read_table("RDATA/CrisisMMD_v2.0/annotations/srilanka_floods_final_data.tsv")

# Attach the text annotations to each tweet, keep only tweets that have a
# 'text_human' label, and collapse to a single row per tweet id.
srilanka = (
    srilanka_json
    .merge(
        srilanka_tsv[['tweet_id', 'text_info', 'text_info_conf', 'text_human', 'text_human_conf']],
        left_on='id',
        right_on='tweet_id',
        how='left',
    )
    .drop(columns='tweet_id')
    .dropna(subset=['text_human'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

magnitud = get_magnitud(srilanka)
print(magnitud)

0.03
