In [None]:
import os
import re
import pandas as pd

In [241]:
def select_files(files, name="version 3 drought 2_5", months="_6 months"):
    """Keep only the file names that contain both ``name`` and ``months``.

    Parameters
    ----------
    files : iterable of str
        Candidate file names.
    name : str
        Substring every kept file name must contain.
    months : str
        Second substring every kept file name must contain.

    Returns
    -------
    list of str
        Matching file names, in their original order.
    """
    return [candidate for candidate in files if name in candidate and months in candidate]

def count_significative_vars(s):
    """Count the entries of ``s`` that carry one, two and three asterisks.

    The counts are cumulative: an entry containing ``***`` also contains
    ``**`` and ``*``, so it is included in all three totals.

    Parameters
    ----------
    s : pandas.Series
        Series of strings (missing values are ignored by the sum).

    Returns
    -------
    tuple
        ``(count_p01, count_p005, count_p001)`` -- number of entries that
        contain ``*``, ``**`` and ``***`` respectively.
    """
    def _entries_with(stars):
        # Literal substring match; "*" must not be read as a regex quantifier.
        return s.str.contains(stars, regex=False).sum()

    return _entries_with("*"), _entries_with("**"), _entries_with("***")

def dict_to_df(dict):
    """Turn a ``{specification: (n_p01, n_p005, n_p001)}`` mapping into a table.

    Parameters
    ----------
    dict : dict
        Keys become the ``Specification`` column; each value supplies the
        three entries of the ``p=0.1``, ``p=0.05`` and ``p=0.01`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per key, with columns ``Specification``, ``p=0.1``,
        ``p=0.05`` and ``p=0.01``.
    """
    level_columns = ["p=0.1", "p=0.05", "p=0.01"]
    table = pd.DataFrame.from_dict(dict, orient="index", columns=level_columns)
    # Move the keys out of the index into a regular, named column.
    return table.reset_index(names="Specification")

def process_significance(s):
    """Blank out every entry of ``s`` that has no asterisk.

    Entries containing at least one ``*`` are kept unchanged (still as
    strings); all other entries are replaced with NaN. No numeric
    conversion happens here.

    Parameters
    ----------
    s : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        Same index as ``s``, with non-starred entries set to NaN.
    """
    # Literal match: "*" must not be interpreted as a regex quantifier.
    has_asterisk = s.str.contains("*", regex=False)
    return s.where(has_asterisk)

def make_float(s):
    """Strip everything except digits, ``-`` and ``.`` from the strings in ``s``.

    This removes significance asterisks, whitespace and any other
    decoration. The result still holds strings; the caller is responsible
    for the actual numeric cast.

    Parameters
    ----------
    s : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        Series with the disallowed characters removed from each string.
    """
    non_numeric_chars = r'[^0-9\-.]'
    cleaned = s.replace(non_numeric_chars, "", regex=True)
    return cleaned

In [238]:
import plotly.graph_objects as go

def dot_plot(all_specs, varname):
    """Display a dot plot of one variable's values by country and specification.

    One marker trace is drawn per specification, with the value on the
    x axis and the country on the y axis, plus a vertical reference line
    at x = 0.

    Parameters
    ----------
    all_specs : pandas.DataFrame
        Long-format table. The columns read here are ``variable``,
        ``specification``, ``value`` (must be castable to float) and
        ``Country``.
    varname : str
        Value of the ``variable`` column to plot; also used as the title.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``.
    """
    fig = go.Figure()

    # Each base colour is shared by four consecutive specifications.
    base_colors = [
        'rgba(156, 165, 196, 0.95)',
        'rgba(204, 204, 204, 0.95)',
        'rgba(255, 166, 86, 0.95)',
        'rgba(240, 230, 140, 0.95)',
    ]
    colors = [color for color in base_colors for _ in range(4)]

    data = all_specs[all_specs.variable == varname]
    for i, spec in enumerate(data.specification.unique()):
        data_plot = data[data.specification == spec]
        fig.add_trace(go.Scatter(
            x=data_plot["value"].astype("float"),
            y=data_plot["Country"],
            marker=dict(
                # Cycle through the palette so that more than 16
                # specifications no longer raise an IndexError.
                color=colors[i % len(colors)],
                size=16,
                symbol='circle',
                line_width=1,
            ),
            mode='markers',
            name=spec,
        ))

    fig.update_layout(
        title=varname,
        xaxis=dict(
            showgrid=False,
            showline=True,
            linecolor='rgb(102, 102, 102)',
            tickfont_color='rgb(102, 102, 102)',
            showticklabels=True,
            ticks='outside',
            tickcolor='rgb(102, 102, 102)',
        ),
        margin=dict(l=140, r=40, b=50, t=80),
        legend=dict(
            font_size=10,
            yanchor='top',
            xanchor='center',
        ),
        width=1600,
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white',
        hovermode='closest',
        # Vertical reference line at zero. The y extent is given in "paper"
        # coordinates (0 = bottom, 1 = top of the plot area) so the line
        # always spans the full height, regardless of how many countries
        # the plotted variable actually has on the categorical y axis.
        shapes=[
            dict(
                type="line",
                xref="x",
                yref="paper",
                x0=0,
                y0=0,
                x1=0,
                y1=1,
                line=dict(
                    color="red",
                    width=2,
                    dash="dashdot",
                ),
            )
        ],
    )

    fig.show()

In [252]:
import re
from tqdm import tqdm

# Folder that holds the per-country LaTeX (.tex) result tables.
folder = r"Z:\Laboral\World Bank\Paper - Child mortality and Climate Shocks\Outputs\countries"
files = [f for f in os.listdir(folder) if f.endswith(".tex")]
files = select_files(files, name="version 3 drought 3_0", months="_6 months")
if not files:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No matching .tex tables found in {folder}")

# Rows whose variable name matches this pattern are kept (regex allowed).
keywords = "drought|excessiverain|prec"

spec_frames = []
for file in tqdm(files):

    filename = os.path.join(folder, file)

    # The first run of three capital letters in the file name is used as the country label.
    country_match = re.search(r"([A-Z]{3})", file)
    if country_match is None:
        raise ValueError(f"Cannot find a three-letter country code in file name: {file!r}")
    country = country_match.group(0)

    # Read the table body: cells are separated by '&'; the first 4 lines
    # and the last 3 lines of the file are skipped.
    df = pd.read_csv(filename,
                    sep='&',
                    header=0,
                    skiprows=4,
                    skipfooter=3,
                    engine='python')

    # Process data
    df = df.set_index(df.columns[0]) # Set variable names in index
    df = df[df.index != ' '] # Remove standard errors (rows whose first cell is a single space)
    df = df[df.index.str.contains(keywords)]
    df.index = df.index.str.strip()
    df.index.name = "variable"
    df = df.reset_index()
    # Keep only letters and digits in the column (specification) names.
    df.columns = [re.sub(r'[^0-9a-zA-Z]+', '', col) for col in df.columns]

    # Format dataframe: one row per (variable, specification) pair.
    country_specs = df.melt(id_vars=["variable"], var_name="specification", value_name="value")
    country_specs["value"] = process_significance(country_specs.value)
    country_specs["value"] = make_float(country_specs.value)
    country_specs["Country"] = country
    # NOTE(review): `regex` is left at the pandas default here, and that default
    # differs between pandas versions (regex before 2.0, literal from 2.0 on).
    # Confirm which matching is intended and pass `regex=` explicitly.
    country_specs["variable"] = country_specs["variable"].str.replace(r"\_6\_", " ").str.replace(r"3\_0\_", "3.0 ")
    spec_frames.append(country_specs)

all_specs = pd.concat(spec_frames)

# One dot plot per drought variable.
for suffix in ("q1", "q2", "q3", "30d", "30d3m", "3m6m", "6m12m"):
    dot_plot(all_specs, f"drought 3.0 {suffix}")

100%|██████████| 55/55 [00:01<00:00, 30.56it/s]
