In [142]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pycountry
from countryinfo import CountryInfo as CInfo # Se conecta con la informacion de cada país
import matplotlib.pyplot as plt
import  altair  as  alt # Graficos de promedios
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [143]:
import warnings
# Silences every warning category for the rest of the session, pandas warnings included.
# NOTE(review): consider narrowing this filter so genuine data problems stay visible.
warnings.filterwarnings("ignore")

In [144]:
# Load languages.csv from the github/innovationgraph repository.
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
# keep_default_na=False stops pandas from converting its default NA markers (e.g. the text "NA") into NaN.
# NOTE(review): presumably required so that an ISO2 code spelled "NA" survives loading — confirm.
data = pd.read_csv(url, delimiter=',', keep_default_na=False)
data

Unnamed: 0,num_pushers,language,language_type,iso2_code,year,quarter
0,2066,HTML,markup,AE,2020,1
1,1627,CSS,markup,AE,2020,1
2,288,Jupyter Notebook,markup,AE,2020,1
3,108,Vue,markup,AE,2020,1
4,1734,JavaScript,programming,AE,2020,1
...,...,...,...,...,...,...
112203,223,TypeScript,programming,ZW,2024,1
112204,186,PHP,programming,ZW,2024,1
112205,179,Dockerfile,programming,ZW,2024,1
112206,136,Java,programming,ZW,2024,1


In [145]:
# Drop the "EU" entries and "XK" (Kosovo) in a single pass
excluded_codes = ["EU", "XK"]
data = data[~data.iso2_code.isin(excluded_codes)]

In [146]:
# The CSV is read with keep_default_na=False, so blank fields arrive as empty
# strings rather than NaN; treat both as missing, otherwise this check can never fire.
missing_mask = data.isna() | data.eq("")
nan_rows_count = missing_mask.any(axis=1).sum()
print(f"There are {nan_rows_count} rows with NaN values in the dataset.")

There are 0 rows with NaN values in the dataset.


In [147]:
# The ten programming languages kept for the analysis (both names refer to the same list)
programming_languages = [
    "Python",
    "Java",
    "C++",
    "C",
    "JavaScript",
    "C#",
    "Ruby",
    "Go",
    "PHP",
    "TypeScript",
]
top_program_lang = programming_languages

In [148]:
# Keep only the selected languages and renumber the rows from zero
is_selected_language = data['language'].isin(top_program_lang)
data2 = data.loc[is_selected_language].reset_index(drop=True)
data2

Unnamed: 0,num_pushers,language,language_type,iso2_code,year,quarter
0,1734,JavaScript,programming,AE,2020,1
1,816,Python,programming,AE,2020,1
2,685,Java,programming,AE,2020,1
3,377,Ruby,programming,AE,2020,1
4,354,PHP,programming,AE,2020,1
...,...,...,...,...,...,...
18692,412,Python,programming,ZW,2024,1
18693,223,TypeScript,programming,ZW,2024,1
18694,186,PHP,programming,ZW,2024,1
18695,136,Java,programming,ZW,2024,1


In [149]:
# Add a "YYYY-Qn" period label, then reset the index
data2 = data2.assign(
    year_quarter=data2['year'].astype(str) + '-Q' + data2['quarter'].astype(str)
).reset_index(drop=True)
data2

Unnamed: 0,num_pushers,language,language_type,iso2_code,year,quarter,year_quarter
0,1734,JavaScript,programming,AE,2020,1,2020-Q1
1,816,Python,programming,AE,2020,1,2020-Q1
2,685,Java,programming,AE,2020,1,2020-Q1
3,377,Ruby,programming,AE,2020,1,2020-Q1
4,354,PHP,programming,AE,2020,1,2020-Q1
...,...,...,...,...,...,...,...
18692,412,Python,programming,ZW,2024,1,2024-Q1
18693,223,TypeScript,programming,ZW,2024,1,2024-Q1
18694,186,PHP,programming,ZW,2024,1,2024-Q1
18695,136,Java,programming,ZW,2024,1,2024-Q1


In [150]:
# Country-language key, e.g. "AE-JavaScript"
data2['unique_id'] = data2['iso2_code'].str.cat(data2['language'], sep='-')

In [151]:
# Distinct countries, languages and periods observed in the filtered data
iso2_code = pd.DataFrame({'iso2_code': data2['iso2_code'].unique()})
language = pd.DataFrame({'language': data2['language'].unique()})
year_quarter = pd.DataFrame({'year_quarter': data2['year_quarter'].unique()})

# Full country x language x period grid (Cartesian product)
balanced_panel = iso2_code.merge(language, how='cross').merge(year_quarter, how='cross')
balanced_panel["unique_id"] = balanced_panel["iso2_code"] + "-" + balanced_panel["language"]

# Left-join the observed data onto the grid in a single merge. Columns that
# exist on both sides (iso2_code, language) keep the grid's version; the
# right-hand duplicates get a '_y' suffix and are dropped below.
balanced_df = balanced_panel.merge(
    data2, on=['unique_id', 'year_quarter'], how='left', suffixes=('', '_y')
)
balanced_df = balanced_df.loc[:, ~balanced_df.columns.str.endswith('_y')]
balanced_df

Unnamed: 0,iso2_code,language,year_quarter,unique_id,num_pushers,language_type,year,quarter
0,AE,JavaScript,2020-Q1,AE-JavaScript,1734.0,programming,2020.0,1.0
1,AE,JavaScript,2020-Q2,AE-JavaScript,2299.0,programming,2020.0,2.0
2,AE,JavaScript,2020-Q3,AE-JavaScript,2366.0,programming,2020.0,3.0
3,AE,JavaScript,2020-Q4,AE-JavaScript,2468.0,programming,2020.0,4.0
4,AE,JavaScript,2021-Q1,AE-JavaScript,2544.0,programming,2021.0,1.0
...,...,...,...,...,...,...,...,...
27365,LS,Go,2023-Q1,LS-Go,,,,
27366,LS,Go,2023-Q2,LS-Go,,,,
27367,LS,Go,2023-Q3,LS-Go,,,,
27368,LS,Go,2023-Q4,LS-Go,,,,


In [152]:
# Convert a "YYYY-Qn" label into a running quarter index
def quarter_to_int(quarter_string, base_year=2020):
    """Return the 1-based index of a quarter counted from Q1 of ``base_year``.

    Parameters
    ----------
    quarter_string : str
        Period label such as ``"2021-Q3"``.
    base_year : int, default 2020
        Year whose first quarter maps to 1.

    Returns
    -------
    int
        ``4 * (year - base_year) + quarter``; e.g. ``"2021-Q1"`` -> 5.

    Raises
    ------
    ValueError
        If the label is not of the form ``"YYYY-Qn"`` with n in 1..4.
    """
    year_part, quarter_part = quarter_string.split('-')
    # Parse everything after the "Q" so that malformed labels such as "Q10"
    # are rejected instead of being silently read as quarter 1.
    quarter_number = int(quarter_part.lstrip('Qq'))
    if not 1 <= quarter_number <= 4:
        raise ValueError(f"Invalid quarter in {quarter_string!r}")
    return 4 * (int(year_part) - base_year) + quarter_number

# Replace 'quarter' with the running quarter index and take 'year' from the label
# NOTE(review): 'year' is stored as text here (the part before the dash), not as a number.
balanced_df['quarter'] = balanced_df['year_quarter'].map(quarter_to_int)
balanced_df['year'] = balanced_df['year_quarter'].str.split('-').str.get(0)
# Grid cells with no observation count as zero pushers
balanced_df['num_pushers'] = balanced_df['num_pushers'].fillna(0)
balanced_df

Unnamed: 0,iso2_code,language,year_quarter,unique_id,num_pushers,language_type,year,quarter
0,AE,JavaScript,2020-Q1,AE-JavaScript,1734.0,programming,2020,1
1,AE,JavaScript,2020-Q2,AE-JavaScript,2299.0,programming,2020,2
2,AE,JavaScript,2020-Q3,AE-JavaScript,2366.0,programming,2020,3
3,AE,JavaScript,2020-Q4,AE-JavaScript,2468.0,programming,2020,4
4,AE,JavaScript,2021-Q1,AE-JavaScript,2544.0,programming,2021,5
...,...,...,...,...,...,...,...,...
27365,LS,Go,2023-Q1,LS-Go,0.0,,2023,13
27366,LS,Go,2023-Q2,LS-Go,0.0,,2023,14
27367,LS,Go,2023-Q3,LS-Go,0.0,,2023,15
27368,LS,Go,2023-Q4,LS-Go,0.0,,2023,16


## Per capita measures

In [153]:
# Convert a country name into its ISO 3166-1 alpha-2 code
def country_to_iso2(country_name):
    """Return the ISO2 code for ``country_name``, or ``None`` if it cannot be resolved.

    Resolution order:
    1. an explicit override table for names that do not match pycountry's
       ``name`` field,
    2. an exact ``pycountry.countries.get(name=...)`` match,
    3. ``pycountry.countries.lookup(...)`` as a last resort.
    """
    # Explicit overrides for spellings that differ from pycountry's `name`.
    special_cases = {
        "Czechia (Czech Republic)": "CZ",
        "Congo (Congo-Brazzaville)": "CG",
        "Holy See": "VA",
        "Timor-Leste (East Timor)": "TL",
        "Ukraine (with certain exceptions)": "UA",
        "Taiwan": "TW",
        "Bolivia": "BO",
        "Tanzania": "TZ",
        "South Korea": "KR",
        "Moldova": "MD",
        "Brunei": "BN",
        # Short names used in the country list that pycountry stores under a
        # longer or renamed `name`.
        "Turkey": "TR",
        "Micronesia": "FM",
    }
    if country_name in special_cases:
        return special_cases[country_name]

    try:
        match = pycountry.countries.get(name=country_name)
    except KeyError:
        # Some pycountry releases raise instead of returning None on a miss.
        match = None

    if match is None:
        try:
            match = pycountry.countries.lookup(country_name)
        except LookupError:
            return None

    return match.alpha_2

# Country names used to build the gpt_available flag; each name is later passed to country_to_iso2.
# NOTE(review): the provenance and date of this list are not recorded in the notebook — confirm the source.
gpt_countries_list = [
    "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria",
    "Azerbaijan", "Bahamas", "Bangladesh", "Barbados", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
    "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Cabo Verde", "Canada",
    "Chile", "Colombia", "Comoros", "Congo (Congo-Brazzaville)", "Costa Rica", "Côte d'Ivoire", "Croatia", "Cyprus",
    "Czechia", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "El Salvador", "Estonia", "Fiji",
    "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea",
    "Guinea-Bissau", "Guyana", "Haiti", "Holy See", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iraq",
    "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait",
    "Kyrgyzstan", "Latvia", "Lebanon", "Lesotho", "Liberia", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar",
    "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico",
    "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia",
    "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia", "Norway",
    "Oman", "Pakistan", "Palau", "Palestine, State of", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
    "Poland", "Portugal", "Qatar", "Romania", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal",
    "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "South Africa",
    "South Korea", "Spain", "Sri Lanka", "Suriname", "Sweden", "Switzerland", "Taiwan", "Tanzania", "Thailand",
    "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Tuvalu", "Uganda", "Ukraine",
    "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Vanuatu", "Zambia"
]

# Resolve every listed country name to its ISO2 code (None when unresolved)
gpt_countries_iso = [country_to_iso2(country) for country in gpt_countries_list]

# gpt_available = 1 when the row's ISO2 code is in the resolved list, else 0.
# A set with the unresolved (None) entries removed gives one vectorised
# membership test instead of scanning the list once per row.
resolved_iso_codes = {code for code in gpt_countries_iso if code is not None}
balanced_df["gpt_available"] = balanced_df["iso2_code"].isin(resolved_iso_codes).astype(int)

In [154]:
countries = balanced_df.iso2_code.unique()

def create_populations_dictionary(iso2_codes=None):
    """Map each ISO2 code to a population figure.

    Parameters
    ----------
    iso2_codes : iterable of str, optional
        Codes to resolve. Defaults to the module-level ``countries`` array.

    Returns
    -------
    dict
        ``{iso2_code: population}``.

    Raises
    ------
    KeyError
        If a code cannot be resolved by either lookup and has no entry in the
        hand-maintained ``special_cases`` table.

    Notes
    -----
    Resolution order per code: ``CInfo(code)``, then ``CInfo`` queried with
    the pycountry name for the code, then ``special_cases``. Codes that reach
    the last step are printed.
    """
    # Hand-entered figures for codes neither lookup resolves.
    special_cases = {"MM": 54688774, "PS": 5483450, "ME": 602445, "AD": 79824}
    if iso2_codes is None:
        iso2_codes = countries

    country_populations = {}
    for country in iso2_codes:
        try:
            country_populations[country] = CInfo(country).info()["population"]
            continue
        except KeyError:
            pass

        try:
            fallback_name = pycountry.countries.lookup(country).name
            country_populations[country] = CInfo(fallback_name).info()["population"]
            continue
        except LookupError:
            # LookupError also covers KeyError, so a failed pycountry lookup
            # no longer escapes the fallback chain.
            pass

        print(country)
        if country not in special_cases:
            raise KeyError(
                f"No population available for ISO2 code {country!r}; add it to special_cases"
            )
        country_populations[country] = special_cases[country]

    return country_populations

country_populations = create_populations_dictionary()

ME
MM
PS
AD


In [155]:
# Attach each country's population
balanced_df["population"] = balanced_df["iso2_code"].map(country_populations)

# Pushers per 100,000 inhabitants
pushers_per_person = balanced_df["num_pushers"] / balanced_df["population"]
balanced_df["num_pushers_pc"] = pushers_per_person * 100000

# **Figura 1:**  Pushers per 100k 

BANGLADESH

In [244]:
# balanced_df[balanced_df['iso2_code'] == 'IN']

Unnamed: 0,iso2_code,language,year_quarter,unique_id,num_pushers,language_type,year,quarter,gpt_available,population,num_pushers_pc
8840,IN,JavaScript,2020-Q1,IN-JavaScript,110247.0,programming,2020,1,1,1263930000,8.722556
8841,IN,JavaScript,2020-Q2,IN-JavaScript,157459.0,programming,2020,2,1,1263930000,12.457889
8842,IN,JavaScript,2020-Q3,IN-JavaScript,184385.0,programming,2020,3,1,1263930000,14.588229
8843,IN,JavaScript,2020-Q4,IN-JavaScript,190204.0,programming,2020,4,1,1263930000,15.048618
8844,IN,JavaScript,2021-Q1,IN-JavaScript,185333.0,programming,2021,5,1,1263930000,14.663233
...,...,...,...,...,...,...,...,...,...,...,...
9005,IN,Go,2023-Q1,IN-Go,12655.0,programming,2023,13,1,1263930000,1.001242
9006,IN,Go,2023-Q2,IN-Go,13082.0,programming,2023,14,1,1263930000,1.035026
9007,IN,Go,2023-Q3,IN-Go,13742.0,programming,2023,15,1,1263930000,1.087244
9008,IN,Go,2023-Q4,IN-Go,19206.0,programming,2023,16,1,1263930000,1.519546


In [189]:
# Independent copy of the Bangladesh rows, so later column assignments do not write into a slice of balanced_df
df_filtered_BD = balanced_df[balanced_df['iso2_code'] == 'BD'].copy()

In [190]:
df_filtered_BD

Unnamed: 0,iso2_code,language,year_quarter,unique_id,num_pushers,language_type,year,quarter,gpt_available,population,num_pushers_pc
1530,BD,JavaScript,2020-Q1,BD-JavaScript,7039.0,programming,2020,1,1,157486000,4.469604
1531,BD,JavaScript,2020-Q2,BD-JavaScript,8393.0,programming,2020,2,1,157486000,5.329363
1532,BD,JavaScript,2020-Q3,BD-JavaScript,11229.0,programming,2020,3,1,157486000,7.130158
1533,BD,JavaScript,2020-Q4,BD-JavaScript,11331.0,programming,2020,4,1,157486000,7.194925
1534,BD,JavaScript,2021-Q1,BD-JavaScript,14077.0,programming,2021,5,1,157486000,8.938572
...,...,...,...,...,...,...,...,...,...,...,...
1695,BD,Go,2023-Q1,BD-Go,605.0,programming,2023,13,1,157486000,0.384161
1696,BD,Go,2023-Q2,BD-Go,571.0,programming,2023,14,1,157486000,0.362572
1697,BD,Go,2023-Q3,BD-Go,590.0,programming,2023,15,1,157486000,0.374636
1698,BD,Go,2023-Q4,BD-Go,627.0,programming,2023,16,1,157486000,0.398131


In [191]:
# Remove Altair's row limit and switch to the 'json' data transformer
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('json')

# Rows per language for Bangladesh, plus each language's share of those rows
# NOTE(review): data_aggregated_BD is reassigned by the next cell's groupby, so this result looks unused — confirm.
data_aggregated_BD = (
    df_filtered_BD
    .groupby('language')
    .size()
    .reset_index(name='counts')
)
total_rows_BD = data_aggregated_BD['counts'].sum()
data_aggregated_BD['percentage'] = (data_aggregated_BD['counts'] / total_rows_BD) * 100

In [192]:
# Coerce 'num_pushers_pc' to numeric on a new frame rather than assigning
# into what may be a slice of balanced_df (chained assignment).
df_filtered_BD = df_filtered_BD.assign(
    num_pushers_pc=pd.to_numeric(df_filtered_BD['num_pushers_pc'], errors='coerce')
)

# Sum per 'year_quarter' and 'language'; named aggregation writes the result
# straight into 'num_pushers_pc', so no rename step is needed.
data_aggregated_BD = df_filtered_BD.groupby(['year_quarter', 'language'], as_index=False).agg(
    num_pushers_pc=('num_pushers_pc', 'sum')
)

In [193]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(data_aggregated_BD).mark_bar()

In [207]:
# Main chart: pushers per 100k inhabitants by quarter, one line per language
line_chart = alt.Chart(data_aggregated_BD, height=450, width=600).mark_line(point=True, color='darkblue').encode(
    y=alt.Y('num_pushers_pc', title='Number of pushers per 100K'),
    x=alt.X('year_quarter', title='Quarter'),
    color=alt.Color('language', title='Programming languages'),
    tooltip=['num_pushers_pc', 'year_quarter', 'language']
).interactive().properties(
    title='Number of pushers per 100k by language programming - Bangladesh'
)

# Footer statistics are computed from the plotted frame so they cannot drift
# from the data (the previous hard-coded 16 / 160 did not match the panel).
n_periods = data_aggregated_BD['year_quarter'].nunique()
n_observations = len(data_aggregated_BD)
n_languages = data_aggregated_BD['language'].nunique()

# One row per footer line; 'y' sets each line's vertical position
text_data = pd.DataFrame({
    'text': [
        f'Number of periods: {n_periods}',
        f'Number of observations: {n_observations}',
        f'Number of programming languages: {n_languages}',
        'Source: GitHub Innovation Graph'
    ],
    'y': [20, 35, 50, 65]
})

# Text layer holding the source and the statistics
source_text = alt.Chart(text_data).mark_text(
    align='right',
    baseline='bottom',
    dx=-5,  # horizontal nudge of the text
    fontSize=12,
    color='black'
).encode(
    x=alt.value(640),  # fixed x position used to right-align the text
    y=alt.Y('y:Q', axis=None),
    text='text:N'
).properties(
    width=650,
    height=80  # tall enough for every footer line
)

# Stack the main chart above the footer
final_chart = alt.vconcat(
    line_chart,
    source_text
).configure_concat(
    spacing=5
).configure_view(
    strokeWidth=0
)

final_chart.display()

SRI LANKA

In [197]:
# Independent copy of the Sri Lanka rows, so later column assignments do not write into a slice of balanced_df
df_filtered_LK = balanced_df[balanced_df['iso2_code'] == 'LK'].copy()
df_filtered_LK

Unnamed: 0,iso2_code,language,year_quarter,unique_id,num_pushers,language_type,year,quarter,gpt_available,population,num_pushers_pc
11560,LK,JavaScript,2020-Q1,LK-JavaScript,3764.0,programming,2020,1,1,20277597,18.562357
11561,LK,JavaScript,2020-Q2,LK-JavaScript,5041.0,programming,2020,2,1,20277597,24.859948
11562,LK,JavaScript,2020-Q3,LK-JavaScript,4722.0,programming,2020,3,1,20277597,23.286783
11563,LK,JavaScript,2020-Q4,LK-JavaScript,5865.0,programming,2020,4,1,20277597,28.923546
11564,LK,JavaScript,2021-Q1,LK-JavaScript,4977.0,programming,2021,5,1,20277597,24.544328
...,...,...,...,...,...,...,...,...,...,...,...
11725,LK,Go,2023-Q1,LK-Go,270.0,programming,2023,13,1,20277597,1.331519
11726,LK,Go,2023-Q2,LK-Go,332.0,programming,2023,14,1,20277597,1.637275
11727,LK,Go,2023-Q3,LK-Go,298.0,programming,2023,15,1,20277597,1.469602
11728,LK,Go,2023-Q4,LK-Go,506.0,programming,2023,16,1,20277597,2.495365


In [198]:
# Remove Altair's row limit and switch to the 'json' data transformer
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('json')

# Rows per language for Sri Lanka, plus each language's share of those rows
# NOTE(review): data_aggregated_LK is reassigned by the next cell's groupby, so this result looks unused — confirm.
data_aggregated_LK = (
    df_filtered_LK
    .groupby('language')
    .size()
    .reset_index(name='counts')
)
total_rows_LK = data_aggregated_LK['counts'].sum()
data_aggregated_LK['percentage'] = (data_aggregated_LK['counts'] / total_rows_LK) * 100

In [199]:
# Coerce 'num_pushers_pc' to numeric on a new frame rather than assigning
# into what may be a slice of balanced_df (chained assignment).
df_filtered_LK = df_filtered_LK.assign(
    num_pushers_pc=pd.to_numeric(df_filtered_LK['num_pushers_pc'], errors='coerce')
)

# Sum per 'year_quarter' and 'language'; named aggregation writes the result
# straight into 'num_pushers_pc', so no rename step is needed.
data_aggregated_LK = df_filtered_LK.groupby(['year_quarter', 'language'], as_index=False).agg(
    num_pushers_pc=('num_pushers_pc', 'sum')
)

In [200]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(data_aggregated_LK).mark_bar()

In [206]:
# Main chart: pushers per 100k inhabitants by quarter, one line per language
line_chart = alt.Chart(data_aggregated_LK, height=450, width=600).mark_line(point=True, color='darkblue').encode(
    y=alt.Y('num_pushers_pc', title='Number of pushers per 100K'),
    x=alt.X('year_quarter', title='Quarter'),
    color=alt.Color('language', title='Programming languages'),
    tooltip=['num_pushers_pc', 'year_quarter', 'language']
).interactive().properties(
    title='Number of pushers per 100k by language programming - Sri Lanka'
)

# Footer statistics are computed from the plotted frame so they cannot drift
# from the data (the previous hard-coded 16 / 160 did not match the panel).
n_periods = data_aggregated_LK['year_quarter'].nunique()
n_observations = len(data_aggregated_LK)
n_languages = data_aggregated_LK['language'].nunique()

# One row per footer line; 'y' sets each line's vertical position
text_data = pd.DataFrame({
    'text': [
        f'Number of periods: {n_periods}',
        f'Number of observations: {n_observations}',
        f'Number of programming languages: {n_languages}',
        'Source: GitHub Innovation Graph'
    ],
    'y': [20, 35, 50, 65]
})

# Text layer holding the source and the statistics
source_text = alt.Chart(text_data).mark_text(
    align='right',
    baseline='bottom',
    dx=-5,  # horizontal nudge of the text
    fontSize=12,
    color='black'
).encode(
    x=alt.value(640),  # fixed x position used to right-align the text
    y=alt.Y('y:Q', axis=None),
    text='text:N'
).properties(
    width=650,
    height=80  # tall enough for every footer line
)

# Stack the main chart above the footer
final_chart = alt.vconcat(
    line_chart,
    source_text
).configure_concat(
    spacing=5
).configure_view(
    strokeWidth=0
)

final_chart.display()

INDIA

In [202]:
# Independent copy of the India rows, so later column assignments do not write into a slice of balanced_df
df_filtered_IN = balanced_df[balanced_df['iso2_code'] == 'IN'].copy()
df_filtered_IN

Unnamed: 0,iso2_code,language,year_quarter,unique_id,num_pushers,language_type,year,quarter,gpt_available,population,num_pushers_pc
8840,IN,JavaScript,2020-Q1,IN-JavaScript,110247.0,programming,2020,1,1,1263930000,8.722556
8841,IN,JavaScript,2020-Q2,IN-JavaScript,157459.0,programming,2020,2,1,1263930000,12.457889
8842,IN,JavaScript,2020-Q3,IN-JavaScript,184385.0,programming,2020,3,1,1263930000,14.588229
8843,IN,JavaScript,2020-Q4,IN-JavaScript,190204.0,programming,2020,4,1,1263930000,15.048618
8844,IN,JavaScript,2021-Q1,IN-JavaScript,185333.0,programming,2021,5,1,1263930000,14.663233
...,...,...,...,...,...,...,...,...,...,...,...
9005,IN,Go,2023-Q1,IN-Go,12655.0,programming,2023,13,1,1263930000,1.001242
9006,IN,Go,2023-Q2,IN-Go,13082.0,programming,2023,14,1,1263930000,1.035026
9007,IN,Go,2023-Q3,IN-Go,13742.0,programming,2023,15,1,1263930000,1.087244
9008,IN,Go,2023-Q4,IN-Go,19206.0,programming,2023,16,1,1263930000,1.519546


In [211]:
# Remove Altair's row limit and switch to the 'json' data transformer
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('json')

# Rows per language for India, plus each language's share of those rows
# NOTE(review): data_aggregated_IN is reassigned by the next cell's groupby, so this result looks unused — confirm.
data_aggregated_IN = (
    df_filtered_IN
    .groupby('language')
    .size()
    .reset_index(name='counts')
)
total_rows_IN = data_aggregated_IN['counts'].sum()
data_aggregated_IN['percentage'] = (data_aggregated_IN['counts'] / total_rows_IN) * 100

In [214]:
# Coerce 'num_pushers_pc' to numeric on a new frame rather than assigning
# into what may be a slice of balanced_df (chained assignment).
df_filtered_IN = df_filtered_IN.assign(
    num_pushers_pc=pd.to_numeric(df_filtered_IN['num_pushers_pc'], errors='coerce')
)

# Sum per 'year_quarter' and 'language'; named aggregation writes the result
# straight into 'num_pushers_pc', so no rename step is needed.
data_aggregated_IN = df_filtered_IN.groupby(['year_quarter', 'language'], as_index=False).agg(
    num_pushers_pc=('num_pushers_pc', 'sum')
)

In [215]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(data_aggregated_IN).mark_bar()

In [217]:
# Main chart: pushers per 100k inhabitants by quarter, one line per language
line_chart = alt.Chart(data_aggregated_IN, height=450, width=600).mark_line(point=True, color='darkblue').encode(
    y=alt.Y('num_pushers_pc', title='Number of pushers per 100K'),
    x=alt.X('year_quarter', title='Quarter'),
    color=alt.Color('language', title='Programming languages'),
    tooltip=['num_pushers_pc', 'year_quarter', 'language']
).interactive().properties(
    title='Number of pushers per 100k by language programming - India'
)

# Footer statistics are computed from the plotted frame so they cannot drift
# from the data (the previous hard-coded 16 / 160 did not match the panel).
n_periods = data_aggregated_IN['year_quarter'].nunique()
n_observations = len(data_aggregated_IN)
n_languages = data_aggregated_IN['language'].nunique()

# One row per footer line; 'y' sets each line's vertical position
text_data = pd.DataFrame({
    'text': [
        f'Number of periods: {n_periods}',
        f'Number of observations: {n_observations}',
        f'Number of programming languages: {n_languages}',
        'Source: GitHub Innovation Graph'
    ],
    'y': [20, 35, 50, 65]
})

# Text layer holding the source and the statistics
source_text = alt.Chart(text_data).mark_text(
    align='right',
    baseline='bottom',
    dx=-5,  # horizontal nudge of the text
    fontSize=12,
    color='black'
).encode(
    x=alt.value(640),  # fixed x position used to right-align the text
    y=alt.Y('y:Q', axis=None),
    text='text:N'
).properties(
    width=650,
    height=80  # tall enough for every footer line
)

# Stack the main chart above the footer
final_chart = alt.vconcat(
    line_chart,
    source_text
).configure_concat(
    spacing=5
).configure_view(
    strokeWidth=0
)

final_chart.display()

# **Figura 2:**  Repositories per 100k

In [218]:
# Load repositories.csv from the github/innovationgraph repository.
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/repositories.csv"
# keep_default_na=False stops pandas from converting its default NA markers (e.g. the text "NA") into NaN.
repos = pd.read_csv(url, delimiter=',', keep_default_na=False)
repos

Unnamed: 0,repositories,iso2_code,year,quarter
0,30669781,US,2020,1
1,20179087,CN,2020,1
2,17912888,EU,2020,1
3,9254535,IN,2020,1
4,4171705,GB,2020,1
...,...,...,...,...
3786,183,SH,2024,1
3787,180,TO,2024,1
3788,136,KI,2024,1
3789,129,PM,2024,1


In [219]:
# Keep Bangladesh, Sri Lanka and India only, renumbering rows from zero
repos_update = repos.loc[repos['iso2_code'].isin(['BD', 'LK', 'IN'])].reset_index(drop=True)
repos_update

Unnamed: 0,repositories,iso2_code,year,quarter
0,9254535,IN,2020,1
1,588881,BD,2020,1
2,257723,LK,2020,1
3,10503365,IN,2020,2
4,656174,BD,2020,2
5,285947,LK,2020,2
6,11823607,IN,2020,3
7,753764,BD,2020,3
8,311354,LK,2020,3
9,13198250,IN,2020,4


In [220]:
# Add population, repositories per 100,000 inhabitants and the "YYYY-Qn" label
repos_update = repos_update.assign(
    population=lambda d: d["iso2_code"].map(country_populations),
    repositories_pc=lambda d: (d["repositories"] / d["population"]) * 100000,
    year_quarter=lambda d: d['year'].astype(str) + '-Q' + d['quarter'].astype(str),
)

repos_update

Unnamed: 0,repositories,iso2_code,year,quarter,population,repositories_pc,year_quarter
0,9254535,IN,2020,1,1263930000,732.203128,2020-Q1
1,588881,BD,2020,1,157486000,373.925936,2020-Q1
2,257723,LK,2020,1,20277597,1270.974071,2020-Q1
3,10503365,IN,2020,2,1263930000,831.008442,2020-Q2
4,656174,BD,2020,2,157486000,416.655449,2020-Q2
5,285947,LK,2020,2,20277597,1410.162161,2020-Q2
6,11823607,IN,2020,3,1263930000,935.463752,2020-Q3
7,753764,BD,2020,3,157486000,478.622862,2020-Q3
8,311354,LK,2020,3,20277597,1535.458072,2020-Q3
9,13198250,IN,2020,4,1263930000,1044.223177,2020-Q4


In [221]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(repos_update).mark_bar()

In [222]:
# Repositories per 100k inhabitants by quarter, one line per country
# NOTE(review): the title says 2020-2023 while the source file also lists 2024 Q1 rows — confirm the intended range.
alt.Chart(repos_update, height=600, width=750).mark_line(point=True, color='darkblue').encode(
    x=alt.X('year_quarter'),
    y=alt.Y('repositories_pc'),
    color=alt.Color('iso2_code'),
    tooltip=['repositories_pc', 'year_quarter', 'iso2_code'],
).interactive().properties(title='Number of Repositories per 100k by country 2020-2023')


# **Figura 3:** Developers per 100k

In [223]:
# Load developers.csv from the github/innovationgraph repository.
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/developers.csv"
# keep_default_na=False stops pandas from converting its default NA markers (e.g. the text "NA") into NaN.
dev = pd.read_csv(url, delimiter=',', keep_default_na=False)
dev

Unnamed: 0,developers,iso2_code,year,quarter
0,9763122,US,2020,1
1,6983489,EU,2020,1
2,5885036,CN,2020,1
3,4194257,IN,2020,1
4,1635516,GB,2020,1
...,...,...,...,...
3822,201,MH,2024,1
3823,197,KI,2024,1
3824,158,PM,2024,1
3825,152,VA,2024,1


In [224]:
# Drop the "EU" rows
dev = dev.loc[dev["iso2_code"].ne("EU")]

In [225]:
# Keep Bangladesh, Sri Lanka and India only, renumbering rows from zero
dev_update = dev.loc[dev['iso2_code'].isin(['BD', 'LK', 'IN'])].reset_index(drop=True)
dev_update

Unnamed: 0,developers,iso2_code,year,quarter
0,4194257,IN,2020,1
1,215789,BD,2020,1
2,84292,LK,2020,1
3,4684375,IN,2020,2
4,246934,BD,2020,2
5,95516,LK,2020,2
6,5206926,IN,2020,3
7,279159,BD,2020,3
8,104093,LK,2020,3
9,5653924,IN,2020,4


In [226]:
# Add population, developers per 100,000 inhabitants and the "YYYY-Qn" label
dev_update = dev_update.assign(
    population=lambda d: d["iso2_code"].map(country_populations),
    developers_pc=lambda d: (d["developers"] / d["population"]) * 100000,
    year_quarter=lambda d: d['year'].astype(str) + '-Q' + d['quarter'].astype(str),
)

dev_update

Unnamed: 0,developers,iso2_code,year,quarter,population,developers_pc,year_quarter
0,4194257,IN,2020,1,1263930000,331.842507,2020-Q1
1,215789,BD,2020,1,157486000,137.021069,2020-Q1
2,84292,LK,2020,1,20277597,415.690281,2020-Q1
3,4684375,IN,2020,2,1263930000,370.619813,2020-Q2
4,246934,BD,2020,2,157486000,156.79743,2020-Q2
5,95516,LK,2020,2,20277597,471.042008,2020-Q2
6,5206926,IN,2020,3,1263930000,411.963163,2020-Q3
7,279159,BD,2020,3,157486000,177.259566,2020-Q3
8,104093,LK,2020,3,20277597,513.339919,2020-Q3
9,5653924,IN,2020,4,1263930000,447.328887,2020-Q4


In [227]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(dev_update).mark_bar()

In [228]:
# Developers per 100k inhabitants by quarter, one line per country
# NOTE(review): the title says 2020-2023 while the source file also lists 2024 Q1 rows — confirm the intended range.
alt.Chart(dev_update, height=600, width=750).mark_line(point=True, color='darkblue').encode(
    x=alt.X('year_quarter'),
    y=alt.Y('developers_pc'),
    color=alt.Color('iso2_code'),
    tooltip=['developers_pc', 'year_quarter', 'iso2_code'],
).interactive().properties(title='Number of Developers per 100k by country 2020-2023')

# **Figura 4:**  Organizations per 100k

In [229]:
# Load organizations.csv from the github/innovationgraph repository.
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/organizations.csv"
# keep_default_na=False stops pandas from converting its default NA markers (e.g. the text "NA") into NaN.
org = pd.read_csv(url, delimiter=',', keep_default_na=False)
org

Unnamed: 0,organizations,iso2_code,year,quarter
0,601565,US,2020,1
1,433848,EU,2020,1
2,377119,CN,2020,1
3,247204,IN,2020,1
4,125302,BR,2020,1
...,...,...,...,...
3039,138,FO,2024,1
3040,134,DJ,2024,1
3041,128,GU,2024,1
3042,119,MC,2024,1


In [230]:
# Drop the "EU" rows
org = org.loc[org["iso2_code"].ne("EU")]

In [232]:
# Keep Bangladesh, Sri Lanka and India only, renumbering rows from zero
org_update = org.loc[org['iso2_code'].isin(['BD', 'LK', 'IN'])].reset_index(drop=True)
org_update

Unnamed: 0,organizations,iso2_code,year,quarter
0,247204,IN,2020,1
1,26487,BD,2020,1
2,7205,LK,2020,1
3,268762,IN,2020,2
4,27532,BD,2020,2
5,8023,LK,2020,2
6,287840,IN,2020,3
7,28421,BD,2020,3
8,8514,LK,2020,3
9,301887,IN,2020,4


In [233]:
# Add population, organizations per 100,000 inhabitants and the "YYYY-Qn" label
org_update = org_update.assign(
    population=lambda d: d["iso2_code"].map(country_populations),
    organizations_pc=lambda d: (d["organizations"] / d["population"]) * 100000,
    year_quarter=lambda d: d['year'].astype(str) + '-Q' + d['quarter'].astype(str),
)

org_update

Unnamed: 0,organizations,iso2_code,year,quarter,population,organizations_pc,year_quarter
0,247204,IN,2020,1,1263930000,19.558362,2020-Q1
1,26487,BD,2020,1,157486000,16.818638,2020-Q1
2,7205,LK,2020,1,20277597,35.531824,2020-Q1
3,268762,IN,2020,2,1263930000,21.263994,2020-Q2
4,27532,BD,2020,2,157486000,17.482189,2020-Q2
5,8023,LK,2020,2,20277597,39.565832,2020-Q2
6,287840,IN,2020,3,1263930000,22.773413,2020-Q3
7,28421,BD,2020,3,157486000,18.046684,2020-Q3
8,8514,LK,2020,3,20277597,41.987224,2020-Q3
9,301887,IN,2020,4,1263930000,23.884788,2020-Q4


In [234]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(org_update).mark_bar()

In [235]:
# Organizations per 100k inhabitants by quarter, one line per country
# NOTE(review): the title says 2020-2023 while the source file also lists 2024 Q1 rows — confirm the intended range.
alt.Chart(org_update, height=600, width=750).mark_line(point=True, color='darkblue').encode(
    x=alt.X('year_quarter'),
    y=alt.Y('organizations_pc'),
    color=alt.Color('iso2_code'),
    tooltip=['organizations_pc', 'year_quarter', 'iso2_code'],
).interactive().properties(title='Number of Organizations per 100k by country 2020-2023')

# **Figura 5:**  Pushes per 100k

In [236]:
# Load git_pushes.csv from the github/innovationgraph repository.
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/git_pushes.csv"
# keep_default_na=False stops pandas from converting its default NA markers (e.g. the text "NA") into NaN.
pushes = pd.read_csv(url, delimiter=',', keep_default_na=False)
pushes

Unnamed: 0,git_pushes,iso2_code,year,quarter
0,21377653,US,2020,1
1,17175696,EU,2020,1
2,5567938,CN,2020,1
3,4422587,IN,2020,1
4,3980488,GB,2020,1
...,...,...,...,...
2886,1701,BS,2024,1
2887,1640,KY,2024,1
2888,1383,PG,2024,1
2889,953,TD,2024,1


In [237]:
# Drop the "EU" rows
pushes = pushes.loc[pushes["iso2_code"].ne("EU")]

In [238]:
# Total git pushes per ISO2 code over all quarters, largest first; print the top ten
git_grouped = pushes.groupby('iso2_code', as_index=False)['git_pushes'].sum()
git_sorted = git_grouped.sort_values('git_pushes', ascending=False)
print(git_sorted.head(10))

    iso2_code  git_pushes
171        US   399163917
74         IN   184351590
21         BR    90370062
41         DE    81361453
55         GB    79737101
86         KR    77635105
142        RU    70566025
53         FR    64502037
34         CN    63111106
27         CA    59638610


In [239]:
# Keep Bangladesh, Sri Lanka and India only, renumbering rows from zero
git_update = pushes.loc[pushes['iso2_code'].isin(['BD', 'LK', 'IN'])].reset_index(drop=True)
git_update

Unnamed: 0,git_pushes,iso2_code,year,quarter
0,4422587,IN,2020,1
1,295222,BD,2020,1
2,144885,LK,2020,1
3,7346825,IN,2020,2
4,390126,BD,2020,2
5,224575,LK,2020,2
6,7832236,IN,2020,3
7,506514,BD,2020,3
8,194379,LK,2020,3
9,7291401,IN,2020,4


In [240]:
# Add population, git pushes per 100,000 inhabitants and the "YYYY-Qn" label
git_update = git_update.assign(
    population=lambda d: d["iso2_code"].map(country_populations),
    git_pushes_pc=lambda d: (d["git_pushes"] / d["population"]) * 100000,
    year_quarter=lambda d: d['year'].astype(str) + '-Q' + d['quarter'].astype(str),
)

git_update

Unnamed: 0,git_pushes,iso2_code,year,quarter,population,git_pushes_pc,year_quarter
0,4422587,IN,2020,1,1263930000,349.90759,2020-Q1
1,295222,BD,2020,1,157486000,187.459203,2020-Q1
2,144885,LK,2020,1,20277597,714.50774,2020-Q1
3,7346825,IN,2020,2,1263930000,581.268346,2020-Q2
4,390126,BD,2020,2,157486000,247.721067,2020-Q2
5,224575,LK,2020,2,20277597,1107.503024,2020-Q2
6,7832236,IN,2020,3,1263930000,619.673241,2020-Q3
7,506514,BD,2020,3,157486000,321.624779,2020-Q3
8,194379,LK,2020,3,20277597,958.589916,2020-Q3
9,7291401,IN,2020,4,1263930000,576.883293,2020-Q4


In [241]:
# NOTE(review): no .encode(...) follows, so the bar mark has no x/y channels; looks like a leftover smoke test — confirm it is still needed.
alt.Chart(git_update).mark_bar()

In [242]:
# Git pushes per 100k inhabitants by quarter, one line per country
# NOTE(review): the title says 2020-2023 while the source file also lists 2024 Q1 rows — confirm the intended range.
alt.Chart(git_update, height=600, width=750).mark_line(point=True, color='darkblue').encode(
    x=alt.X('year_quarter'),
    y=alt.Y('git_pushes_pc'),
    color=alt.Color('iso2_code'),
    tooltip=['git_pushes_pc', 'year_quarter', 'iso2_code'],
).interactive().properties(title='Number of Pushes per 100k by country 2020-2023')