# 1. Libraries & Settings

In [None]:
import numpy as np
import pandas as pd
import os
import json

pd.set_option('display.max_colwidth', 220)

In [None]:
# Survey year -> relative path of that year's responses CSV.
# The file names differ between editions, so every path is spelled out.
# The insertion order (newest first) is the order in which the years are
# iterated by the cells below.
datasets_info = {
    2021: "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv",
    2020: "../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv",
    2019: "../input/kaggle-survey-2019/multiple_choice_responses.csv",
    2018: "../input/kaggle-survey-2018/multipleChoiceResponses.csv"
}

# Catalogue of the survey questions that are aligned across years.
#
# Every entry is a pair (short name, [question-text variants]). A variant is
# matched as a substring of the question text stored in the first row of a
# yearly CSV; several variants are listed when the wording changed between
# editions. The first variant is the one reported as the question text.
#
#   'single questions' - resolved to one column per year (single_cols_diff)
#   'group questions'  - resolved to one column per answer choice and year
#                        (group_cols_diff)
questions_info = {
    # Questions answered in a single column.
    'single questions': [
        ("Duration",
             ["Duration (in seconds)"]),
        ("Age",
             ["What is your age"]),
        ("Gender",
             ["What is your gender"]),
        ("Country",
             ["In which country do you currently reside"]),
        ("Education",
             ["What is the highest level of formal education that you have attained or plan to attain within the next 2 years"]),
        ("Current role",
             ["Select the title most similar to your current role (or most recent title if retired)"]),
        ("Writing code",
             ["For how many years have you been writing code and/or programming",  # [2020-2021]
              "How long have you been writing code to analyze data"]),             # [2018-2019]
        ("Programming language",
             ["What programming language would you recommend an aspiring data scientist to learn first"]),    
        ("Computing platform",
             ["What type of computing platform do you use most often for your data science projects"]),
        ("Used a TPU",
             ["Approximately how many times have you used a TPU (tensor processing unit)"]),
        ("Used ML methods",
             ["For how many years have you used machine learning methods"]),
        ("In what industry",
             ["In what industry is your current employer/contract (or your most recent employer if retired)"]),
        ("Size of the company",
             ["What is the size of the company where you are employed"]),
        ("Individuals are responsible",
             ["Approximately how many individuals are responsible for data science workloads at your place of business"]),    
        ("ML methods into business",
             ["Does your current employer incorporate machine learning methods into their business"]),
        ("Compensation, USD",
             ["What is your current yearly compensation (approximate $USD)"]),
        ("Spent money,  USD",
             ["Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)"]),
        ("Cloud platforms",
             ["Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)"]),
        ("Following big data products",
             ["Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often"]),
        ("Following business intelligence tools",
             ["Which of the following business intelligence tools do you use most often"]),
        ("Primary tool",
             ["What is the primary tool that you use at work or school to analyze data"])
    ],
    # Questions whose answer choices are spread over several columns.
    'group questions': [
        ("Programming languages use on a regular basis",
             ["What programming languages do you use on a regular basis"]),
        ("IDE's use on a regular basis",
             ["Which of the following integrated development environments (IDE's) do you use on a regular basis",
              "Which of the following integrated development environments (IDE's) have you used at work or school in the last 5 years"]),  # 2018
        ("Hosted notebook products use on a regular basis",
             ["Which of the following hosted notebook products do you use on a regular basis",
              "Which of the following hosted notebooks have you used at work or school in the last 5 years"]),  # 2018
        ("Specialized hardware use on a regular basis",
             ["Which types of specialized hardware do you use on a regular basis"]),
        ("Visualization libraries or tools use on a regular basis",
             ["What data visualization libraries or tools do you use on a regular basis",
              "What data visualization libraries or tools have you used in the past 5 years"]),
        ("ML frameworks use on a regular basis",
             ["Which of the following machine learning frameworks do you use on a regular basis",
              "What machine learning frameworks have you used in the past 5 years"]),
        ("ML algorithms use on a regular basis",
             ["Which of the following ML algorithms do you use on a regular basis"]),
        ("CV methods use on a regular basis",
             ["Which categories of computer vision methods do you use on a regular basis"]),
        ("NLP methods use on a regular basis",
             ["Which of the following natural language processing (NLP) methods do you use on a regular basis"]),
        ("Important part of your role at work",
             ["Select any activities that make up an important part of your role at work"]),
        ("Cloud computing platforms use on a regular basis",
             ["Which of the following cloud computing platforms do you use on a regular basis"]),
        ("Cloud computing platforms to become more familiar",
             ["Which of the following cloud computing platforms do you hope to become more familiar with in the next 2 years"]),
        ("Cloud computing products use on a regular basis",
             ["Do you use any of the following cloud computing products on a regular basis"]),
        ("Cloud computing products to become more familiar",
             ["In the next 2 years, do you hope to become more familiar with any of these specific cloud computing products"]),
        ("Big data products use on a regular basis",
             ["Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis"]),
        ("Big data products to become more familiar", 
             ["Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you hope to become more familiar with in the next 2 years"]),
        ("BI tools use on a regular basis",
             ["Which of the following business intelligence tools do you use on a regular basis"]),
        ("BI tools to become more familiar",
             ["Which of the following business intelligence tools do you hope to become more familiar with in the next 2 years"]),
        ("AutoML use on a regular basis",
             ["Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis"]),
        ("AutoML to become more familiar",
             ["Which categories of automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years"]),
        ("AutoML use on a regular basis --2",
             ["Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis"]),
        ("Specific AutoML tools to become more familiar",
             ["Which specific automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years"]),
        ("Use any tools to help manage machine learning experiments",
             ["Do you use any tools to help manage machine learning experiments"]),
        ("In the next 2 years, do you hope to become more familiar",
             ["In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments"]),
        ("Where publicly share",
             ["Where do you publicly share your data analysis or machine learning applications",
              "Where do you publicly share or deploy your data analysis or machine learning applications"]),
        ("Platforms or data science courses",
             ["On which platforms have you begun or completed data science courses"]),
        ("Favorite media sources",
             ["Who/what are your favorite media sources that report on data science topics"])
    ]
}

# 2. Loading data

In [None]:
# Read every survey edition into memory, keyed by survey year.
# low_memory=False makes pandas parse each file in one piece instead of in
# chunks, which avoids mixed-dtype inference on the wide survey tables.
datasets = {
    survey_year: pd.read_csv(csv_path, low_memory=False)
    for survey_year, csv_path in datasets_info.items()
}

In [None]:
# Report the (rows, columns) shape of every loaded survey.
for x_year, survey in datasets.items():
    print(x_year, ":", survey.shape)

In [None]:
# Most recent survey year; used further down to preview the renamed data.
latest_data = max(datasets)

# Column mappings collected by the cells below, stored under the keys
# 'single columns' and 'group columns'.
columns_info = dict()

# 3. Main changes

In [None]:
def replace_cols_name(cols_diff, group_num=None):
    """Rename survey columns in ``datasets`` to unified cross-year names.

    ``cols_diff`` holds one row per question (or answer choice) and one
    column per survey year containing that year's original column name.
    Rows that still carry an integer label receive a unified label:
    ``"SA<row>"`` for single questions, ``"GA<group_num>_<row>"`` for a
    question group. Every yearly DataFrame in the module-level ``datasets``
    dict then has its listed columns renamed to the corresponding row label.

    Args:
        cols_diff: DataFrame as produced by ``single_cols_diff`` or
            ``group_cols_diff``. Its index is relabelled in place.
        group_num: Position of the question group, or ``None`` (default)
            for single questions.

    Returns:
        None. ``cols_diff`` and the DataFrames in ``datasets`` are mutated.
    """
    prefix = "SA" if group_num is None else "GA" + str(group_num) + "_"

    # Integer labels mean "not relabelled yet". Labels that are already
    # strings are kept as they are, so a repeated or partial call neither
    # double-prefixes a label nor fails on a length mismatch.
    labels = cols_diff.index.to_list()
    if any(isinstance(label, int) for label in labels):
        cols_diff.index = [prefix + str(label) if isinstance(label, int) else label
                           for label in labels]

    for x_year, survey in datasets.items():
        if x_year not in cols_diff.columns:
            continue

        # original column name -> unified name; years that lack a question
        # hold None/NaN there and must not end up as a rename key.
        replace_dict = {old_name: new_name
                        for new_name, old_name in cols_diff[x_year].items()
                        if pd.notna(old_name)}

        survey.rename(columns=replace_dict, inplace=True)

## 3.1. Single columns

In [None]:
def single_cols_diff(questions):
    """Locate, for every survey year, the column of each single-answer question.

    The first row of every DataFrame in the module-level ``datasets`` dict
    is read as the question text of its columns. A column matches a question
    when any of the question's text variants is a substring of that text;
    the first matching column (in column order) wins.

    Args:
        questions: Iterable of ``(short_name, versions)`` pairs, where
            ``versions`` is a list of question-text variants.

    Returns:
        DataFrame with one row per question, one column per survey year
        holding the matching column name (``None`` when nothing matched),
        and a ``'question'`` column with the first text variant.
    """
    def search_col_name(header, questions_list):
        # Cells that are not text (NaN, numbers) cannot contain a question
        # and would make the substring test raise, so they are skipped.
        for col, text in header.items():
            if isinstance(text, str) and any(question in text
                                             for question in questions_list):
                return col
        return None

    result = {}
    for x_year, survey in datasets.items():
        # Take the header row once per year instead of once per question.
        header = survey.iloc[0] if len(survey) else pd.Series(dtype=object)
        result[x_year] = [search_col_name(header, versions)
                          for _short_name, versions in questions]

    result = pd.DataFrame(result)
    result['question'] = pd.Series([versions[0]
                                    for _short_name, versions in questions])

    return result

In [None]:
single_cols = single_cols_diff(questions_info.get('single questions'))
columns_info['single columns'] = single_cols.to_dict()

single_cols.loc[1:]

### Check result

In [None]:
check_year = 2018
check_cols = "Q12_MULTIPLE_CHOICE"
print(datasets.get(check_year).loc[0, check_cols])
datasets.get(check_year).loc[1: , check_cols].unique()

In [None]:
check_year = 2019
check_cols = "Q14"
print(datasets.get(check_year).loc[0, check_cols])
datasets.get(check_year).loc[1: , check_cols].unique()

In [None]:
check_year = 2021
check_cols = "Q41"
print(datasets.get(check_year).loc[0, check_cols])
datasets.get(check_year).loc[1: , check_cols].unique()

### Replace column names

In [None]:
replace_cols_name(single_cols)

datasets.get(latest_data).head(3)

## 3.2. Multi columns

In [None]:
def group_cols_diff(questions):
    """Align the answer-choice columns of one multi-column question across years.

    For every DataFrame in the module-level ``datasets`` dict, the first row
    of the columns whose name contains ``"_"`` is read as question text.
    Columns whose text contains ``"- Selected Choice -"`` and one of the
    given question variants are collected; the text after the marker is the
    answer choice. Variants are tried in order and the first one that
    matches anything is used for that year.

    Args:
        questions: List of question-text variants (plain substrings).

    Returns:
        DataFrame with one row per answer choice, one column per survey
        year holding that year's column name (``None`` when the choice is
        absent), and a ``'choise'`` column with the choice text.

    Raises:
        ValueError: If no variant matches a column in any survey year.
    """
    group_cols_marker = "_"
    txt_choice_marker = "- Selected Choice -"

    def get_group_values(header, x_year):
        # Keep only text cells that describe an answer choice; NaN or
        # numeric header cells are ignored instead of breaking the match.
        choice_texts = {col: text for col, text in header.items()
                        if isinstance(text, str) and txt_choice_marker in text}

        for question in questions:
            matched = {col: text for col, text in choice_texts.items()
                       if question in text}
            if matched:
                choices = [text.split(txt_choice_marker)[1].strip()
                           for text in matched.values()]
                return pd.DataFrame({x_year: list(matched)}, index=choices)

        return None

    concat_data = []

    for x_year, survey in datasets.items():
        group_cols = survey.filter(like=group_cols_marker)
        if group_cols.empty:
            continue

        group_values = get_group_values(group_cols.iloc[0], x_year)
        if group_values is not None:
            concat_data.append(group_values)

    if not concat_data:
        raise ValueError(
            "No survey year has answer-choice columns matching any of: "
            "{!r}".format(list(questions)))

    # Outer-join on the choice text so every choice gets one row.
    result = pd.concat(concat_data, axis=1)
    result['choise'] = result.index
    result = result.reset_index(drop=True)

    return result.replace({np.nan: None})


def group_result(indx):
    """Print the question of group ``indx`` and return its column table.

    Reads the module-level ``group_questions`` list for the question text
    (the first text variant is printed) and ``columns_info['group columns']``
    for the stored column mapping of that group.

    Args:
        indx: Position of the group in ``group_questions``.

    Returns:
        DataFrame built from the stored mapping of group ``indx``.
    """
    _short_name, versions = group_questions[indx]
    headline = "\n({}): {}\n".format(indx, versions[0])
    print(headline)

    stored_groups = columns_info.get('group columns')
    return pd.DataFrame(stored_groups[indx])


def group_replace(col_info, base_info, x_year, is_test=False):
    """Rewrite the text of one answer-choice column to a unified value.

    Works on the DataFrame of ``x_year`` in the module-level ``datasets``
    dict. Every match of the regular expression ``col_info[1]`` inside the
    cells of column ``col_info[0]`` is replaced by ``base_info[1]``. The
    question-text row is part of the column and is rewritten as well.
    Nothing is done when the column is missing or when some cell already
    equals the new value, which keeps repeated calls from replacing twice.

    Args:
        col_info: ``(column name, regex pattern)`` of the column to rewrite.
        base_info: ``(column name, value)``; only the value is used here.
        x_year: Survey year, a key of ``datasets``.
        is_test: When true, print the column's values before and after.

    Returns:
        True if a replacement was applied, otherwise False.
    """
    result = False
    col_name, old_value = col_info
    survey = datasets.get(x_year)

    if col_name in survey.columns:
        _base_col_name, new_value = base_info

        unique_list = survey[col_name].unique()
        notna_sum = survey[col_name].notna().sum()

        if not (survey[col_name] == new_value).any():
            # Assign the result back instead of calling replace(inplace=True)
            # on the selected column: that chained in-place form is
            # deprecated and does not modify the DataFrame under pandas
            # Copy-on-Write.
            survey[col_name] = survey[col_name].replace(regex=[old_value],
                                                        value=new_value)
            result = True

        if is_test:
            print(x_year, col_name)
            print("Replace:", result)
            print("Values ({}) before: {}\n".format(notna_sum, unique_list))
            print("Values ({}) after: {}\n".format(
                            survey[col_name].notna().sum(),
                                survey[col_name].unique()))

    return result


def group_update(base_info, upd_cols_info, x_year, is_test=False):
    """Fold several answer-choice columns of one survey year into a base column.

    Works on the DataFrame of ``x_year`` in the module-level ``datasets``
    dict and does nothing when the base column is missing. For every
    ``(column, pattern)`` pair in ``upd_cols_info`` the column is first
    rewritten to the base value through ``group_replace``. If that
    replacement was applied, empty cells of the base column are filled from
    the rewritten column and the rewritten column is dropped.

    Args:
        base_info: ``(column name, value)`` of the column that absorbs the
            others.
        upd_cols_info: Iterable of ``(column name, regex pattern)`` pairs to
            merge into the base column.
        x_year: Survey year, a key of ``datasets``.
        is_test: When true, print the missing-value count of the base column
            before and after every merge.

    Returns:
        None. The DataFrame in ``datasets`` is mutated.
    """
    base_col_name, _base_col_value = base_info
    survey = datasets.get(x_year)

    if base_col_name not in survey.columns:
        return

    for upd_col_info in upd_cols_info:
        col_name, _col_value = upd_col_info
        isna_sum = survey[base_col_name].isna().sum()

        # 1. Replace and get status
        result = group_replace(upd_col_info, base_info, x_year)

        if result:
            # 2. Fillna base_col_name <<< col_name
            # Assigned back rather than fillna(inplace=True) on the selected
            # column: the chained in-place form is deprecated and leaves the
            # DataFrame unchanged under pandas Copy-on-Write.
            survey[base_col_name] = survey[base_col_name].fillna(survey[col_name])
            # 3. Drop col_name
            survey.drop([col_name], axis=1, inplace=True)

        if is_test:
            print(x_year, base_col_name, "<<<", col_name)
            print("Update:", result)
            print("IsNa before:", isna_sum)
            print("IsNa after:", survey[base_col_name].isna().sum())


In [None]:
# Multi-column questions as (short name, [question-text variants]) pairs.
group_questions = questions_info.get('group questions')

# Column comparison table of every group, keyed by the group's position.
group_result_dict = {
    group_indx: group_cols_diff(question_versions)
    for group_indx, (_short_name, question_versions) in enumerate(group_questions)
}

# Keep a plain-dict snapshot of the tables as they look before any merging.
columns_info['group columns'] = {
    group_indx: group_table.to_dict()
    for group_indx, group_table in group_result_dict.items()
}

In [None]:
len(group_result_dict)

In [None]:
# Expose every group position as a module-level name (group_0, group_1, ...)
# so the cells below can refer to a group by name.
# globals() states the intent explicitly; writing through locals() only
# works because this runs at module level.
globals().update({'group_{}'.format(x): x for x in group_result_dict})

# Show the first and last group index without hard-coding the last name,
# which would raise NameError once the question list changes length.
print(min(group_result_dict), "...", max(group_result_dict))

### === Show group ===

In [None]:
group_result(group_0)

### Replace group

In [None]:
x_year = 2018

# Javascript
base_info = ('Q16_Part_6', 'Javascript')
col_info = ('Q16_Part_6', 'Javascript/Typescript')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q16_Part_18', 'Other')
upd_cols_info = [
    ('Q16_Part_7', 'Visual Basic/VBA'),
    ('Q16_Part_10', 'Scala'),
    ('Q16_Part_11', 'Julia'),
    ('Q16_Part_12', 'Go'),
    ('Q16_Part_13', 'C#/.NET'),
    ('Q16_Part_14', 'PHP'),
    ('Q16_Part_15', 'Ruby'),
    ('Q16_Part_16', 'SAS/STATA')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2019

# C/C++
# The second element of each pair is a regular expression (see
# group_replace), so it is written as a raw string where it needs a
# backslash: '\+' in a normal literal is an invalid escape sequence.
base_info = ('Q18_Part_4', 'C/C++')
col_info = ('Q18_Part_4', 'C$')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q18_Part_5', r'C\+\+')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q18_Part_12', 'Other')
upd_cols_info = [
    ('Q18_Part_8', 'TypeScript')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2020

# C/C++
# Patterns are regular expressions (see group_replace); the raw string
# keeps '\+' from being read as an invalid Python escape sequence.
base_info = ('Q7_Part_4', 'C/C++')
col_info = ('Q7_Part_4', 'C$')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q7_Part_5', r'C\+\+')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q7_OTHER', 'Other')
upd_cols_info = [
    ('Q7_Part_8', 'Julia'),
    ('Q7_Part_9', 'Swift')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2021

# C/C++
# Patterns are regular expressions (see group_replace); the raw string
# keeps '\+' from being read as an invalid Python escape sequence.
base_info = ('Q7_Part_4', 'C/C++')
col_info = ('Q7_Part_4', 'C$')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q7_Part_5', r'C\+\+')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q7_OTHER', 'Other')
upd_cols_info = [
    ('Q7_Part_8', 'Julia'),
    ('Q7_Part_9', 'Swift')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_0][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_0)

### === Show group ===

In [None]:
group_result(group_1)

### Replace group

In [None]:
x_year = 2018

# Jupyter (JupyterLab, Jupyter Notebooks, etc)
base_info = ('Q13_Part_1', 'Jupyter (JupyterLab, Jupyter Notebooks, etc)')
col_info = ('Q13_Part_1', 'Jupyter/IPython')
_ = group_replace(col_info, base_info, x_year)

# Visual Studio (Visual Studio Code)
base_info = ('Q13_Part_8', 'Visual Studio (Visual Studio Code)')
col_info = ('Q13_Part_8', 'Visual Studio')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q13_Part_4', 'Visual Studio Code')
]
group_update(base_info, upd_cols_info, x_year)

# Vim / Emacs
base_info = ('Q13_Part_11', 'Vim / Emacs')
col_info = ('Q13_Part_11', 'Vim')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q13_Part_15', 'Other')
upd_cols_info = [
    ('Q13_Part_6', 'Atom'),
    ('Q13_Part_5', 'nteract'),
    ('Q13_Part_12', 'IntelliJ')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2019

# Visual Studio (Visual Studio Code)
base_info = ('Q16_Part_6', 'Visual Studio (Visual Studio Code)')
col_info = ('Q16_Part_6', 'Visual Studio / Visual Studio Code')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q16_Part_12', 'Other')
upd_cols_info = [
    ('Q16_Part_4', 'Atom')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2020

# Visual Studio (Visual Studio Code)
# Patterns are regular expressions (see group_replace). Raw strings keep
# the backslash escapes ('\/', '\(', '\)') out of Python's own escape
# processing, where they are invalid escape sequences.
base_info = ('Q9_Part_3', 'Visual Studio (Visual Studio Code)')
col_info = ('Q9_Part_3', r'^Visual Studio$|Visual Studio \/ Visual Studio Code')
_ = group_replace(col_info, base_info, x_year)

upd_cols_info = [
    ('Q9_Part_4', r'Visual Studio Code \(VSCode\)')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2021
# Jupyter (JupyterLab, Jupyter Notebooks, etc)
base_info = ('Q9_Part_1', 'Jupyter (JupyterLab, Jupyter Notebooks, etc)')
upd_cols_info = [
    ('Q9_Part_11', 'Jupyter Notebook')
]
group_update(base_info, upd_cols_info, x_year)

# Visual Studio / Visual Studio Code
# Patterns are regular expressions (see group_replace); the raw string
# keeps '\(' and '\)' from being read as invalid Python escape sequences.
base_info = ('Q9_Part_3', 'Visual Studio (Visual Studio Code)')
col_info = ('Q9_Part_3', 'Visual Studio')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q9_Part_4', r'Visual Studio Code \(VSCode\)')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_1][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_1)

### === Show group ===

In [None]:
group_result(group_2)

### Replace group

In [None]:
x_year = 2018

# Kaggle Notebooks
base_info = ('Q14_Part_1', 'Kaggle Notebooks')
col_info = ('Q14_Part_1', 'Kaggle Kernels')
_ = group_replace(col_info, base_info, x_year)

# Azure Notebooks
base_info = ('Q14_Part_3', 'Azure Notebooks')
col_info = ('Q14_Part_3', 'Azure Notebook')
_ = group_replace(col_info, base_info, x_year)

# Google Colab Notebooks
base_info = ('Q14_Part_2', 'Google Colab Notebooks')
col_info = ('Q14_Part_2', 'Google Colab')
_ = group_replace(col_info, base_info, x_year)

# Google Cloud (AI Platform, Datalab, etc)
base_info = ('Q14_Part_5', 'Google Cloud (AI Platform, Datalab, etc)')
col_info = ('Q14_Part_5', 'Google Cloud Datalab')
_ = group_replace(col_info, base_info, x_year)

# Binder / JupyterHub
base_info = ('Q14_Part_9', 'Binder / JupyterHub')
col_info = ('Q14_Part_9', 'JupyterHub/Binder')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q14_Part_11', 'Other')
upd_cols_info = [
    ('Q14_Part_4', 'Domino Datalab'),
    ('Q14_Part_6', 'Paperspace'),
    ('Q14_Part_7', 'Floydhub'),
    ('Q14_Part_8', 'Crestle')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2019

# Patterns (second element of col_info / upd_cols_info pairs) are regular
# expressions, see group_replace. Those that escape parentheses are raw
# strings, because '\(' and '\)' are invalid escape sequences in a normal
# Python string literal.

# Kaggle Notebooks
base_info = ('Q17_Part_1', 'Kaggle Notebooks')
col_info = ('Q17_Part_1', r'Kaggle Notebooks \(Kernels\)')
_ = group_replace(col_info, base_info, x_year)

# Azure Notebooks
base_info = ('Q17_Part_3', 'Azure Notebooks')
col_info = ('Q17_Part_3', 'Microsoft Azure Notebooks')
_ = group_replace(col_info, base_info, x_year)

# Google Colab Notebooks
base_info = ('Q17_Part_2', 'Google Colab Notebooks')
col_info = ('Q17_Part_2', 'Google Colab')
_ = group_replace(col_info, base_info, x_year)

# Google Cloud (AI Platform, Datalab, etc)
base_info = ('Q17_Part_4', 'Google Cloud (AI Platform, Datalab, etc)')
col_info = ('Q17_Part_4', r'Google Cloud Notebook Products \(AI Platform, Datalab, etc\)')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q17_Part_12', 'Other')
upd_cols_info = [
    ('Q17_Part_5', 'Paperspace / Gradient'),
    ('Q17_Part_8', 'IBM Watson Studio'),
    ('Q17_Part_9', 'Code Ocean'),
    ('Q17_Part_6', 'FloydHub'),
    ('Q17_Part_10', r'AWS Notebook Products \(EMR Notebooks, Sagemaker Notebooks, etc\)')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2020

# Google Colab Notebooks
base_info = ('Q10_Part_2', 'Google Colab Notebooks')
col_info = ('Q10_Part_2', 'Colab Notebooks')
_ = group_replace(col_info, base_info, x_year)

# Google Cloud (AI Platform, Datalab, etc)
base_info = ('Q10_Part_10', 'Google Cloud (AI Platform, Datalab, etc)')
col_info = ('Q10_Part_10', 'Google Cloud AI Platform Notebooks')
_ = group_replace(col_info, base_info, x_year)

upd_cols_info = [
    ('Q10_Part_11', 'Google Cloud Datalab Notebooks')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q10_OTHER', 'Other')
upd_cols_info = [
    ('Q10_Part_4', 'Paperspace / Gradient'),
    ('Q10_Part_6', 'Code Ocean'),
    ('Q10_Part_7', 'IBM Watson Studio'),
    ('Q10_Part_8', 'Amazon Sagemaker Studio'),
    ('Q10_Part_9', 'Amazon EMR Notebooks'),
    ('Q10_Part_12', 'Databricks Collaborative Notebooks')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2021

# Google Colab Notebooks
base_info = ('Q10_Part_2', 'Google Colab Notebooks')
col_info = ('Q10_Part_2', 'Colab Notebooks')
_ = group_replace(col_info, base_info, x_year)

# Google Cloud (AI Platform, Datalab, etc)
# The pattern is a regular expression (see group_replace); the raw string
# keeps '\(' and '\)' from being read as invalid Python escape sequences.
base_info = ('Q10_Part_10', 'Google Cloud (AI Platform, Datalab, etc)')
col_info = ('Q10_Part_10', r'Google Cloud Notebooks \(AI Platform / Vertex AI\)')
_ = group_replace(col_info, base_info, x_year)

upd_cols_info = [
    ('Q10_Part_11', 'Google Cloud Datalab')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q10_OTHER', 'Other')
upd_cols_info = [
    ('Q10_Part_4', 'Paperspace / Gradient'),
    ('Q10_Part_6', 'Code Ocean'),
    ('Q10_Part_7', 'IBM Watson Studio'),
    ('Q10_Part_8', 'Amazon Sagemaker Studio Notebooks'),
    ('Q10_Part_9', 'Amazon EMR Notebooks'),
    ('Q10_Part_12', 'Databricks Collaborative Notebooks'),
    ('Q10_Part_13', 'Zeppelin / Zepl Notebooks'),
    ('Q10_Part_14', 'Deepnote Notebooks'),
    ('Q10_Part_15', 'Observable Notebooks')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_2][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_2)

### === Show group ===

In [None]:
group_result(group_3)

### Replace group

In [None]:
x_year = 2019

# CPUs >>> 2019 year: 10472
datasets.get(x_year).drop(['Q21_Part_1'], axis=1, inplace=True)

# None
base_info = ('Q21_Part_4', 'None')
col_info = ('Q21_Part_4', 'None / I do not know')
_ = group_replace(col_info, base_info, x_year)

In [None]:
x_year = 2021

# GPUs
base_info = ('Q12_Part_1', 'GPUs')
col_info = ('Q12_Part_1', 'NVIDIA GPUs')
_ = group_replace(col_info, base_info, x_year)

# TPUs
base_info = ('Q12_Part_2', 'TPUs')
col_info = ('Q12_Part_2', 'Google Cloud TPUs')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q12_OTHER', 'Other')
upd_cols_info = [
    ('Q12_Part_3', 'AWS Trainium Chips'),
    ('Q12_Part_4', 'AWS Inferentia Chips')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_3][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_3)

### === Show group ===

In [None]:
group_result(group_4)

### Replace group

In [None]:
x_year = 2018

# Ggplot / ggplot2
base_info = ('Q21_Part_1', 'Ggplot / ggplot2')
col_info = ('Q21_Part_1', 'ggplot2')
_ = group_replace(col_info, base_info, x_year)

# Plotly / Plotly Express
base_info = ('Q21_Part_6', 'Plotly / Plotly Express')
col_info = ('Q21_Part_6', 'Plotly')
_ = group_replace(col_info, base_info, x_year)

# D3 js
base_info = ('Q21_Part_5', 'D3 js')
col_info = ('Q21_Part_5', 'D3')
_ = group_replace(col_info, base_info, x_year)

# Leaflet / Folium
base_info = ('Q21_Part_10', 'Leaflet / Folium')
col_info = ('Q21_Part_10', 'Leaflet')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q21_Part_13', 'Other')
upd_cols_info = [
    ('Q21_Part_11', 'Lattice')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2019

# D3 js
# The pattern is a regular expression (see group_replace); the raw string
# keeps '\.' from being read as an invalid Python escape sequence.
base_info = ('Q20_Part_5', 'D3 js')
col_info = ('Q20_Part_5', r'D3\.js')
_ = group_replace(col_info, base_info, x_year)


### Check group

In [None]:
_ = group_cols_diff(group_questions[group_4][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_4)

### === Show group ===

In [None]:
group_result(group_5)

### Replace group

In [None]:
x_year = 2018

# Scikit-learn
base_info = ('Q19_Part_1', 'Scikit-learn')
col_info = ('Q19_Part_1', 'Scikit-Learn')
_ = group_replace(col_info, base_info, x_year)

# Fast.ai
base_info = ('Q19_Part_7', 'Fast.ai')
col_info = ('Q19_Part_7', 'Fastai')
_ = group_replace(col_info, base_info, x_year)

# LightGBM
base_info = ('Q19_Part_14', 'LightGBM')
col_info = ('Q19_Part_14', 'lightgbm')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q19_Part_19', 'Other')
upd_cols_info = [
    ('Q19_Part_5', 'Spark MLlib'),
    ('Q19_Part_6', 'H20'),
    ('Q19_Part_8', 'Mxnet'),
    ('Q19_Part_11', 'mlr'),
    ('Q19_Part_12', 'Prophet'),
    ('Q19_Part_13', 'randomForest'),
    ('Q19_Part_15', 'catboost'),
    ('Q19_Part_16', 'CNTK'),
    ('Q19_Part_17', 'Caffe')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2019

# Other
base_info = ('Q28_Part_12', 'Other')
upd_cols_info = [
    ('Q28_Part_4', 'RandomForest'),
    ('Q28_Part_9', 'Spark MLib')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2020

# Other
base_info = ('Q16_OTHER', 'Other')
upd_cols_info = [
    ('Q16_Part_6', 'MXNet'),
    ('Q16_Part_9', 'CatBoost'),
    ('Q16_Part_10', 'Prophet'),
    ('Q16_Part_11', 'H2O 3'),
    ('Q16_Part_13', 'Tidymodels'),
    ('Q16_Part_14', 'JAX')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2021

# PyTorch
base_info = ('Q16_Part_4', 'PyTorch')
upd_cols_info = [
    ('Q16_Part_15', 'PyTorch Lightning')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q16_OTHER', 'Other')
upd_cols_info = [
    ('Q16_Part_6', 'MXNet'),
    ('Q16_Part_9', 'CatBoost'),
    ('Q16_Part_10', 'Prophet'),
    ('Q16_Part_11', 'H2O 3'),
    ('Q16_Part_13', 'Tidymodels'),
    ('Q16_Part_14', 'JAX'),
    ('Q16_Part_16', 'Huggingface')    
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_5][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_5)

### === Show group ===

In [None]:
group_result(group_6)

### Replace group

In [None]:
x_year = 2019

# Transformer Networks
# The pattern is a regular expression (see group_replace); the raw string
# keeps '\(' and '\)' from being read as invalid Python escape sequences.
base_info = ('Q24_Part_10', 'Transformer Networks (BERT, gpt, etc)')
col_info = ('Q24_Part_10', r'Transformer Networks \(BERT, gpt-2, etc\)')
_ = group_replace(col_info, base_info, x_year)

In [None]:
x_year = 2020

# Transformer Networks
# The pattern is a regular expression (see group_replace); the raw string
# keeps '\(' and '\)' from being read as invalid Python escape sequences.
base_info = ('Q17_Part_10', 'Transformer Networks (BERT, gpt, etc)')
col_info = ('Q17_Part_10', r'Transformer Networks \(BERT, gpt-3, etc\)')
_ = group_replace(col_info, base_info, x_year)

In [None]:
x_year = 2021

# Transformer Networks
# The pattern is a regular expression (see group_replace); the raw string
# keeps '\(' and '\)' from being read as invalid Python escape sequences.
base_info = ('Q17_Part_10', 'Transformer Networks (BERT, gpt, etc)')
col_info = ('Q17_Part_10', r'Transformer Networks \(BERT, gpt-3, etc\)')
_ = group_replace(col_info, base_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_6][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_6)

### === Show group ===

In [None]:
group_result(group_7)

### Replace column names

In [None]:
_ = group_cols_diff(group_questions[group_7][1])
replace_cols_name(_, group_7)

### === Show group ===

In [None]:
group_result(group_8)

### Replace group

In [None]:
# Transformer language models
# The year-specific model name in the choice text is unified to 'GPT'.
# Patterns are regular expressions (see group_replace); raw strings keep
# '\(' and '\)' from being read as invalid Python escape sequences.

x_year = 2019
base_info = ('Q27_Part_4', 'Transformer language models (GPT, BERT, XLnet, etc)')
col_info = ('Q27_Part_4', r'Transformer language models \(GPT-2, BERT, XLnet, etc\)')
_ = group_replace(col_info, base_info, x_year)

x_year = 2020
base_info = ('Q19_Part_4', 'Transformer language models (GPT, BERT, XLnet, etc)')
col_info = ('Q19_Part_4', r'Transformer language models \(GPT-3, BERT, XLnet, etc\)')
_ = group_replace(col_info, base_info, x_year)

x_year = 2021
base_info = ('Q19_Part_4', 'Transformer language models (GPT, BERT, XLnet, etc)')
col_info = ('Q19_Part_4', r'Transformer language models \(GPT-3, BERT, XLnet, etc\)')
_ = group_replace(col_info, base_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_8][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_8)

### === Show group ===

In [None]:
group_result(group_9)

### Replace column names

In [None]:
_ = group_cols_diff(group_questions[group_9][1])
replace_cols_name(_, group_9)

### === Show group ===

In [None]:
group_result(group_10)

### Replace group

In [None]:
x_year = 2019

# IBM Cloud / Red Hat
base_info = ('Q29_Part_4', 'IBM Cloud / Red Hat')
col_info = ('Q29_Part_4', 'IBM Cloud')
_ = group_replace(col_info, base_info, x_year)

upd_cols_info = [
    ('Q29_Part_10', 'Red Hat Cloud')    
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q29_Part_12', 'Other')
upd_cols_info = [
    ('Q29_Part_5', 'Alibaba Cloud'),
    ('Q29_Part_6', 'Salesforce Cloud'),
    ('Q29_Part_9', 'VMware Cloud')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2020

# Other
base_info = ('Q26_A_OTHER', 'Other')
upd_cols_info = [
    ('Q26_A_Part_7', 'Salesforce Cloud'),
    ('Q26_A_Part_8', 'VMware Cloud'),
    ('Q26_A_Part_9', 'Alibaba Cloud'),
    ('Q26_A_Part_10', 'Tencent Cloud')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
x_year = 2021

# Other
base_info = ('Q27_A_OTHER', 'Other')
upd_cols_info = [
    ('Q27_A_Part_7', 'Salesforce Cloud'),
    ('Q27_A_Part_8', 'VMware Cloud'),
    ('Q27_A_Part_9', 'Alibaba Cloud'),
    ('Q27_A_Part_10', 'Tencent Cloud')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_10][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_10)

### === Show group ===

In [None]:
group_result(group_11)

### Check group

### Replace column names

In [None]:
# Compute the column differences for group 11 and hand them straight to the
# rename step. A bare `_` between the two statements had no effect (only the
# last expression of a cell is displayed), so it is dropped.
_ = group_cols_diff(group_questions[group_11][1])
replace_cols_name(_, group_11)

### === Show group ===

In [None]:
group_result(group_12)

### === Show group ===

In [None]:
group_result(group_13)

### === Show group ===

In [None]:
group_result(group_14)

### Replace group

In [None]:
# 2020 survey, Q29_A answer columns.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2020

# PostgreSQL
# (the source label is spelled 'PostgresSQL')
base_info = ('Q29_A_Part_2', 'PostgreSQL')
col_info = ('Q29_A_Part_2', 'PostgresSQL')
_ = group_replace(col_info, base_info, x_year)

# Microsoft (SQL Server, Azure Database, Storage, etc)
base_info = ('Q29_A_Part_8', 'Microsoft (SQL Server, Azure Database, Storage, etc)')
col_info = ('Q29_A_Part_8', 'Microsoft SQL Server')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q29_A_Part_10', 'Microsoft Azure Data Lake Storage'),
    ('Q29_A_Part_9', 'Microsoft Access')
]
group_update(base_info, upd_cols_info, x_year)

# Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)
base_info = ('Q29_A_Part_11', 'Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)')
col_info = ('Q29_A_Part_11', 'Amazon Redshift')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q29_A_Part_12', 'Amazon Athena'),
    ('Q29_A_Part_13', 'Amazon DynamoDB')
]
group_update(base_info, upd_cols_info, x_year)

# Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)
base_info = ('Q29_A_Part_14', 'Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)')
col_info = ('Q29_A_Part_14', 'Google Cloud BigQuery')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q29_A_Part_15', 'Google Cloud SQL'),
    ('Q29_A_Part_16', 'Google Cloud Firestore')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q29_A_OTHER', 'Other')
upd_cols_info = [
    ('Q29_A_Part_6', 'Snowflake'),
    ('Q29_A_Part_7', 'IBM Db2')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q32_A answer columns.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2021

# Microsoft (SQL Server, Azure Database, Storage, etc)
base_info = ('Q32_A_Part_8', 'Microsoft (SQL Server, Azure Database, Storage, etc)')
col_info = ('Q32_A_Part_8', 'Microsoft SQL Server')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q32_A_Part_9', 'Microsoft Azure SQL Database'),
    ('Q32_A_Part_10', 'Microsoft Azure Cosmos DB')
]
group_update(base_info, upd_cols_info, x_year)

# Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)
base_info = ('Q32_A_Part_11', 'Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)')
col_info = ('Q32_A_Part_11', 'Amazon Redshift')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q32_A_Part_12', 'Amazon Aurora'),
    ('Q32_A_Part_13', 'Amazon RDS'),
    ('Q32_A_Part_14', 'Amazon DynamoDB')
]
group_update(base_info, upd_cols_info, x_year)

# Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)
base_info = ('Q32_A_Part_15', 'Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)')
col_info = ('Q32_A_Part_15', 'Google Cloud BigQuery')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q32_A_Part_16', 'Google Cloud SQL'),
    ('Q32_A_Part_17', 'Google Cloud Firestore'),
    ('Q32_A_Part_18', 'Google Cloud BigTable'),
    ('Q32_A_Part_19', 'Google Cloud Spanner')
]
group_update(base_info, upd_cols_info, x_year)
    
# Other
base_info = ('Q32_A_OTHER', 'Other')
upd_cols_info = [
    ('Q32_A_Part_6', 'Snowflake'),
    ('Q32_A_Part_7', 'IBM Db2')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_14][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_14)

### === Show group ===

In [None]:
group_result(group_15)

### Replace group

In [None]:
# 2020 survey, Q29_B answer columns (same labels as the Q29_A columns).
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2020

# PostgreSQL
# (the source label is spelled 'PostgresSQL')
base_info = ('Q29_B_Part_2', 'PostgreSQL')
col_info = ('Q29_B_Part_2', 'PostgresSQL')
_ = group_replace(col_info, base_info, x_year)

# Microsoft (SQL Server, Azure Database, Storage, etc)
base_info = ('Q29_B_Part_8', 'Microsoft (SQL Server, Azure Database, Storage, etc)')
col_info = ('Q29_B_Part_8', 'Microsoft SQL Server')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q29_B_Part_10', 'Microsoft Azure Data Lake Storage'),
    ('Q29_B_Part_9', 'Microsoft Access')
]
group_update(base_info, upd_cols_info, x_year)

# Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)
base_info = ('Q29_B_Part_11', 'Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)')
col_info = ('Q29_B_Part_11', 'Amazon Redshift')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q29_B_Part_12', 'Amazon Athena'),
    ('Q29_B_Part_13', 'Amazon DynamoDB')
]
group_update(base_info, upd_cols_info, x_year)

# Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)
base_info = ('Q29_B_Part_14', 'Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)')
col_info = ('Q29_B_Part_14', 'Google Cloud BigQuery')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q29_B_Part_15', 'Google Cloud SQL'),
    ('Q29_B_Part_16', 'Google Cloud Firestore')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q29_B_OTHER', 'Other')
upd_cols_info = [
    ('Q29_B_Part_6', 'Snowflake'),
    ('Q29_B_Part_7', 'IBM Db2')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q32_B answer columns.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2021

# Microsoft (SQL Server, Azure Database, Storage, etc)
base_info = ('Q32_B_Part_8', 'Microsoft (SQL Server, Azure Database, Storage, etc)')
col_info = ('Q32_B_Part_8', 'Microsoft SQL Server')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q32_B_Part_9', 'Microsoft Azure SQL Database'),
    ('Q32_B_Part_10', 'Microsoft Azure Cosmos DB')
]
group_update(base_info, upd_cols_info, x_year)

# Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)
base_info = ('Q32_B_Part_11', 'Amazon (Redshift, Aurora, RDS, DynamoDB, Athena, etc)')
col_info = ('Q32_B_Part_11', 'Amazon Redshift')
_ = group_replace(col_info, base_info, x_year)
# Part_13 is 'Amazon RDS' and Part_14 is 'Amazon DynamoDB', the same
# Part-number/answer pairing as the Q32_A columns; the two labels were swapped.
upd_cols_info = [
    ('Q32_B_Part_12', 'Amazon Aurora'),
    ('Q32_B_Part_13', 'Amazon RDS'),
    ('Q32_B_Part_14', 'Amazon DynamoDB')
]
group_update(base_info, upd_cols_info, x_year)

# Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)
base_info = ('Q32_B_Part_15', 'Google (Cloud BigQuery, SQL, Firestore, BigTable, etc)')
col_info = ('Q32_B_Part_15', 'Google Cloud BigQuery')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q32_B_Part_16', 'Google Cloud SQL'),
    ('Q32_B_Part_17', 'Google Cloud Firestore'),
    ('Q32_B_Part_18', 'Google Cloud BigTable'),
    ('Q32_B_Part_19', 'Google Cloud Spanner')
]
group_update(base_info, upd_cols_info, x_year)

# Other
base_info = ('Q32_B_OTHER', 'Other')
upd_cols_info = [
    ('Q32_B_Part_6', 'Snowflake'),
    ('Q32_B_Part_7', 'IBM Db2')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_15][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_15)

### === Show group ===

In [None]:
group_result(group_16)

### Replace group

In [None]:
# 2020 survey, Q31_A answer columns.
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2020

# Other
base_info = ('Q31_A_OTHER', 'Other')
upd_cols_info = [
    ('Q31_A_Part_4', 'Looker'),
    ('Q31_A_Part_9', 'Domo'),
    ('Q31_A_Part_10', 'TIBCO Spotfire'),
    ('Q31_A_Part_11', 'Alteryx'),
    ('Q31_A_Part_12', 'Sisense'),
    ('Q31_A_Part_13', 'SAP Analytics Cloud'),
    ('Q31_A_Part_7', 'Einstein Analytics')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q34_A answer columns.
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2021

# Other
base_info = ('Q34_A_OTHER', 'Other')
upd_cols_info = [
    ('Q34_A_Part_4', 'Looker'),
    ('Q34_A_Part_7', 'Tableau CRM'),
    ('Q34_A_Part_9', 'Domo'),
    ('Q34_A_Part_10', 'TIBCO Spotfire'),
    ('Q34_A_Part_11', 'Alteryx'),
    ('Q34_A_Part_12', 'Sisense'),
    ('Q34_A_Part_13', 'SAP Analytics Cloud'),
    ('Q34_A_Part_14', 'Microsoft Azure Synapse'),
    ('Q34_A_Part_15', 'Thoughtspot')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_16][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_16)

### === Show group ===

In [None]:
group_result(group_17)

### Replace group

In [None]:
# 2020 survey, Q31_B answer columns (same labels as the Q31_A columns).
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2020

# Other
base_info = ('Q31_B_OTHER', 'Other')
upd_cols_info = [
    ('Q31_B_Part_4', 'Looker'),
    ('Q31_B_Part_9', 'Domo'),
    ('Q31_B_Part_10', 'TIBCO Spotfire'),
    ('Q31_B_Part_11', 'Alteryx'),
    ('Q31_B_Part_12', 'Sisense'),
    ('Q31_B_Part_13', 'SAP Analytics Cloud'),
    ('Q31_B_Part_7', 'Einstein Analytics')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q34_B answer columns (same labels as the Q34_A columns).
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2021

# Other
base_info = ('Q34_B_OTHER', 'Other')
upd_cols_info = [
    ('Q34_B_Part_4', 'Looker'),
    ('Q34_B_Part_7', 'Tableau CRM'),
    ('Q34_B_Part_9', 'Domo'),
    ('Q34_B_Part_10', 'TIBCO Spotfire'),
    ('Q34_B_Part_11', 'Alteryx'),
    ('Q34_B_Part_12', 'Sisense'),
    ('Q34_B_Part_13', 'SAP Analytics Cloud'),
    ('Q34_B_Part_14', 'Microsoft Azure Synapse'),
    ('Q34_B_Part_15', 'Thoughtspot')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_17][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_17)

### === Show group ===

In [None]:
group_result(group_18)

### Replace group

In [None]:
x_year = 2020

# Automation of full ML pipelines: the pattern spells "H20" (digit zero) while
# the base label uses "H2O". The pattern is a raw string so its regex escapes
# (\( \. \)) are not treated as invalid string escape sequences; the string
# value itself is unchanged.
base_info = ('Q33_A_Part_6', 'Automation of full ML pipelines (e.g. Google AutoML, H2O Driverless AI)')
col_info = ('Q33_A_Part_6', r'Automation of full ML pipelines \(e\.g\. Google AutoML, H20 Driverless AI\)')
_ = group_replace(col_info, base_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_18][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_18)

### === Show group ===

In [None]:
group_result(group_19)

### Replace group

In [None]:
x_year = 2020

# Automation of full ML pipelines: the pattern spells "H20" (digit zero) while
# the base label uses "H2O". The pattern is a raw string so its regex escapes
# (\( \. \)) are not treated as invalid string escape sequences; the string
# value itself is unchanged.
base_info = ('Q33_B_Part_6', 'Automation of full ML pipelines (e.g. Google Cloud AutoML, H2O Driverless AI)')
col_info = ('Q33_B_Part_6', r'Automation of full ML pipelines \(e\.g\. Google Cloud AutoML, H20 Driverless AI\)')
_ = group_replace(col_info, base_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_19][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_19)

### === Show group ===

In [None]:
group_result(group_20)

### Replace group

In [None]:
# 2020 survey, Q34_A answer columns.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2020

# H2O Driverless AI
# (the source label is spelled 'H20', with a digit zero)
base_info = ('Q34_A_Part_2', 'H2O Driverless AI')
col_info = ('Q34_A_Part_2', 'H20 Driverless AI')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q34_A_OTHER', 'Other')
upd_cols_info = [
    ('Q34_A_Part_5', 'Tpot'),
    ('Q34_A_Part_6', 'Auto-Keras'),
    ('Q34_A_Part_7', 'Auto-Sklearn'),
    ('Q34_A_Part_8', 'Auto_ml'),
    ('Q34_A_Part_9', 'Xcessiv'),
    ('Q34_A_Part_10', 'MLbox')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q37_A answer columns.
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2021

# Other
base_info = ('Q37_A_OTHER', 'Other')
upd_cols_info = [
    ('Q37_A_Part_5', 'Amazon Sagemaker Autopilot'),
    ('Q37_A_Part_6', 'Azure Automated Machine Learning')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_20][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_20)

### === Show group ===

In [None]:
group_result(group_21)

### Replace group

In [None]:
# 2020 survey, Q34_B answer columns (same labels as the Q34_A columns).
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2020

# H2O Driverless AI
# (the source label is spelled 'H20', with a digit zero)
base_info = ('Q34_B_Part_2', 'H2O Driverless AI')
col_info = ('Q34_B_Part_2', 'H20 Driverless AI')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q34_B_OTHER', 'Other')
upd_cols_info = [
    ('Q34_B_Part_5', 'Tpot'),
    ('Q34_B_Part_6', 'Auto-Keras'),
    ('Q34_B_Part_7', 'Auto-Sklearn'),
    ('Q34_B_Part_8', 'Auto_ml'),
    ('Q34_B_Part_9', 'Xcessiv'),
    ('Q34_B_Part_10', 'MLbox')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q37_B answer columns (same labels as the Q37_A columns).
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2021

# Other
base_info = ('Q37_B_OTHER', 'Other')
upd_cols_info = [
    ('Q37_B_Part_5', 'Amazon Sagemaker Autopilot'),
    ('Q37_B_Part_6', 'Azure Automated Machine Learning')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_21][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_21)

### === Show group ===

In [None]:
group_result(group_22)

### === Show group ===

In [None]:
group_result(group_23)

### === Show group ===

In [None]:
group_result(group_24)

### Replace column names

In [None]:
_ = group_cols_diff(group_questions[group_24][1])
replace_cols_name(_, group_24)

### === Show group ===

In [None]:
group_result(group_25)

### Replace group

In [None]:
# 2019 survey, Q13 answer columns.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
x_year = 2019

# Kaggle Learn Courses
# The pattern is a raw string so its regex escapes (\( \. \)) are not treated
# as invalid string escape sequences; the string value is unchanged.
base_info = ('Q13_Part_6', 'Kaggle Learn Courses')
col_info = ('Q13_Part_6', r'Kaggle Courses \(i\.e\. Kaggle Learn\)')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q13_Part_12', 'Other')
upd_cols_info = [
    ('Q13_Part_5', 'DataQuest')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2020 survey, Q37 answer columns.
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2020

# Other
# The label is a raw string so its regex escapes (\( \)) are not treated as
# invalid string escape sequences; the string value is unchanged.
base_info = ('Q37_OTHER', 'Other')
upd_cols_info = [
    ('Q37_Part_9', r'Cloud-certification programs \(direct from AWS, Azure, GCP, or similar\)')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q40 answer columns.
# NOTE(review): group_update is defined earlier in the notebook; presumably it
# folds the listed answer columns into the base ('Other') column — confirm.
x_year = 2021

# Other
# The label is a raw string so its regex escapes (\( \)) are not treated as
# invalid string escape sequences; the string value is unchanged.
base_info = ('Q40_OTHER', 'Other')
upd_cols_info = [
    ('Q40_Part_9', r'Cloud-certification programs \(direct from AWS, Azure, GCP, or similar\)')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_25][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_25)

### === Show group ===

In [None]:
group_result(group_26)

### Replace group

In [None]:
# 2018 survey, Q38 answer columns.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
# Labels that contain regex escapes are raw strings so the backslashes are not
# treated as invalid string escape sequences; the string values are unchanged.
x_year = 2018

base_info = ('Q38_Part_3', 'Reddit')
col_info = ('Q38_Part_3', 'r/machinelearning')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q38_Part_4', 'Kaggle')
col_info = ('Q38_Part_4', 'Kaggle forums')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q38_Part_5', 'Course Forums')
col_info = ('Q38_Part_5', 'Fastai forums')
_ = group_replace(col_info, base_info, x_year)

# YouTube
base_info = ('Q38_Part_6', 'YouTube')
col_info = ('Q38_Part_6', 'Siraj Raval YouTube Channel')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q38_Part_9', r'Cloud AI Adventures \(YouTube\)')
]
group_update(base_info, upd_cols_info, x_year)

# Podcasts
base_info = ('Q38_Part_8', 'Podcasts')
col_info = ('Q38_Part_8', 'Linear Digressions Podcast')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q38_Part_16', 'Partially Derivative Podcast'),
    ('Q38_Part_17', 'The Data Skeptic Podcast')
]
group_update(base_info, upd_cols_info, x_year)

# Blogs
base_info = ('Q38_Part_13', 'Blogs')
col_info = ('Q38_Part_13', 'FastML Blog')
_ = group_replace(col_info, base_info, x_year)
upd_cols_info = [
    ('Q38_Part_14', 'KDnuggets Blog'),
    ('Q38_Part_18', 'Medium Blog Posts'),
    ('Q38_Part_19', 'Towards Data Science Blog'),
    ('Q38_Part_20', 'Analytics Vidhya Blog')
]
group_update(base_info, upd_cols_info, x_year)

base_info = ('Q38_Part_12', 'Journals')
col_info = ('Q38_Part_12', 'Journal Publications')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q38_Part_21', 'None')
col_info = ('Q38_Part_21', 'None/I do not know')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q38_Part_22', 'Other')
upd_cols_info = [
    ('Q38_Part_2', 'Hacker News'),
    ('Q38_Part_7', 'DataTau News Aggregator'),
    ('Q38_Part_10', r'FiveThirtyEight\.com'),
    ('Q38_Part_11', 'ArXiv & Preprints'),
    ('Q38_Part_15', "O'Reilly Data Newsletter")
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2019 survey, Q12 answer columns: shorten the detailed labels to bare names.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
# Patterns are raw strings so their regex escapes (\( \. \: \)) are not
# treated as invalid string escape sequences; the string values are unchanged.
x_year = 2019

base_info = ('Q12_Part_1', 'Twitter')
col_info = ('Q12_Part_1', r'Twitter \(data science influencers\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_3', 'Reddit')
col_info = ('Q12_Part_3', r'Reddit \(r/machinelearning, r/datascience, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_4', 'Kaggle')
col_info = ('Q12_Part_4', r'Kaggle \(forums, blog, social media, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_5', 'Course Forums')
col_info = ('Q12_Part_5', r'Course Forums \(forums\.fast\.ai, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_6', 'YouTube')
col_info = ('Q12_Part_6', r'YouTube \(Cloud AI Adventures, Siraj Raval, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_7', 'Podcasts')
col_info = ('Q12_Part_7', r'Podcasts \(Chai Time Data Science, Linear Digressions, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_8', 'Blogs')
col_info = ('Q12_Part_8', r'Blogs \(Towards Data Science, Medium, Analytics Vidhya, KDnuggets etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q12_Part_9', 'Journals')
col_info = ('Q12_Part_9', r'Journal Publications \(traditional publications, preprint journals, etc\)')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q12_Part_12', 'Other')
upd_cols_info = [
    ('Q12_Part_10', r'Slack Communities \(ods\.ai, kagglenoobs, etc\)'),
    ('Q12_Part_2', r'Hacker News \(https\://news\.ycombinator\.com/\)')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2020 survey, Q39 answer columns: shorten the detailed labels to bare names.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
# Patterns are raw strings so their regex escapes (\( \. \)) are not treated
# as invalid string escape sequences; the string values are unchanged.
x_year = 2020

base_info = ('Q39_Part_1', 'Twitter')
col_info = ('Q39_Part_1', r'Twitter \(data science influencers\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_3', 'Reddit')
col_info = ('Q39_Part_3', r'Reddit \(r/machinelearning, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_4', 'Kaggle')
col_info = ('Q39_Part_4', r'Kaggle \(notebooks, forums, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_5', 'Course Forums')
col_info = ('Q39_Part_5', r'Course Forums \(forums\.fast\.ai, Coursera forums, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_6', 'YouTube')
col_info = ('Q39_Part_6', r'YouTube \(Kaggle YouTube, Cloud AI Adventures, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_7', 'Podcasts')
col_info = ('Q39_Part_7', r'Podcasts \(Chai Time Data Science, O’Reilly Data Show, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_8', 'Blogs')
col_info = ('Q39_Part_8', r'Blogs \(Towards Data Science, Analytics Vidhya, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q39_Part_9', 'Journals')
col_info = ('Q39_Part_9', r'Journal Publications \(peer-reviewed journals, conference proceedings, etc\)')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q39_OTHER', 'Other')
upd_cols_info = [
    ('Q39_Part_2', r"Email newsletters \(Data Elixir, O'Reilly Data & AI, etc\)"),
    ('Q39_Part_10', r'Slack Communities \(ods\.ai, kagglenoobs, etc\)')
]
group_update(base_info, upd_cols_info, x_year)

In [None]:
# 2021 survey, Q42 answer columns: shorten the detailed labels to bare names.
# NOTE(review): group_replace / group_update are defined earlier in the
# notebook; presumably group_replace relabels the answer matched by col_info
# to the base_info text and group_update folds the listed columns into the
# base column — confirm against their definitions.
# Patterns are raw strings so their regex escapes (\( \. \)) are not treated
# as invalid string escape sequences; the string values are unchanged.
x_year = 2021

base_info = ('Q42_Part_1', 'Twitter')
col_info = ('Q42_Part_1', r'Twitter \(data science influencers\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_3', 'Reddit')
col_info = ('Q42_Part_3', r'Reddit \(r/machinelearning, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_4', 'Kaggle')
col_info = ('Q42_Part_4', r'Kaggle \(notebooks, forums, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_5', 'Course Forums')
col_info = ('Q42_Part_5', r'Course Forums \(forums\.fast\.ai, Coursera forums, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_6', 'YouTube')
col_info = ('Q42_Part_6', r'YouTube \(Kaggle YouTube, Cloud AI Adventures, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_7', 'Podcasts')
col_info = ('Q42_Part_7', r'Podcasts \(Chai Time Data Science, O’Reilly Data Show, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_8', 'Blogs')
col_info = ('Q42_Part_8', r'Blogs \(Towards Data Science, Analytics Vidhya, etc\)')
_ = group_replace(col_info, base_info, x_year)

base_info = ('Q42_Part_9', 'Journals')
col_info = ('Q42_Part_9', r'Journal Publications \(peer-reviewed journals, conference proceedings, etc\)')
_ = group_replace(col_info, base_info, x_year)

# Other
base_info = ('Q42_OTHER', 'Other')
upd_cols_info = [
    ('Q42_Part_2', r"Email newsletters \(Data Elixir, O'Reilly Data & AI, etc\)"),
    ('Q42_Part_10', r'Slack Communities \(ods\.ai, kagglenoobs, etc\)')
]
group_update(base_info, upd_cols_info, x_year)

### Check group

In [None]:
_ = group_cols_diff(group_questions[group_26][1])
_

### Replace column names

In [None]:
replace_cols_name(_, group_26)

# 4. Create main dataset

In [None]:
# For each yearly dataset keep the columns whose name contains "SA" or "GA",
# take the rows from index label 1 onward, and stack the years under a
# year-keyed outer index level.
concat_keys = list(datasets.keys())
concat_data = []

for x_year in concat_keys:
    year_frame = datasets.get(x_year)
    selected_cols = year_frame.filter(regex="SA|GA", axis=1).columns.to_list()
    concat_data.append(year_frame.loc[1:, selected_cols])

concated_data = pd.concat(concat_data, keys=concat_keys)

In [None]:
# For every column of the combined data, store the leading row(s) (index
# labels up to and including 0) from the first yearly dataset that has it.
dataset_head = {}

for col_name in concated_data.columns:
    if col_name in dataset_head:
        continue
    for x_year, year_frame in datasets.items():
        if col_name in year_frame.columns:
            dataset_head[col_name] = year_frame.loc[:0, col_name]
            break

dataset_head = pd.DataFrame(dataset_head)

In [None]:
# Turn the outer (year) index level into a regular 'Year' column, then
# renumber the rows from zero.
dataset = concated_data.reset_index(level=0)
dataset = dataset.rename(columns={'level_0': 'Year'})
dataset = dataset.reset_index(drop=True)

# Description table: a 'Year' label joined with the per-column header row(s).
year_head = pd.DataFrame({"Year": ["Dataset year"]})
dataset_description = year_head.join(dataset_head)

In [None]:
print(dataset.shape)
print(dataset_description.shape)

In [None]:
dataset_description

# 5. Additional changes

In [None]:
# Col: 'Duration'
# Give the 'SA0' column a readable name in both the data and its description.
_ = {'SA0': 'Duration'}
dataset.rename(columns=_, inplace=True)
dataset_description.rename(columns=_, inplace=True)

# Convert only while the column is still string-typed, so that re-running the
# cell does not divide an already converted (numeric) column a second time.
if pd.api.types.is_string_dtype(dataset['Duration']):
    # Parse as integers, divide by 60 and round to two decimals; the
    # description is updated to say the unit is now minutes.
    dataset['Duration'] = (dataset['Duration'].astype(int) / 60).round(2)
    dataset_description['Duration'] = 'Duration (in minutes)'

In [None]:
# Cols: Strip all SAGA cols
saga_cols = dataset.filter(regex="SA|GA", axis=1).columns.to_list()
dataset[saga_cols] = dataset[saga_cols].apply(lambda col: col.str.strip())

# NOTE: every mapping below is applied with
#     dataset[col] = dataset[col].replace(mapping)
# instead of `dataset[col].replace(mapping, inplace=True)`. The in-place form
# is chained assignment on a temporary Series: it raises a FutureWarning on
# recent pandas and does not modify `dataset` under Copy-on-Write.

# Col: Age
age_col = "SA1"
_ = {'70-79': '70+', '80+': '70+'}
dataset[age_col] = dataset[age_col].replace(_)

# Col: Gender
gender_col = "SA2"
_ = {'Male': 'Man', 'Female': 'Woman'}
dataset[gender_col] = dataset[gender_col].replace(_)

# Col: Country
country_col = "SA3"
selected_threshold = 35
if dataset[country_col].nunique() > selected_threshold:
    all_countries = pd.DataFrame(dataset[country_col].value_counts()).T
    # 'Other' is the catch-all bucket, so it must not occupy one of the
    # "popular" slots; tolerate data that has no such answer.
    if 'Other' in all_countries.columns:
        _ = all_countries.pop('Other')
    # value_counts() is sorted by frequency, so the leading columns are the
    # most common countries; everything after them is folded into 'Other'.
    popular_countries = all_countries.iloc[:, :selected_threshold - 1].columns.to_list()
    other_countries = [x_country for x_country in all_countries
                       if x_country not in popular_countries]

    dataset[country_col] = dataset[country_col].replace(other_countries, 'Other')

_ = {'United States of America': 'USA',
     'United Kingdom of Great Britain and Northern Ireland': 'Britain',
     'Iran, Islamic Republic of...': 'Iran'}
dataset[country_col] = dataset[country_col].replace(_)

# Col: Degree or not degree
degree_col = "SA4"
_ = {'Professional doctorate': 'Professional degree',
     'No formal education past high school': 'Other',
     'Some college/university study without earning a bachelor’s degree': 'Other'}
dataset[degree_col] = dataset[degree_col].replace(_)

# Col: Employed
employed_col = "SA5"
_ = {'Data Analyst': 'Analyst (Data, Business, etc)',
     'Business Analyst': 'Analyst (Data, Business, etc)',
     'Marketing Analyst': 'Analyst (Data, Business, etc)',
     'Manager': 'Manager (Product, Project, etc)',
     'Product Manager': 'Manager (Product, Project, etc)',
     'Product/Project Manager': 'Manager (Product, Project, etc)',
     'Program/Project Manager': 'Manager (Product, Project, etc)',
     'Research Scientist': 'Research Scientist (Assistant)',
     'Research Assistant': 'Research Scientist (Assistant)',
     'DBA/Database Engineer': 'Other',
     'Data Scientist': 'Data Scientist (ML Engineer)',
     'Machine Learning Engineer': 'Data Scientist (ML Engineer)',
     'Currently not employed': 'Not employed',
     'Chief Officer': 'Other',
     'Consultant': 'Other',
     'Principal Investigator': 'Other',
     'Salesperson': 'Other',
     'Data Journalist': 'Other',
     'Developer Advocate': 'Other',
     'Developer Relations/Advocacy': 'Other'
}
dataset[employed_col] = dataset[employed_col].replace(_)

# Col: Coding time
coding_time_col = "SA6"
_ = {'< 1 year': '< 1 years',
     '1-2 years': '1-3 years',
     '20-30 years': '20+ years',
     '30-40 years': '20+ years',
     '40+ years': '20+ years',
     'I have never written code but I want to learn': 'I have never written code',
     'I have never written code and I do not want to learn': 'I have never written code'
}
dataset[coding_time_col] = dataset[coding_time_col].replace(_)

# Col: Coding lang
coding_lang_col = "SA7"
_ = {'C': 'C/C++',
     'C++': 'C/C++',
     'Go': 'Other',
     'Swift': 'Other',
     'Scala': 'Other',
     'Bash': 'Other',
     'Julia': 'Other',
     'TypeScript': 'Other',
     'SAS': 'Other',
     'VBA': 'Other'
}
dataset[coding_lang_col] = dataset[coding_lang_col].replace(_)

# Col: Device
device_col = "SA8"
_ = {'A laptop': 'A personal computer',
     'A personal computer or laptop': 'A personal computer',
     'A personal computer / desktop': 'A personal computer',
     'A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)': 'A cloud computing platform',
     'A deep learning workstation (NVIDIA GTX, LambdaLabs, etc)': 'A deep learning workstation'
}
dataset[device_col] = dataset[device_col].replace(_)

# Col: Used TPU
used_tpu_col = "SA9"
_ = {'More than 25 times': '25+ times'
}
dataset[used_tpu_col] = dataset[used_tpu_col].replace(_)

# Col: Used ML methods
used_ml_col = "SA10"
_ = {'Under 1 year': '< 1 years',
     '< 1 year': '< 1 years',
     '10-15 years': '10+ years',
     '10-20 years': '10+ years',
     '20+ years': '10+ years',
     '20 or more years': '10+ years',

     'I do not use machine learning methods': 'I do not use ML',
     'I have never studied machine learning and I do not plan to': 'I do not use ML',
     'I have never studied machine learning but plan to learn in the future': 'I do not use ML'
}
dataset[used_ml_col] = dataset[used_ml_col].replace(_)

# Col: Industry
industry_col = "SA11"
_ = {'I am a student': np.nan
}
dataset[industry_col] = dataset[industry_col].replace(_)
# Year    Notna
# 2018    21685  <<< del 4658 'I am a student'
# 2021    16325

# Col: Size company
company_col = "SA12"
_ = {'10,000 or more employees': '10,000+ employees',
     '> 10,000 employees': '10,000+ employees'
}
dataset[company_col] = dataset[company_col].replace(_)

# Col: Workloads
workloads_col = "SA13"
_ = {'10-14': '10-19',
     '15-19': '10-19'
}
dataset[workloads_col] = dataset[workloads_col].replace(_)

# Col: Compensation
compensation_col = "SA15"
_ = {'I do not wish to disclose my approximate yearly compensation': np.nan,
     '$0-999': '0-10,000',
     '1,000-1,999': '0-10,000',
     '2,000-2,999': '0-10,000',
     '3,000-3,999': '0-10,000',
     '4,000-4,999': '0-10,000',
     '5,000-7,499': '0-10,000',
     '7,500-9,999': '0-10,000',

     '10,000-14,999': '10,000-50,000',
     '15,000-19,999': '10,000-50,000',
     '10-20,000': '10,000-50,000',
     '20,000-24,999': '10,000-50,000',
     '20-30,000': '10,000-50,000',
     '25,000-29,999': '10,000-50,000',
     '30,000-39,999': '10,000-50,000',
     '30-40,000': '10,000-50,000',
     '40,000-49,999': '10,000-50,000',
     '40-50,000': '10,000-50,000',

     '50,000-59,999': '50,000-100,000',
     '50-60,000': '50,000-100,000',
     '60,000-69,999': '50,000-100,000',
     '60-70,000': '50,000-100,000',
     '70,000-79,999': '50,000-100,000',
     '70-80,000': '50,000-100,000',
     '80,000-89,999': '50,000-100,000',
     '80-90,000': '50,000-100,000',
     '90,000-99,999': '50,000-100,000',
     '90-100,000': '50,000-100,000',

     '100,000-124,999': '100,000-150,000',
     '100-125,000': '100,000-150,000',

     '125,000-149,999': '100,000-150,000',
     '125-150,000': '100,000-150,000',

     '150,000-199,999': '150,000-200,000',
     '150-200,000': '150,000-200,000',

     '200,000-249,999': '200,000-250,000',
     '200-250,000': '200,000-250,000',

     '250,000-299,999': '250,000-300,000',
     '250-300,000': '250,000-300,000',

     '300,000-499,999': '300,000-500,000',
     '300,000-500,000': '300,000-500,000',
     '300-400,000': '300,000-500,000',
     '400-500,000': '300,000-500,000',

     '$500,000-999,999': '500,000+',
     '> $500,000': '500,000+',
     '>$1,000,000': '500,000+'
}
dataset[compensation_col] = dataset[compensation_col].replace(_)

# Col: Use products
use_products_col = "SA18"
_ = {'Amazon Athena': 'Amazon (Redshift, RDS, Athena, Aurora, etc)',
     'Amazon Aurora': 'Amazon (Redshift, RDS, Athena, Aurora, etc)',
     'Amazon DynamoDB': 'Amazon (Redshift, RDS, Athena, Aurora, etc)',
     'Amazon RDS': 'Amazon (Redshift, RDS, Athena, Aurora, etc)',
     'Amazon Redshift': 'Amazon (Redshift, RDS, Athena, Aurora, etc)',

     'Google Cloud BigQuery': 'Google Cloud (BigQuery, BigTable, SQL, etc)',
     'Google Cloud BigTable': 'Google Cloud (BigQuery, BigTable, SQL, etc)',
     'Google Cloud Firestore': 'Google Cloud (BigQuery, BigTable, SQL, etc)',
     'Google Cloud SQL': 'Google Cloud (BigQuery, BigTable, SQL, etc)',
     'Google Cloud Spanner': 'Google Cloud (BigQuery, BigTable, SQL, etc)',

     'Microsoft SQL Server': 'Microsoft (SQL Server, Access, Azure, etc)',
     'Microsoft Access': 'Microsoft (SQL Server, Access, Azure, etc)',
     'Microsoft Azure Cosmos DB': 'Microsoft (SQL Server, Access, Azure, etc)',
     'Microsoft Azure Data Lake Storage': 'Microsoft (SQL Server, Access, Azure, etc)',
     'Microsoft Azure SQL Database': 'Microsoft (SQL Server, Access, Azure, etc)',

     'PostgresSQL': 'PostgreSQL'
}
dataset[use_products_col] = dataset[use_products_col].replace(_)

# Col: Use business intelligence tools
use_bi_col = "SA19"
_ = {'TIBCO Spotfire': 'Other',
     'Tableau CRM': 'Other',
     'Looker': 'Other',
     'Microsoft Azure Synapse': 'Other',
     'Domo': 'Other',
     'Sisense': 'Other',
     'Einstein Analytics': 'Other',
     'Thoughtspot': 'Other'
}
dataset[use_bi_col] = dataset[use_bi_col].replace(_)

# Col: Use tool to analyze data
use_tool_col = "SA20"
_ = {'Local or hosted development environments (RStudio, JupyterLab, etc.)':
         'Local development environments (RStudio, JupyterLab, etc.)'}
dataset[use_tool_col] = dataset[use_tool_col].replace(_)

# 6. Saving information

In [None]:
def save_to_json(result, file_name, dir_to_save='info_data'):
    """Write ``result`` as indented JSON to ``file_name``.

    The file goes into ``dir_to_save`` when that path is a directory. A missing
    directory is created only if its name is a valid Python identifier; when no
    directory is available the file is written relative to the current working
    directory instead.

    Args:
        result: JSON-serialisable object to dump.
        file_name: Name of the output file.
        dir_to_save: Target directory name.
    """
    needs_creation = not os.path.exists(dir_to_save) and dir_to_save.isidentifier()
    if needs_creation:
        os.mkdir(dir_to_save)

    # Fall back to the bare file name when the directory is not usable.
    if not os.path.isdir(dir_to_save):
        target_path = file_name
    else:
        target_path = os.path.join(dir_to_save, file_name)

    with open(target_path, "w") as out_file:
        json.dump(result, out_file, indent=4)

In [None]:
# save_to_json(columns_info, "columns_info.json")
# save_to_json(datasets_info, "datasets_info.json")
# save_to_json(questions_info, "questions_info.json")

# dataset.to_csv("kaggle_survey_2018-2021_data.csv", index=False)
# dataset_description.to_csv("kaggle_survey_2018-2021_header.csv", index=False)

# 7. Simple data analysis

In [None]:
dataset.info(memory_usage='deep', verbose=False)

## 7.1 Single answers

In [None]:
# Non-null answer counts per year for every column whose name contains "SA",
# joined with the matching rows of the description table.
sa_cols = dataset.filter(like="SA").columns.to_list()
# Select the columns before counting so only they are aggregated, rather than
# counting every column of the dataset and discarding most of the result.
sa_cols_stats = dataset.groupby('Year')[sa_cols].count().T \
                        .join(dataset_description[sa_cols].T)
sa_cols_stats

In [None]:
dataset[sa_cols].describe().T

## 7.2 Group answers

In [None]:
def group_stats(group_number, is_year=False):
    """Summarise the answer columns of one group question.

    Prints the first question text stored for the group in ``questions_info``
    and describes every ``dataset`` column whose name contains
    ``"GA<group_number>_"``.

    Args:
        group_number: Index of the group in ``questions_info['group questions']``;
            also the number embedded in the column names.
        is_year: When truthy, return the non-null counts per ``Year`` joined
            with the overall ``top`` (renamed to ``answer``) and ``count``
            values instead of the plain ``describe()`` table.

    Returns:
        The requested DataFrame, or ``None`` when no column matches the group.
    """
    find_group = "GA" + str(group_number) + "_"
    ga_cols = dataset.filter(like=find_group).columns.to_list()

    if not ga_cols:
        return None

    questions = questions_info.get('group questions')[group_number][1]
    print("\nGroup question': {}\n".format(questions[0]))

    info_stats = dataset[ga_cols].describe().T
    if not is_year:
        return info_stats

    # Build the per-year table only when it is requested; selecting the group
    # columns before counting avoids aggregating the whole dataset.
    return dataset.groupby('Year')[ga_cols].count().T \
                  .join(info_stats[['top', 'count']]) \
                  .rename(columns={'top': 'answer'})

In [None]:
group_stats(group_26)

In [None]:
group_stats(group_26, is_year=True).sort_values(by='count', ascending=False)

In [None]:
# ...