---
### Import zones

---

In [1]:
import pandas as pd
import numpy as np
import os 

---
### Load datasets

---

In [3]:
df_2018 = pd.read_csv('../data/clean_stackoverflow/clean_2018.csv')
df_2019 = pd.read_csv('../data/clean_stackoverflow/clean_2019.csv')
df_2020 = pd.read_csv('../data/clean_stackoverflow/clean_2020.csv')
df_2021 = pd.read_csv('../data/clean_stackoverflow/clean_2021.csv')
df_2022 = pd.read_csv('../data/clean_stackoverflow/clean_2022.csv')

---
### Concat 5 datasets

---

In [4]:
dfs = [df_2018, df_2019, df_2020, df_2021, df_2022]
df = pd.concat(dfs, ignore_index=True)


---
### Normalize some data

---
Gender column has too many (similar) options. Cleaned up

---

In [5]:
# first, replace nan with 'Prefer not to say'
df['Gender'].fillna('Prefer not to say', inplace=True)


gender_dict = {
    'Male':'Male', 
    'Prefer not to say':'Prefer not to say', 
    'Female':'Female',
    'Female;Male;Transgender;Non-binary, genderqueer, or gender non-conforming':'Non-binary;Transgender',
    'Female;Male':'Non-binary',
    'Male;Non-binary, genderqueer, or gender non-conforming':'Male;Transgender',
    'Non-binary, genderqueer, or gender non-conforming':'Non-binary', 
    'Transgender':'Transgender',
    'Female;Transgender':'Female;Transgender',
    'Female;Non-binary, genderqueer, or gender non-conforming':'Female;Transgender',
    'Transgender;Non-binary, genderqueer, or gender non-conforming':'Transgender',
    'Female;Transgender;Non-binary, genderqueer, or gender non-conforming':'Female;Transgender',
    'Male;Transgender':'Male;Transgender', 
    'Female;Male;Transgender':'Non-binary;Transgender',
    'Male;Transgender;Non-binary, genderqueer, or gender non-conforming':'Male;Transgender',
    'Female;Male;Non-binary, genderqueer, or gender non-conforming':'Non-binary',
    'Man':'Male', 
    'Woman':'Female',
    'Woman;Non-binary, genderqueer, or gender non-conforming':'Female;Transgender',
    'Man;Non-binary, genderqueer, or gender non-conforming':'Male;Transgender',
    'Woman;Man;Non-binary, genderqueer, or gender non-conforming':'Non-binary',
    'Woman;Man':'Non-binary', 
    'Man;Or, in your own words:':'Male',
    'Or, in your own words:':'Prefer not to say', 
    'Man;Woman':'Non-binary',
    'Woman;Or, in your own words:':'Female',
    'Man;Woman;Non-binary, genderqueer, or gender non-conforming':'Non-binary',
    'Man;Non-binary, genderqueer, or gender non-conforming;Or, in your own words:':'Male;Transgender',
    'Man;Woman;Non-binary, genderqueer, or gender non-conforming;Or, in your own words:':'Non-binary',
    'Non-binary, genderqueer, or gender non-conforming;Or, in your own words:':'Non-binary',
    'Woman;Non-binary, genderqueer, or gender non-conforming;Or, in your own words:':'Female;Transgender',
    'Man;Or, in your own words:;Woman;Non-binary, genderqueer, or gender non-conforming':'Non-binary;Transgender',
    'Or, in your own words:;Woman;Non-binary, genderqueer, or gender non-conforming':'Female;Transgender',
    'Or, in your own words:;Non-binary, genderqueer, or gender non-conforming':'Non-binary',
    'Man;Or, in your own words:;Non-binary, genderqueer, or gender non-conforming':'Male;Transgender',
    'Or, in your own words:;Woman':'Female'
}

# aplly dictionary
df['Gender'] = df['Gender'].map(gender_dict)
df['Gender'].unique()

array(['Male', 'Prefer not to say', 'Female', 'Non-binary;Transgender',
       'Non-binary', 'Male;Transgender', 'Transgender',
       'Female;Transgender'], dtype=object)

In [6]:
df['Sexuality'].unique()

array(['Straight or heterosexual', nan, 'Bisexual or Queer',
       'Gay or Lesbian', 'Straight or heterosexual;Bisexual or Queer',
       'Straight or heterosexual;Asexual', 'Asexual',
       'Gay or Lesbian;Bisexual or Queer', 'Bisexual or Queer;Asexual',
       'Straight or heterosexual;Gay or Lesbian;Bisexual or Queer;Asexual',
       'Straight or heterosexual;Gay or Lesbian;Bisexual or Queer',
       'Straight or heterosexual;Gay or Lesbian',
       'Straight or heterosexual;Bisexual or Queer;Asexual',
       'Gay or Lesbian;Asexual',
       'Gay or Lesbian;Bisexual or Queer;Asexual',
       'Straight / Heterosexual', 'Bisexual',
       'Bisexual;Straight / Heterosexual', 'Bisexual;Gay or Lesbian',
       'Bisexual;Gay or Lesbian;Straight / Heterosexual',
       'Gay or Lesbian;Straight / Heterosexual', 'Bisexual;Queer',
       'Gay or Lesbian;Queer', 'Queer',
       'Bisexual;Gay or Lesbian;Straight / Heterosexual;Queer',
       'Straight / Heterosexual;Queer', 'Bisexual;Gay or L

In [7]:
df['Sexuality'].replace('Straight or heterosexual','Straight',inplace=True)
df['Sexuality'].replace('Bisexual or Queer','Bisexual',inplace=True)
df['Sexuality'].replace('Gay or Lesbian','Gay',inplace=True)
df['Sexuality'].replace('Straight or heterosexual;Bisexual or Queer','Others',inplace=True)
df['Sexuality'].replace('Straight or heterosexual;Asexual', 'Others',inplace=True)
df['Sexuality'].replace('Asexual', 'Others',inplace=True)

df['Sexuality'].replace('Gay or Lesbian;Bisexual or Queer', 'Gay',inplace=True)
df['Sexuality'].replace('Bisexual or Queer;Asexual', 'Others',inplace=True)
df['Sexuality'].replace('Straight or heterosexual;Gay or Lesbian;Bisexual or Queer;Asexual', 'Others',inplace=True)
df['Sexuality'].replace('Straight or heterosexual;Gay or Lesbian;Bisexual or Queer', 'Others',inplace=True)
df['Sexuality'].replace('Straight or heterosexual;Gay or Lesbian','Others',inplace=True)
df['Sexuality'].replace('Straight or heterosexual;Bisexual or Queer;Asexual','Others',inplace=True)       ,
df['Sexuality'].replace('Gay or Lesbian;Asexual','Others',inplace=True)
df['Sexuality'].replace('Gay or Lesbian;Bisexual or Queer;Asexual','Others',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual', 'Bisexual',inplace=True) 
df['Sexuality'].replace('Bisexual;Straight / Heterosexual', 'Others',inplace=True)
df['Sexuality'].replace('Bisexual;Gay or Lesbian', 'Gay',inplace=True)

df['Sexuality'].replace('Bisexual;Gay or Lesbian;Straight / Heterosexual','Others',inplace=True) 
df['Sexuality'].replace('Gay or Lesbian;Straight / Heterosexual','Others',inplace=True) 
df['Sexuality'].replace('Bisexual;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Bisexual;Queer','Gay',inplace=True) 


df['Sexuality'].replace('Gay or Lesbian;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Queer','Gay',inplace=True) 
df['Sexuality'].replace('Bisexual;Gay or Lesbian;Straight / Heterosexual;Queer','Others',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Queer','Others',inplace=True) 
df['Sexuality'].replace('Bisexual;Gay or Lesbian;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Bisexual;Straight / Heterosexual;Queer','Others',inplace=True) 

df['Sexuality'].replace('Straight / Heterosexual;Bisexual;Gay or Lesbian;Queer','Others',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Prefer to self-describe:','Straight',inplace=True) 
df['Sexuality'].replace('Prefer to self-describe:','Prefer not to say',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Bisexual','Bisexual',inplace=True) 

df['Sexuality'].replace('Prefer to self-describe:;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Bisexual;Gay or Lesbian','Others',inplace=True) 
df['Sexuality'].replace('Bisexual;Prefer to self-describe:','Bisexual',inplace=True) 
df['Sexuality'].replace('Bisexual;Prefer to self-describe:;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Gay or Lesbian','Others',inplace=True) 


df['Sexuality'].replace('Bisexual;Prefer to self-describe:;Gay or Lesbian;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Bisexual;Queer','Others',inplace=True) 
df['Sexuality'].replace('Prefer to self-describe:;Gay or Lesbian;Queer','Gay',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Bisexual;Prefer to self-describe:;Gay or Lesbian;Queer','Others',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Bisexual;Prefer to self-describe:','Bisexual',inplace=True) 


df['Sexuality'].replace('Prefer to self-describe:;Gay or Lesbian','Gay',inplace=True) 

df['Sexuality'].replace('Straight / Heterosexual;Prefer to self-describe:;Queer','Others',inplace=True) 
df['Sexuality'].replace('Bisexual;Prefer to self-describe:;Gay or Lesbian','Gay',inplace=True) 
df['Sexuality'].replace('Bisexual;Straight / Heterosexual;Gay or Lesbian;Queer','Others',inplace=True) 
df['Sexuality'].replace('Bisexual;Straight / Heterosexual;Gay or Lesbian','Others',inplace=True) 

df['Sexuality'].replace('Bisexual;Straight / Heterosexual;Prefer to self-describe:;Gay or Lesbian;Queer','Others',inplace=True) 
df['Sexuality'].replace('Bisexual;Straight / Heterosexual;Prefer to self-describe:','Others',inplace=True) 
df['Sexuality'].replace('Straight / Heterosexual;Prefer to self-describe:;Gay or Lesbian','Others',inplace=True) 



In [8]:
df['Sexuality'].fillna('Prefer not to say', inplace=True)
df['Sexuality'].unique()

array(['Straight', 'Prefer not to say', 'Bisexual', 'Gay', 'Others'],
      dtype=object)

In [9]:
# I dont want to clean up this column T_T
kk = df['Ethnicity'].unique()
for i,k in enumerate(kk):
    print(k)
    if i>=15: break

White or of European descent
nan
Black or of African descent;East Asian;Hispanic or Latino/Latina;Middle Eastern;Native American, Pacific Islander, or Indigenous Australian;South Asian;White or of European descent
South Asian
Hispanic or Latino/Latina
East Asian
Hispanic or Latino/Latina;White or of European descent
Black or of African descent
East Asian;White or of European descent
South Asian;White or of European descent
East Asian;South Asian
Middle Eastern
Native American, Pacific Islander, or Indigenous Australian
Middle Eastern;White or of European descent
Native American, Pacific Islander, or Indigenous Australian;White or of European descent
Black or of African descent;White or of European descent


In [10]:
df['EdLevel'].unique()

array(['Bachelor’s degree (BA, BS, B.Eng., etc.)',
       'Some college/university study without earning a degree',
       'Master’s degree (MA, MS, M.Eng., MBA, etc.)',
       'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
       'Associate degree', nan, 'Primary/elementary school',
       'Professional degree (JD, MD, etc.)',
       'Other doctoral degree (Ph.D, Ed.D., etc.)',
       'I never completed any formal education',
       'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)',
       'Bachelor’s degree (B.A., B.S., B.Eng., etc.)',
       'Associate degree (A.A., A.S., etc.)',
       'Other doctoral degree (Ph.D., Ed.D., etc.)', 'Something else'],
      dtype=object)

In [11]:
df.columns

Index(['Age', 'Gender', 'Sexuality', 'Ethnicity', 'EdLevel', 'CompanySize',
       'Country', 'Employment', 'DevType(Position)', 'Currency',
       'CurrencySymbol', 'Salary', 'SalaryFreq', 'DollarizedSalary',
       'YearsCode', 'YearsCodePro', 'OperatingSystem',
       'DatabaseWantToWorkWith', 'DatabaseHaveWorkedWith',
       'FrameworkWantToWorkWith', 'FrameworkHaveWorkedWith',
       'LanguageWantToWorkWith', 'LanguageHaveWorkedWith',
       'PlatformWantToWorkWith', 'PlatformHaveWorkedWith', 'SurveyYear'],
      dtype='object')

---
### Get the dev, job and exp dataframes

---

In [12]:
dev_df = df[['Age', 'Gender', 'Sexuality', 'Ethnicity', 'EdLevel','SurveyYear']]
job_df = df[['CompanySize', 'Country', 'Employment', 'Currency',
        'CurrencySymbol', 'Salary', 'SalaryFreq', 'DollarizedSalary']]
exp_df = df[['YearsCode', 'YearsCodePro', 'OperatingSystem']]

---
### Add some extra columns (ids for primary key, and future foreign keys)

---

In [13]:
n_rows = df.shape[0]
id = np.arange(0, n_rows)

In [14]:
dev_df.insert(0, 'id_dev', dev_df['SurveyYear']*1000000+id, True)
dev_df.insert(7, 'id_job', id, True)
dev_df.insert(8, 'id_exp', id, True)

job_df.insert(0, 'id_job', id, True)
job_df.insert(9, 'id_dev_type', id, True)

exp_df.insert(0, 'id_exp', id, True)


In [15]:
job_df.head()

Unnamed: 0,id_job,CompanySize,Country,Employment,Currency,CurrencySymbol,Salary,SalaryFreq,DollarizedSalary,id_dev_type
0,0,"10,000 or more employees",UK,Employed full-time,British Sterlings,GBP,51000.0,Yearly_2,70841.0,0
1,1,100 to 499 employees,USA,Employed full-time,Dollars,USD,,Yearly_2,,1
2,2,10 to 19 employees,UK,Employed full-time,British Sterlings,GBP,30000.0,Yearly_2,41671.0,2
3,3,"10,000 or more employees",USA,Employed full-time,Dollars,USD,120000.0,Yearly_2,120000.0,3
4,4,100 to 499 employees,USA,Employed full-time,Dollars,USD,250000.0,Yearly_2,250000.0,4


---
### Get unique datasets

---

In [16]:
def col_with_semicolon_to_list_of_unique(df, col):
    """
        Transforms a column of data separated by semicolons into a list with the 
        unique values of the entire column.

        Input:
            df: Dataframe, Dataframe containing desired column
            col: String, name of the column

        Output:
            unique_values_list: list, list of unique values across the entire column
    """
    df_raw = df[col].copy()
    df_raw.dropna(inplace=True)
    df_raw = df_raw.unique().tolist()
    list_with_duplicates = []

    for x in df_raw:
        splitted = x.split(';')
        list_with_duplicates.append(splitted)

    # Plain data with comprehension list
    plain_list = [data for sublist in list_with_duplicates for data in sublist]

    # Unique values
    unique_values_set = set(plain_list)

    # Unique values list
    unique_values_list = list(unique_values_set)

    return unique_values_list

In [17]:
df.columns

Index(['Age', 'Gender', 'Sexuality', 'Ethnicity', 'EdLevel', 'CompanySize',
       'Country', 'Employment', 'DevType(Position)', 'Currency',
       'CurrencySymbol', 'Salary', 'SalaryFreq', 'DollarizedSalary',
       'YearsCode', 'YearsCodePro', 'OperatingSystem',
       'DatabaseWantToWorkWith', 'DatabaseHaveWorkedWith',
       'FrameworkWantToWorkWith', 'FrameworkHaveWorkedWith',
       'LanguageWantToWorkWith', 'LanguageHaveWorkedWith',
       'PlatformWantToWorkWith', 'PlatformHaveWorkedWith', 'SurveyYear'],
      dtype='object')

In [18]:
# get list of unique ocurrences

unique_lang = col_with_semicolon_to_list_of_unique(df, 'LanguageHaveWorkedWith')
unique_db = col_with_semicolon_to_list_of_unique(df, 'DatabaseHaveWorkedWith')
unique_framework = col_with_semicolon_to_list_of_unique(df, 'FrameworkHaveWorkedWith')
unique_platform = col_with_semicolon_to_list_of_unique(df, 'PlatformHaveWorkedWith')
unique_dev_type = col_with_semicolon_to_list_of_unique(df, 'DevType(Position)')

# Get number of ocurrences
lang_len = len(unique_lang)
db_len = len(unique_db)
framework_len = len(unique_framework)
platform_len = len(unique_platform)
dev_type_len = len(unique_dev_type)

# Create ids for each new dataframe
lang_idx = np.arange(0, lang_len)
db_idx = np.arange(0, db_len)
framework_idx = np.arange(0, framework_len)
platform_idx = np.arange(0, platform_len)
dev_type_idx = np.arange(0, dev_type_len)

# Create a new Dataframe for each 
lang_df = pd.DataFrame({'id_lang':lang_idx, 'language':unique_lang})
db_df = pd.DataFrame({'id_db':db_idx, 'database':unique_db})
framework_df = pd.DataFrame({'id_framework':framework_idx, 'framework':unique_framework})
platform_df = pd.DataFrame({'id_platform':platform_idx, 'platform':unique_platform})
dev_type_df = pd.DataFrame({'id_dev_type':dev_type_idx, 'dev_type':unique_dev_type})

In [19]:
lang_df

Unnamed: 0,id_lang,language
0,0,Hack
1,1,Erlang
2,2,Assembly
3,3,Groovy
4,4,Go
5,5,C++
6,6,PowerShell
7,7,COBOL
8,8,Solidity
9,9,Java


---
### Get dict for each table

---

In [20]:
inv_lang_dict = lang_df['language'].to_dict()
inv_db_dict = db_df['database'].to_dict()
inv_framework_dict = framework_df['framework'].to_dict()
inv_platform_dict = platform_df['platform'].to_dict()
inv_dev_type_dict = dev_type_df['dev_type'].to_dict()

In [21]:
inv_dev_type_dict

{0: 'Mobile developer',
 1: 'Marketing or sales professional',
 2: 'C-suite executive (CEO, CTO, etc.)',
 3: 'Engineering manager',
 4: 'Security professional',
 5: 'Developer, game or graphics',
 6: 'Developer, front-end',
 7: 'Cloud infrastructure engineer',
 8: 'Developer, back-end',
 9: 'Designer',
 10: 'Developer, full-stack',
 11: 'Product manager',
 12: 'Embedded applications or devices developer',
 13: 'Scientist',
 14: 'Project manager',
 15: 'Developer, QA or test',
 16: 'Educator or academic researcher',
 17: 'Desktop or enterprise applications developer',
 18: 'Other (please specify):',
 19: 'System administrator',
 20: 'Engineer, site reliability',
 21: 'Engineer, data',
 22: 'Data or business analyst',
 23: 'Blockchain',
 24: 'QA or test developer',
 25: 'Academic researcher',
 26: 'Senior Executive (C-Suite, VP, etc.)',
 27: 'Developer, desktop or enterprise applications',
 28: 'Full-stack developer',
 29: 'Educator',
 30: 'Senior executive/VP',
 31: 'Back-end developer'

In [22]:
lang_dict = {v:k for k, v in inv_lang_dict.items()}
db_dict = {v:k for k, v in inv_db_dict.items()}
framework_dict = {v:k for k, v in inv_framework_dict.items()}
platform_dict = {v:k for k, v in inv_platform_dict.items()}
dev_type_dict = {v:k for k, v in inv_dev_type_dict.items()}

In [23]:
lang_dict

{'Hack': 0,
 'Erlang': 1,
 'Assembly': 2,
 'Groovy': 3,
 'Go': 4,
 'C++': 5,
 'PowerShell': 6,
 'COBOL': 7,
 'Solidity': 8,
 'Java': 9,
 'R': 10,
 'Fortran': 11,
 'VB.NET': 12,
 'Bash/Shell/PowerShell': 13,
 'Ruby': 14,
 'Julia': 15,
 'OCaml': 16,
 'CSS': 17,
 'Ocaml': 18,
 'Crystal': 19,
 'Cobol': 20,
 'CoffeeScript': 21,
 'JavaScript': 22,
 'HTML': 23,
 'Bash/Shell': 24,
 'HTML/CSS': 25,
 'MATLAB': 26,
 'PHP': 27,
 'Rust': 28,
 'SAS': 29,
 'C#': 30,
 'Dart': 31,
 'Node.js': 32,
 'Lua': 33,
 'TypeScript': 34,
 'Swift': 35,
 'Other(s):': 36,
 'WebAssembly': 37,
 'Visual Basic 6': 38,
 'APL': 39,
 'Delphi/Object Pascal': 40,
 'Perl': 41,
 'Clojure': 42,
 'Elixir': 43,
 'Objective-C': 44,
 'Haskell': 45,
 'Scala': 46,
 'SQL': 47,
 'Python': 48,
 'Kotlin': 49,
 'Delphi': 50,
 'C': 51,
 'VBA': 52,
 'LISP': 53,
 'F#': 54,
 'Matlab': 55}

---
### Intermediate tables

---

In [24]:
# Language
df_exp_lang = df.assign(LanguageHaveWorkedWith=df['LanguageHaveWorkedWith'].str.split(';')).explode('LanguageHaveWorkedWith')
df_exp_lang = df_exp_lang.reset_index().rename(columns={'index':'id'})
df_exp_lang = df_exp_lang[['id','LanguageHaveWorkedWith']].drop_duplicates().reset_index(drop=True).sort_values('id')
df_exp_lang = df_exp_lang.rename(columns={'LanguageHaveWorkedWith':'language'})
df_exp_lang['id_lang'] = df_exp_lang['language'].map(lang_dict)



In [25]:
# Database
df_exp_db = df.assign(DatabaseHaveWorkedWith=df['DatabaseHaveWorkedWith'].str.split(';')).explode('DatabaseHaveWorkedWith')
df_exp_db = df_exp_db.reset_index().rename(columns={'index':'id'})
df_exp_db = df_exp_db[['id','DatabaseHaveWorkedWith']].drop_duplicates().reset_index(drop=True).sort_values('id')
df_exp_db = df_exp_db.rename(columns={'DatabaseHaveWorkedWith':'database'})
df_exp_db['id_database'] = df_exp_db['database'].map(db_dict)

In [26]:
# Framework
df_exp_framework = df.assign(FrameworkHaveWorkedWith=df['FrameworkHaveWorkedWith'].str.split(';')).explode('FrameworkHaveWorkedWith')
df_exp_framework = df_exp_framework.reset_index().rename(columns={'index':'id'})
df_exp_framework = df_exp_framework[['id','FrameworkHaveWorkedWith']].drop_duplicates().reset_index(drop=True).sort_values('id')
df_exp_framework = df_exp_framework.rename(columns={'FrameworkHaveWorkedWith':'framework'})
df_exp_framework['id_framework'] = df_exp_framework['framework'].map(framework_dict)

In [27]:
# Platform
df_exp_platform = df.assign(PlatformHaveWorkedWith=df['PlatformHaveWorkedWith'].str.split(';')).explode('PlatformHaveWorkedWith')
df_exp_platform = df_exp_platform.reset_index().rename(columns={'index':'id'})
df_exp_platform = df_exp_platform[['id','PlatformHaveWorkedWith']].drop_duplicates().reset_index(drop=True).sort_values('id')
df_exp_platform = df_exp_platform.rename(columns={'PlatformHaveWorkedWith':'platform'})
df_exp_platform['id_platform'] = df_exp_platform['platform'].map(platform_dict)

In [28]:
# DevType(Position) --> preprocessing
df.rename(columns={'DevType(Position)':'dev_type'}, inplace=True)

In [29]:
# dev_type

df_job_devtype = df.assign(dev_type=df['dev_type'].str.split(';')).explode('dev_type')
df_job_devtype = df_job_devtype.reset_index().rename(columns={'index':'id'})
df_job_devtype = df_job_devtype[['id','dev_type']].drop_duplicates().reset_index(drop=True).sort_values('id')
df_job_devtype['id_dev_type'] = df_job_devtype['dev_type'].map(dev_type_dict)



In [30]:
df_job_devtype.head()

Unnamed: 0,id,dev_type,id_dev_type
0,0,Database administrator,38.0
1,0,DevOps specialist,33.0
2,0,Full-stack developer,28.0
3,0,System administrator,19.0
4,1,Full-stack developer,28.0


---
### Drop duplicated data from intermediate tables, and rename ids

---

#### Experience-Language

---

In [31]:
df_exp_lang.drop(columns=['language'], inplace=True)
df_exp_lang.rename(columns={'id':'id_exp'}, inplace=True)
df_exp_lang.reset_index(inplace=True, drop=False)
df_exp_lang.rename(columns={'index':'id_exp_lang'}, inplace=True)
df_exp_lang.head()

Unnamed: 0,id_exp_lang,id_exp,id_lang
0,0,0,22.0
1,1,0,48.0
2,2,0,24.0
3,3,1,30.0
4,4,1,22.0


---
#### experience-db

---

In [32]:
df_exp_db.drop(columns=['database'], inplace=True)
df_exp_db.reset_index(inplace=True, drop=False)
df_exp_db.rename(columns={'id':'id_exp', 'index':'id_exp_db'}, inplace=True)
df_exp_db.head()

Unnamed: 0,id_exp_db,id_exp,id_database
0,0,0,6.0
1,1,0,25.0
2,2,0,27.0
3,3,1,5.0
4,4,1,9.0


---
#### experience-framework

---

In [33]:
df_exp_framework.drop(columns=['framework'], inplace=True)
df_exp_framework.reset_index(inplace=True, drop=False)
df_exp_framework.rename(columns={'id':'id_exp', 'index':'id_exp_framework'}, inplace=True)
df_exp_framework.head()

Unnamed: 0,id_exp_framework,id_exp,id_framework
0,0,0,26.0
1,1,1,
2,2,2,2.0
3,3,2,18.0
4,4,3,18.0


---
#### experience-platform

---

In [34]:
df_exp_platform.drop(columns=['platform'], inplace=True)
df_exp_platform.reset_index(inplace=True, drop=False)
df_exp_platform.rename(columns={'id':'id_exp', 'index':'id_exp_platform'}, inplace=True)
df_exp_platform.head()

Unnamed: 0,id_exp_platform,id_exp,id_platform
0,0,0,9.0
1,1,1,35.0
2,2,2,9.0
3,3,3,9.0
4,4,4,27.0


---
#### job-dev_type

---

In [35]:
df_job_devtype.drop(columns=['dev_type'], inplace=True)
df_job_devtype.reset_index(inplace=True, drop=False)
df_job_devtype.rename(columns={'id':'id_job', 'index':'id_job_devtype'}, inplace=True)
df_job_devtype.head()

Unnamed: 0,id_job_devtype,id_job,id_dev_type
0,0,0,38.0
1,1,0,33.0
2,2,0,28.0
3,3,0,19.0
4,4,1,28.0


---
## Save all the tables

these tables will be added to gitignore because they are too big

---

In [39]:
job_df.drop(columns=['id_dev_type'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  job_df.drop(columns=['id_dev_type'], inplace=True)


In [41]:
job_df.head(1)

Unnamed: 0,id_job,CompanySize,Country,Employment,Currency,CurrencySymbol,Salary,SalaryFreq,DollarizedSalary
0,0,"10,000 or more employees",UK,Employed full-time,British Sterlings,GBP,51000.0,Yearly_2,70841.0


In [36]:
# fact table (master table)
dev_df.to_csv('../data/final_tables/developer.csv', index=False)

# dimension tables
job_df.to_csv('../data/final_tables/job.csv', index=False)
exp_df.to_csv('../data/final_tables/experience.csv', index=False)

# lv2 dimension tables
lang_df.to_csv('../data/final_tables/language.csv', index=False)
db_df.to_csv('../data/final_tables/database.csv', index=False)
framework_df.to_csv('../data/final_tables/framework.csv', index=False)
platform_df.to_csv('../data/final_tables/platform.csv', index=False)
dev_type_df.to_csv('../data/final_tables/devtype.csv', index=False)

# intermediate tables
df_exp_lang.to_csv('../data/final_tables/exp_lang.csv', index=False)
df_exp_db.to_csv('../data/final_tables/exp_db.csv', index=False)
df_exp_framework.to_csv('../data/final_tables/exp_framework.csv', index=False)
df_exp_platform.to_csv('../data/final_tables/exp_platform.csv', index=False)
df_job_devtype.to_csv('../data/final_tables/job_devtype.csv', index=False)