In [55]:
import pandas as pd
import numpy as np
import os

df_path = os.path.join(os.getcwd(), '..', 'data', 'ds_salaries.csv')

df = pd.read_csv(df_path, index_col=0)

In [56]:
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...
602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


# Data Analysis

##  Opportunities for Data Scientists to move to other countries whilst maintaining their same cost of living

In [57]:
# What are the countries present in the dataset?

df['employee_residence'].unique()

array(['DE', 'JP', 'GB', 'HN', 'US', 'HU', 'NZ', 'FR', 'IN', 'PK', 'PL',
       'PT', 'CN', 'GR', 'AE', 'NL', 'MX', 'CA', 'AT', 'NG', 'PH', 'ES',
       'DK', 'RU', 'IT', 'HR', 'BG', 'SG', 'BR', 'IQ', 'VN', 'BE', 'UA',
       'MT', 'CL', 'RO', 'IR', 'CO', 'MD', 'KE', 'SI', 'HK', 'TR', 'RS',
       'PR', 'LU', 'JE', 'CZ', 'AR', 'DZ', 'TN', 'MY', 'EE', 'AU', 'BO',
       'IE', 'CH'], dtype=object)

In [59]:
import requests
import bs4
from bs4 import BeautifulSoup

class Preprocessor:
    def __init__(self):
        self.is_fit = False

    def fit(self, df):
        self.iso_cc = Preprocessor.get_iso_codes()
        self.is_fit = True
    
    def preprocess(self, df):
        if not self.is_fit:
            raise Exception("Preprocessor Needs to be Fit!")
        df = map_country_codes_to_names(df, self.iso_cc)
        

    @staticmethod
    def get_iso_codes():
        '''
        Gets the ISO Country Codes

        Used in Fit only
        '''
        r = requests.get('https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes').text
        soup = BeautifulSoup(r, 'html.parser')
        table = soup.findAll('table', class_ = 'wikitable sortable')
        iso_cc_df = pd.read_html(str(table[0]), header=None)[0]
        iso_cc_df = iso_cc_df.T.reset_index().drop(columns=['level_0']).iloc[[0, 3]].T.reset_index(drop=True).rename(columns={0: "Country Name", 3: "Code"}).iloc[1:, :]
        iso_cc_df.loc[1, :]['Code'] = "AF"
        iso_cc_df.dropna(inplace=True)
        to_drop = iso_cc_df[iso_cc_df['Code'].str.contains("See")].index
        iso_cc_df.drop(to_drop, inplace=True)
        iso_cc_df = iso_cc_df.reset_index(drop=True)
        return iso_cc_df
        
    @staticmethod
    def map_country_codes_to_names(df, cc_df):
        '''
        
        '''
        df = df.merge(cc_df, how='left', left_on='company_location', right_on='Code')
        df = df.merge(cc_df, how='left', left_on='employee_residence', right_on='Code')
        df.drop(columns=['Code_x', 'Code_y'], inplace=True)
        return df

    def convert_remote_ratio(df):
        df['remote_ratio'] = df['remote_ratio'] // 50
        return df

    def 

    def clean_and_expand(df):
    
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Country Name_x,Code_x,Country Name_y,Code_y
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L,Germany,DE,Germany,DE
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S,Japan,JP,Japan,JP
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M,United Kingdom of Great Britain and Northern I...,GB,United Kingdom of Great Britain and Northern I...,GB
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S,Honduras,HN,Honduras,HN
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L,United States of America (the),US,United States of America (the),US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M,United States of America (the),US,United States of America (the),US
603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M,United States of America (the),US,United States of America (the),US
604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M,United States of America (the),US,United States of America (the),US
605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M,United States of America (the),US,United States of America (the),US
