In [1]:
import pandas as pd
import numpy as np
import country_converter as coco
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import nltk

%matplotlib inline

# Component: Raw Data Source

## Step 1. Select
From a variety of data sources, we had to decide which raw data fit our interest and choose one data set to explore. We finally chose the 2023 salary data set, because it lets us study a recent trend in the labor market, which helps us understand our strengths.

In [13]:
# Load the raw salary data set from the project's data directory.
df = pd.read_csv('../data/ds_salaries.csv')

## Step 2. Identify
Identify the types of data we are using and their applicability. Our data contains numbers (integers and decimals) and text, arranged in separate columns, which gives us a clearer view of the data we can use to answer the questions we originally defined.

- **Identify what types of data we are using**
- **Give a clearer view by filling in more complete information**

In [14]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [15]:
# Count rows per experience-level code.
df['experience_level'].value_counts()

experience_level
SE    2516
MI     805
EN     320
EX     114
Name: count, dtype: int64

In [22]:
employment_type = 'employment_type'

# Expand the abbreviated employment-type codes to readable labels with a single
# mapping; values that are not keys of the mapping are left unchanged.
employment_type_labels = {
    'FT': 'Full-Time',
    'CT': 'Contract',
    'PT': 'Part-Time',
    'FL': 'Freelance',
}
df[employment_type] = df[employment_type].replace(employment_type_labels)

# Show how many rows carry each label.
df[employment_type].value_counts()

employment_type
Full-Time    3718
Part-Time      17
Contract       10
Freelance      10
Name: count, dtype: int64

In [21]:
experience_level = 'experience_level'

# Expand the abbreviated experience-level codes to readable labels with a single
# mapping; values that are not keys of the mapping are left unchanged.
experience_level_labels = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
}
df[experience_level] = df[experience_level].replace(experience_level_labels)

# Show how many rows carry each label.
df[experience_level].value_counts()

experience_level
Senior-level/Expert         2516
Mid-level/Intermediate       805
Entry-level/Junior           320
Executive-level/Director     114
Name: count, dtype: int64

## Step 3. Check 
Consider how to keep the data and how it will be used. We decided to put the data on GitHub so that it is stored in a structured way and is easy to reach, import and understand.
- **We put the data on GitHub as a single source of truth that is easy to access.**
- **Check that every row is complete and contains no null values.**

## Step 4. Access
Ensure that access to the data and its sources suits the problem we defined at the beginning. While processing the data, we checked that the model can be run smoothly.
- **We can access the data smoothly and the data is in a good format to use.**

In [5]:
# Report the (rows, columns) shape, then preview the first five rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(5)

(3755, 11)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [6]:
# Count missing values per column.
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

## Step 5. Extract
Pick up the relevant data to use and ensure the data can be accessed without problem.
* **We select the necessary columns to use and drop the columns that are not useful for our analysis.**

In [6]:
# Drop the local-currency salary columns, leaving `salary_in_usd` as the
# remaining salary column. The column labels are passed directly instead of a
# DataFrame slice, and errors='ignore' keeps the cell re-runnable once the
# columns are already gone (the in-place version raised KeyError on a re-run).
df = df.drop(columns=['salary', 'salary_currency'], errors='ignore')
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,US,100,US,L


# Use Case 1 - Average Salary based on Company Location

## Component: Explorative Data Analysis

### Step 6. Select
After extracting the most important data, we select the parts that can be analyzed and dive into the different characteristics of our employment data, in order to build preparation strategies and to ensure the validity of the result.
- **We selected the use case that we are interested in for use case 1.**
- **We output the necessary data and save it to GitHub.**

In [25]:
# Keep only the columns needed for use case 1.
selected_columns = ['employee_residence', 'company_location', 'salary_in_usd']
uc1 = df.loc[:, selected_columns]

# Persist the use case 1 subset to csv, without the index column.
uc1.to_csv(path_or_buf='../data/uc1.csv', index=False)
uc1

Unnamed: 0,employee_residence,company_location,salary_in_usd
0,ESP,ESP,85847
1,USA,USA,30000
2,USA,USA,25500
3,CAN,CAN,175000
4,CAN,CAN,120000
...,...,...,...
3750,USA,USA,412000
3751,USA,USA,151000
3752,USA,USA,105000
3753,USA,USA,100000


### Step 7. Identify
Identifying which kind of data we can use also helps us prepare the data for analysis. Once we have chosen the columns and data characteristics that fit the problem to be solved or the questions to be answered, we can prepare the analysis better than by diving into it directly.

- **Access the data from Data Management, which is from github.**
- **We identify the country code with the country_converter library.**
- **Print out the average salary by company location.**


In [26]:
# Reload the use case 1 subset from the csv written in the Select step.
uc1 = pd.read_csv('../data/uc1.csv')

In [27]:
# Convert the country codes of both location columns to ISO3.
# Each column is converted from its own source values: previously the converted
# employee-residence codes were also written into `company_location`, which
# discarded the real company locations needed for the per-location analysis.
country = coco.convert(names = df['employee_residence'], to = "ISO3")
company_country = coco.convert(names = df['company_location'], to = "ISO3")
uc1['employee_residence'] = country
uc1['company_location'] = company_country

# Show how many employees reside in each country.
uc1['employee_residence'].value_counts()

employee_residence
USA    3004
GBR     167
CAN      85
ESP      80
IND      71
       ... 
BIH       1
ARM       1
CYP       1
KWT       1
MLT       1
Name: count, Length: 78, dtype: int64

## Data Preparation

### Step 8. Check 
When arranging the data, the following requirements should be checked.

**Requirement**
1. No null value. 
2. Data can be used. 

In [28]:
# Count missing values per column of the use case 1 data.
uc1.isnull().sum()

employee_residence    0
company_location      0
salary_in_usd         0
dtype: int64

In [29]:
# Display the prepared use case 1 data.
uc1

Unnamed: 0,employee_residence,company_location,salary_in_usd
0,ESP,ESP,85847
1,USA,USA,30000
2,USA,USA,25500
3,CAN,CAN,175000
4,CAN,CAN,120000
...,...,...,...
3750,USA,USA,412000
3751,USA,USA,151000
3752,USA,USA,105000
3753,USA,USA,100000


# Use Case 2

## Component: Explorative Data Analysis

### Step 6.

In [36]:
# write your code

### Step 7.

In [None]:
# write your code

## Component: Explorative Data Analysis

### Step 8.

In [None]:
# write your code

# Component: Data Management

### Step 9. Structure
To make sure our project can run without problems, we assess the devices used for the analysis before starting: both the computer and the internet connection we are using.
- **We check the infrastructure for our project to be run without problem.**
- **Make sure we can access the data from data management.**

In [32]:
import os
import psutil  # reports CPU and memory statistics

# Sample CPU utilisation over a one-second interval.
cpu_pct = psutil.cpu_percent(interval=1)
print(f"CPU usage: {cpu_pct} %")

# Print the full virtual-memory statistics object returned by psutil.
mem_stats = psutil.virtual_memory()
print(f"memory usage: {mem_stats}")


CPU usage: 32.1 %
memory usage: svmem(total=17179869184, available=1239564288, percent=92.8, used=1801277440, free=13504512, active=1230143488, inactive=1217159168, wired=571133952)


In [33]:
import sys
import pandas as pd

# Record the interpreter and pandas versions used to run this notebook.
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")


Python version: 3.12.0 | packaged by conda-forge | (main, Oct  3 2023, 08:43:38) [Clang 15.0.7 ]
Pandas version: 2.1.3


In [34]:
# Read the use case 1 csv back to confirm it is accessible.
pd.read_csv('../data/uc1.csv')

Unnamed: 0,employee_residence,company_location,salary_in_usd
0,ESP,ESP,85847
1,USA,USA,30000
2,USA,USA,25500
3,CAN,CAN,175000
4,CAN,CAN,120000
...,...,...,...
3750,USA,USA,412000
3751,USA,USA,151000
3752,USA,USA,105000
3753,USA,USA,100000
