In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the "lukebarousse/data_jobs" dataset and convert its "train" split to a pandas DataFrame.
data = load_dataset("lukebarousse/data_jobs")
df = data["train"].to_pandas()

# Parse the posting timestamps so the column has a datetime dtype
# (the info() output below reports datetime64[ns]).
df["job_posted_date"] = pd.to_datetime(df["job_posted_date"])

# Summarise the columns, their non-null counts, and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

In [2]:
# DataFrame.loc[] lets you select rows and columns by label (a single name, a list of names, or a label slice),
# whereas DataFrame.iloc[] selects them by integer position.
# In the first example, we slice a range of columns using their names.

In [3]:
# Label-based column slice: .loc includes BOTH endpoints, so this selects every column
# from "salary_rate" through "salary_hour_avg".
# Rows missing either the rate or the yearly average are dropped, then the first 10
# remaining rows are previewed.
(
    df.loc[:, "salary_rate":"salary_hour_avg"]
    .dropna(subset=["salary_rate", "salary_year_avg"])
    .head(10)
)

# NOTE: df.loc[:10, ...] is NOT an equivalent shortcut. It keeps only index labels 0..10
# (inclusive) *before* dropna runs; here the first row with salary data has label 28,
# so that variant would return an empty frame.

Unnamed: 0,salary_rate,salary_year_avg,salary_hour_avg
28,year,109500.0,
77,year,140000.0,
92,year,120000.0,
100,year,228222.0,
109,year,89000.0,
116,year,114000.0,
146,year,129500.0,
180,year,90250.0,
212,year,157500.0,
257,year,103128.0,


In [4]:
# Preview the first rows of the full frame (head() with no argument shows 5 rows).
df.head()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."


In [5]:
# In the next example, we list the specific columns to be displayed.

In [6]:
# Select the three columns by name in a single .loc call (":" = all rows) instead of
# chaining df.loc[:] with a second [[...]] lookup.
# Then drop postings without a yearly salary and show the 10 highest-paying ones.
(
    df.loc[:, ["job_title_short", "job_country", "salary_year_avg"]]
    .dropna(subset=["salary_year_avg"])
    .sort_values(by="salary_year_avg", ascending=False)
    .head(10)
)

Unnamed: 0,job_title_short,job_country,salary_year_avg
554784,Data Scientist,United States,960000.0
665811,Senior Data Scientist,South Africa,890000.0
168402,Data Analyst,India,650000.0
387378,Data Scientist,United States,585000.0
160521,Data Scientist,Sudan,550000.0
554140,Data Scientist,United States,525000.0
404932,Data Engineer,United States,525000.0
417241,Senior Data Scientist,United States,475000.0
618461,Senior Data Scientist,United States,463500.0
124265,Data Scientist,United States,450000.0


In [7]:
# Syntax of dataframe.loc[] of pandas
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html

In [8]:
# Now let's say we want to replace the NaN values of salary_year_avg with the median salary of that column.

In [9]:
# Median of the known yearly salaries (median() skips NaN values by default).
salary_year_avg_median = df["salary_year_avg"].median()

# fillna() returns a NEW Series; the result is only displayed here, not assigned back,
# so df itself still contains the NaN values afterwards.
df["salary_year_avg"].fillna(value=salary_year_avg_median)


0         115000.0
1         115000.0
2         115000.0
3         115000.0
4         115000.0
            ...   
785736    115000.0
785737    115000.0
785738    115000.0
785739    115000.0
785740    115000.0
Name: salary_year_avg, Length: 785741, dtype: float64

In [10]:
# Syntax of .fillna() of pandas
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html

In [11]:
# Now I want to check whether there is any duplicate entry in the DataFrame.

In [12]:
# duplicated() flags every row whose values all repeat an earlier row;
# any() collapses those flags into a single True/False answer.
df.duplicated().any()

True

In [13]:
# syntax for .duplicated() of pandas
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html


In [14]:
# Let's say I want to see those duplicates. For that, I first filter the DataFrame with the condition above.

In [15]:
# Use the duplicated() flags as a boolean row mask: only the repeated rows are kept
# (the first occurrence of each row is not flagged, so it is not included).
df_dup = df.loc[df.duplicated()]
df_dup

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
16905,Data Scientist,Consultant - Data Science & Analytics,Hong Kong,via BeBee 香港,Full-time,False,Hong Kong,2023-02-08 21:47:18,False,False,Hong Kong,,,,Sia Partners,"['python', 'r', 'sql', 'tableau']","{'analyst_tools': ['tableau'], 'programming': ..."
44240,Data Scientist,Data Scientist,"Saint-Gilles, Belgium",via BeBee Belgique,Full-time,False,Belgium,2023-01-05 06:42:48,False,False,Belgium,,,,Smals,"['r', 'sas', 'sas', 'python']","{'analyst_tools': ['sas'], 'programming': ['r'..."
54343,Data Scientist,PSAS Science and Analytics,Canada,via Trabajo.org,Full-time,False,Canada,2023-01-15 06:16:22,False,False,Canada,,,,AbeBooks,"['java', 'python', 'r', 'sql', 'aws', 'redshif...","{'analyst_tools': ['tableau'], 'cloud': ['aws'..."
57012,Data Analyst,Data Analyst,"Brno, Czechia",via Trabajo.org,Full-time,False,Czechia,2023-04-15 06:43:57,True,False,Czechia,,,,FE fundinfo,,
59403,Data Scientist,Data Scientist,"Xico, Ver., Mexico",via Trabajo.org,Full-time,False,Mexico,2023-01-15 06:19:01,False,False,Mexico,,,,Kantar,"['python', 'r', 'go', 'git']","{'other': ['git'], 'programming': ['python', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737894,Senior Data Engineer,Senior Data Engineer,"Toronto, ON, Canada",via Trabajo.org,Full-time,False,Canada,2023-11-16 21:13:03,False,False,Canada,,,,Loblaw Companies,"['scala', 'sql', 'sql server', 'bigquery', 'az...","{'cloud': ['bigquery', 'azure', 'databricks', ..."
739459,Data Analyst,Data Analyst,"Lisbon, Portugal",via BeBee Portugal,Full-time,False,Portugal,2023-02-27 21:21:41,False,False,Portugal,,,,Siemens,,
750309,Data Engineer,Data Analytics & AI Engineer,"Zürich, Switzerland",via Jobeo,Full-time,False,Switzerland,2023-05-01 20:24:15,False,False,Switzerland,,,,Swisscom,"['python', 'sql', 'qlik']","{'analyst_tools': ['qlik'], 'programming': ['p..."
762489,Data Engineer,Data Engineer,Anywhere,via LinkedIn,Full-time,True,Argentina,2023-07-25 20:15:59,False,False,Argentina,,,,Baufest,"['sql', 'java', 'oracle']","{'cloud': ['oracle'], 'programming': ['sql', '..."


In [16]:
# Now I want to get rid of these duplicates. For that, use .drop_duplicates():

In [17]:
# drop_duplicates() returns a new frame that keeps the first occurrence of each fully
# identical row; df itself is left unchanged and the original index labels are preserved.
df_cleaned = df.drop_duplicates()
df_cleaned

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785736,Software Engineer,DevOps Engineer,Singapura,melalui Trabajo.org,Pekerjaan tetap,False,Singapore,2023-03-13 06:16:16,False,False,Singapore,,,,CAREERSTAR INTERNATIONAL PTE. LTD.,"['bash', 'python', 'perl', 'linux', 'unix', 'k...","{'os': ['linux', 'unix'], 'other': ['kubernete..."
785737,Data Analyst,CRM Data Analyst,"Bad Rodach, Jerman",melalui BeBee Deutschland,Pekerjaan tetap,False,Germany,2023-03-12 06:18:18,False,False,Germany,,,,HABA FAMILYGROUP,"['sas', 'sas', 'sql', 'excel']","{'analyst_tools': ['sas', 'excel'], 'programmi..."
785738,Business Analyst,Commercial Analyst - Start Now,Malaysia,melalui Ricebowl,Pekerjaan tetap,False,Malaysia,2023-03-12 06:32:36,False,False,Malaysia,,,,Lendlease Corporation,"['powerpoint', 'excel']","{'analyst_tools': ['powerpoint', 'excel']}"
785739,Data Engineer,"Principal Associate, Data Engineer (Remote-Eli...","Newark, New Jersey, Amerika Serikat",melalui Recruit.net,Pekerjaan tetap,False,Sudan,2023-03-12 06:32:15,False,False,Sudan,,,,Capital One,"['python', 'go', 'nosql', 'sql', 'mongo', 'she...","{'cloud': ['aws', 'snowflake', 'azure', 'redsh..."


In [18]:
# Row and non-null counts after removing exact duplicate rows.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 785640 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785640 non-null  object        
 1   job_title              785639 non-null  object        
 2   job_location           784595 non-null  object        
 3   job_via                785632 non-null  object        
 4   job_schedule_type      772975 non-null  object        
 5   job_work_from_home     785640 non-null  bool          
 6   search_location        785640 non-null  object        
 7   job_posted_date        785640 non-null  datetime64[ns]
 8   job_no_degree_mention  785640 non-null  bool          
 9   job_health_insurance   785640 non-null  bool          
 10  job_country            785591 non-null  object        
 11  salary_rate            33066 non-null   object        
 12  salary_year_avg        22002 non-null   float64  

In [19]:
# Same summary for the original frame, to compare row counts with df_cleaned.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

In [20]:
# We can see that df_cleaned has fewer rows than df (785,640 vs. 785,741). Thus df_cleaned is the DataFrame without duplicates.

In [21]:
# Syntax for .drop_duplicates() for pandas
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html

In [22]:
# Now let's say I want to check only certain columns for duplication,
# say job_title and company_name: has any company registered the same job title more than once?
# Remember, this uses AND logic: a row is flagged as a duplicate (True) only if BOTH its job_title
# and its company_name match the values of an earlier row.

In [23]:
# Only job_title and company_name are compared; all other columns are ignored.
df_cleaned.duplicated(subset = ["job_title", "company_name"]).any()

True

In [24]:
# Uh-oh, we have such entries. Let's remove them.

In [25]:
# Keep only the first row for each (job_title, company_name) pair and rebind df_cleaned
# to the smaller frame (the info() output below shows 508,042 rows remaining).
# NOTE(review): this also discards postings that share a title and company but differ in
# other columns (e.g. location or posting date) -- confirm that is the intended definition
# of "duplicate" before using df_cleaned for counts or salary statistics.
df_cleaned = df_cleaned.drop_duplicates(subset = ["job_title", "company_name"])
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 508042 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        508042 non-null  object        
 1   job_title              508041 non-null  object        
 2   job_location           507389 non-null  object        
 3   job_via                508042 non-null  object        
 4   job_schedule_type      499091 non-null  object        
 5   job_work_from_home     508042 non-null  bool          
 6   search_location        508042 non-null  object        
 7   job_posted_date        508042 non-null  datetime64[ns]
 8   job_no_degree_mention  508042 non-null  bool          
 9   job_health_insurance   508042 non-null  bool          
 10  job_country            507997 non-null  object        
 11  salary_rate            20620 non-null   object        
 12  salary_year_avg        13091 non-null   float64  

In [26]:
# Now, if I want a random sample of the DataFrame

In [27]:
# Draw 10 rows at random. No random_state is given, so the rows differ on every run.
df.sample(10)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
182623,Data Engineer,REMOTE - Data Engineer / Developer (Python),"Costa Mesa, CA",via LinkedIn,Contractor,False,Sudan,2023-03-28 16:27:44,False,True,Sudan,,,,Platinum Resource Group,"['python', 'sql', 'nosql', 'postgresql', 'excel']","{'analyst_tools': ['excel'], 'databases': ['po..."
680439,Senior Data Engineer,Senior Data Engineer,"New Delhi, Delhi, India",via LinkedIn,Full-time,False,India,2023-05-05 10:30:23,False,False,India,,,,Perfect Job Consultancy,"['sql', 'java', 'python', 'aws', 'gcp', 'azure...","{'cloud': ['aws', 'gcp', 'azure'], 'libraries'..."
149732,Data Engineer,Data Engineer - Herts,"Hertfordshire, UK",via Leisure Jobs,Full-time,False,United Kingdom,2023-08-21 08:15:55,True,False,United Kingdom,,,,Haven Holidays,"['sql', 'snowflake', 'aws', 'redshift', 'azure...","{'cloud': ['snowflake', 'aws', 'redshift', 'az..."
747006,Data Analyst,Lead Data Analyst,Dubai - United Arab Emirates,via LinkedIn,Full-time,False,United Arab Emirates,2023-11-27 21:08:38,True,False,United Arab Emirates,,,,SGP Technology,"['sql', 'python', 'r', 'excel', 'tableau', 'po...","{'analyst_tools': ['excel', 'tableau', 'power ..."
327608,Senior Data Engineer,Senior BI Data Engineer,"Lisbon, Portugal",via BeBee Portugal,Full-time,False,Portugal,2023-06-06 23:30:50,False,False,Portugal,,,,ConvaTec,"['sql', 'mysql', 'sql server', 'oracle', 'ssis...","{'analyst_tools': ['ssis', 'tableau', 'power b..."
374494,Data Engineer,"Data Engineer, Mid","Bethesda, MD",via Adzuna,Full-time,False,"Florida, United States",2023-03-08 18:10:01,False,True,United States,,,,Booz Allen Hamilton,"['sql', 'nosql', 'elasticsearch', 'aws']","{'cloud': ['aws'], 'databases': ['elasticsearc..."
450679,Data Engineer,Data Engineer Lead,"Amsterdam, Netherlands",via LinkedIn,Full-time,False,Netherlands,2023-09-07 14:30:30,False,False,Netherlands,,,,ORTEC,"['go', 'azure', 'power bi', 'dax']","{'analyst_tools': ['power bi', 'dax'], 'cloud'..."
507844,Data Engineer,Sr. Data Engineer,"Toronto, ON, Canada",via BeBee Canada,Full-time,False,Canada,2023-01-02 11:30:20,False,False,Canada,,,,Diverse Lynx,"['python', 'sql', 'scala', 'pyspark']","{'libraries': ['pyspark'], 'programming': ['py..."
322379,Data Analyst,Data & Analytics Manager (Sustainable Energy,"Noord, Netherlands",via BeBee,Full-time,False,Netherlands,2023-07-23 17:29:43,False,False,Netherlands,,,,"Noord-Holland, Netherlands","['sql', 'python', 'bigquery', 'power bi']","{'analyst_tools': ['power bi'], 'cloud': ['big..."
776577,Data Engineer,Azure data Engineer,"Hyderabad, Telangana, India",via LinkedIn,Full-time,False,India,2023-01-19 06:14:13,True,False,India,,,,GITS RECRUITMENT PRIVATE LIMITED,"['sql', 'shell', 'powershell', 'azure', 'datab...","{'cloud': ['azure', 'databricks'], 'other': ['..."


In [28]:
# Say I want the same sample every time: fix random_state.

In [29]:
# A fixed random_state seeds the sampler, so these same 5 rows come back on every run.
df.sample(n=5, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
502901,Senior Data Engineer,Senior Data Engineer,"St Paul, MN",via BeBee,Full-time,False,"Florida, United States",2023-12-17 11:09:06,False,False,United States,,,,ManpowerGroup,"['sql', 'azure', 'git']","{'cloud': ['azure'], 'other': ['git'], 'progra..."
406337,Senior Data Scientist,Senior Analytics Engineer,"London, UK",via LinkedIn,Full-time,False,United Kingdom,2023-10-06 16:09:56,False,False,United Kingdom,,,,Harnham,"['sql', 'python', 'bigquery', 'snowflake', 'lo...","{'analyst_tools': ['looker'], 'cloud': ['bigqu..."
659951,Data Engineer,Data Engineer,"Newcastle upon Tyne, UK",via Indeed,Full-time,False,United Kingdom,2023-09-06 10:11:25,True,False,United Kingdom,,,,Morgan King,,
541593,Data Analyst,Data Analyst,Malta,via Trabajo.org,Full-time,False,Malta,2023-02-14 21:14:18,True,False,Malta,,,,Konnekt,['jira'],{'async': ['jira']}
425158,Senior Data Engineer,Senior Data Engineer. Job in Amsterdam NBC4i Jobs,"Amsterdam, Netherlands",via NBC4i Jobs,Full-time,False,Netherlands,2023-06-19 16:34:30,True,False,Netherlands,,,,Independent Recruiters,['python'],{'programming': ['python']}


In [30]:
# Repeat the seeded draw to confirm it returns exactly the same rows as the previous cell.
df.sample(n=5, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
502901,Senior Data Engineer,Senior Data Engineer,"St Paul, MN",via BeBee,Full-time,False,"Florida, United States",2023-12-17 11:09:06,False,False,United States,,,,ManpowerGroup,"['sql', 'azure', 'git']","{'cloud': ['azure'], 'other': ['git'], 'progra..."
406337,Senior Data Scientist,Senior Analytics Engineer,"London, UK",via LinkedIn,Full-time,False,United Kingdom,2023-10-06 16:09:56,False,False,United Kingdom,,,,Harnham,"['sql', 'python', 'bigquery', 'snowflake', 'lo...","{'analyst_tools': ['looker'], 'cloud': ['bigqu..."
659951,Data Engineer,Data Engineer,"Newcastle upon Tyne, UK",via Indeed,Full-time,False,United Kingdom,2023-09-06 10:11:25,True,False,United Kingdom,,,,Morgan King,,
541593,Data Analyst,Data Analyst,Malta,via Trabajo.org,Full-time,False,Malta,2023-02-14 21:14:18,True,False,Malta,,,,Konnekt,['jira'],{'async': ['jira']}
425158,Senior Data Engineer,Senior Data Engineer. Job in Amsterdam NBC4i Jobs,"Amsterdam, Netherlands",via NBC4i Jobs,Full-time,False,Netherlands,2023-06-19 16:34:30,True,False,Netherlands,,,,Independent Recruiters,['python'],{'programming': ['python']}


In [31]:
# Syntax of dataframe.sample() of pandas
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html

In [32]:
# If I want to make a copy of a dataframe

In [33]:
# deep=True (the default) copies the data and the index into a separate DataFrame object.
# Note: Python objects stored inside object-dtype columns are not copied recursively.
df_og = df.copy(deep=True)

In [34]:
# id() identifies the underlying object; the copy is a different object, so this is False.
id(df) == id(df_og)

False

In [35]:
# See that the ids of the df and df_og DataFrames are different.
# That means changes made to df do not affect df_og, and vice versa.
# If I assign with "=" instead, both variables point towards the same DataFrame, which is reflected in them having the same id.
# Thus, changes made through one are visible through the other.

In [36]:
# Plain assignment copies nothing: df_link is just a second name for the same object as df.
df_link = df

In [37]:
# Both names refer to one object, so their ids match and this is True.
id(df) == id(df_link)

True

In [38]:
# Syntax of .copy() for pandas
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html

In [39]:
# Let's understand the pivot table.
# Syntax of .pivot_table():
# https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html

# index: the column whose values become the row labels
# columns: the column whose values become the column headers

In [40]:
# Re-check the de-duplicated frame before pivoting; note how few rows have salary_year_avg.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 508042 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        508042 non-null  object        
 1   job_title              508041 non-null  object        
 2   job_location           507389 non-null  object        
 3   job_via                508042 non-null  object        
 4   job_schedule_type      499091 non-null  object        
 5   job_work_from_home     508042 non-null  bool          
 6   search_location        508042 non-null  object        
 7   job_posted_date        508042 non-null  datetime64[ns]
 8   job_no_degree_mention  508042 non-null  bool          
 9   job_health_insurance   508042 non-null  bool          
 10  job_country            507997 non-null  object        
 11  salary_rate            20620 non-null   object        
 12  salary_year_avg        13091 non-null   float64  

In [41]:
# Let's say I want a table showing the average salary for each job title at each company.
# We use job_title_short, company_name, and salary_year_avg.

In [42]:
# Keep only postings that report a yearly salary, since the pivot below averages that column.
df_cleaned = df_cleaned.dropna(subset=["salary_year_avg"])

In [43]:
# Sanity check: should be False now that rows without a yearly salary were dropped.
df_cleaned["salary_year_avg"].isna().any()

False

In [44]:
# Rows = companies, columns = short job titles, cells = mean yearly salary.
# Company/title combinations with no salaried posting appear as NaN.
df_cleaned.pivot_table(
    values="salary_year_avg",
    index="company_name",
    columns="job_title_short",
    aggfunc="mean",
)

job_title_short,Business Analyst,Cloud Engineer,Data Analyst,Data Engineer,Data Scientist,Machine Learning Engineer,Senior Data Analyst,Senior Data Engineer,Senior Data Scientist,Software Engineer
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
#twiceasnice Recruiting,,,63666.666667,,,,120000.0,,,
/dev/color,,,,,125000.0,,,,,
0nward Select,,,92500.000000,,,,,,,
1 Point System LLC.,,,,170000.0,,,,,,
"1-800-FLOWERS.COM, INC.",,,,,,,,,190000.0,
...,...,...,...,...,...,...,...,...,...,...
zooplus SE,,,,,,166000.0,,,,
ztp,,,,95000.0,,,,,,
ЛАНИТ,,,400000.000000,,,,,,,
Технологическая компания,,280000.0,,,,,,,,
