# Pandas Data Management


In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Load the 'lukebarousse/data_jobs' dataset and turn its 'train' split into a DataFrame
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data cleanup: run the job_posted_date column through pd.to_datetime
df['job_posted_date'] = df['job_posted_date'].pipe(pd.to_datetime)

In [6]:
# DataFrame Copy
df_original = df.copy()
# copy() (deep by default) returns a new DataFrame with its own copy of the data,
# so changes made to the copy do not affect the DataFrame it was copied from.

# Plain assignment does NOT create a new DataFrame: df_altered becomes a second
# name for the very same object as df_original, so a change made through one
# name is visible through the other.
df_altered = df_original

# Select rows labelled 0 through 5 of salary_year_avg (.loc slices include the
# end label, so this is the first 6 rows)
df_altered.loc[:5,'salary_year_avg']

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: salary_year_avg, dtype: float64

In [7]:
# Calculating the median salary
median_salary = df_altered['salary_year_avg'].median()

# Filling the missing values with the median salary, selecting the column with .loc[:, col].
# df_altered was bound to the same object as df_original by plain assignment,
# so this fill is visible through df_original as well.
df_altered['salary_year_avg'] = df_altered.loc[:,'salary_year_avg'].fillna(median_salary)

In [8]:
# Another way of filling the missing values with the median salary:
# select the column with df[col] instead of df.loc[:, col].
# When run right after the previous cell there are no NaNs left to fill.
df_altered['salary_year_avg'] = df_altered['salary_year_avg'].fillna(median_salary)

In [9]:
# Inspecting the altered DataFrame: rows 0-5 now hold the median salary instead of NaN
df_altered.loc[:5,'salary_year_avg']

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [10]:
# Inspecting the Original DataFrame: it shows the filled values too, because
# df_original and df_altered are two names for the same object
df_original.loc[:5,'salary_year_avg']

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [11]:
# Show each name's object id, then check directly whether both names refer to
# one and the same DataFrame object
print('ID of df_original:               ', id(df_original))
print('ID of df_altered:                ', id(df_altered))
print('Are the two dataframes the same? ', df_original is df_altered)

ID of df_original:                1214685406992
ID of df_altered:                 1214685406992
Are the two dataframes the same?  True


In [12]:
# Start over: this time df_altered gets its own copy instead of sharing the object
df_original = df.copy()
df_altered = df_original.copy()

# Show each name's object id, then check whether both names refer to the same object
print('ID of df_original:               ', id(df_original))
print('ID of df_altered:                ', id(df_altered))
print('Are the two dataframes the same? ', df_original is df_altered)

ID of df_original:                1214685842384
ID of df_altered:                 1214685850640
Are the two dataframes the same?  False


In [13]:
# Median of the yearly salary column
median_salary = df_altered['salary_year_avg'].median()

# Replace the missing salaries with that median; df_altered was created with
# copy(), so the fill is applied to its own data
df_altered['salary_year_avg'] = df_altered['salary_year_avg'].fillna(value=median_salary)

# Show rows 0 through 5 of the filled column
df_altered.loc[:5, 'salary_year_avg']

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [15]:
# Inspect rows 0-5 of df_original after df_altered (an independent copy this time)
# had its missing salaries filled
df_original.loc[:5,'salary_year_avg']

# See the difference: df_original still shows NaN, whereas with the plain
# assignment earlier the fill showed up in df_original too

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: salary_year_avg, dtype: float64

In [17]:
# sample(): draw a random sample of rows
# Get 5 random rows of the data; random_state fixes the seed so that
# re-running the notebook returns the same 5 rows
df.sample(n=5, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
228112,Machine Learning Engineer,Machine Learning Engineer,"Bogotá, Bogota, Colombia",via BeBee,Full-time,False,Colombia,2023-07-22 07:30:13,False,False,Colombia,,,,Bold,"['python', 'dynamodb', 'aws', 'azure', 'gcp', ...","{'cloud': ['aws', 'azure', 'gcp'], 'databases'..."
538573,Software Engineer,BI Developer SAP BW4/HANA (m/f/x) (Inhouse),"Cologne, Germany (+1 other)",via StepStone,Full-time,False,Germany,2023-02-12 19:35:52,True,False,Germany,,,,Deloitte,['sap'],{'analyst_tools': ['sap']}
723235,Data Engineer,Data Engineer,"Mexico City, CDMX, Mexico",via BeBee México,Full-time,False,Mexico,2023-03-26 21:30:18,True,False,Mexico,,,,Connectingology,"['sql', 'sql server', 'mysql', 'power bi', 'ta...","{'analyst_tools': ['power bi', 'tableau'], 'da..."
141994,Data Engineer,Service Operations Specialist,Luxembourg,via Emplois Trabajo.org,Full-time,False,Luxembourg,2023-12-13 09:31:12,False,False,Luxembourg,,,,Goodyear,"['go', 'sap']","{'analyst_tools': ['sap'], 'programming': ['go']}"
610321,Senior Data Engineer,Senior Data Engineer,"New York, NY",via ZipRecruiter,Full-time,False,"Texas, United States",2023-12-10 12:07:59,False,False,United States,,,,Publicis Sapient,"['nosql', 'sql', 'python', 'java', 'javascript...","{'cloud': ['azure', 'aws', 'redshift', 'oracle..."


In [18]:
# Randomly select a fraction of the data (frac=0.1 -> 10% of the rows).
# replace=False samples without replacement, so no row is drawn more than once;
# random_state fixes the seed so the same rows come back on every run.
df.sample(frac=0.1, replace=False, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
448453,Data Scientist,Principal Product Data Scientist - Client Foun...,"London, UK (+1 other)",via Hitmarker,Full-time,False,United Kingdom,2023-05-31 14:19:13,False,False,United Kingdom,,,,King,"['sql', 'excel', 'flow']","{'analyst_tools': ['excel'], 'other': ['flow']..."
528937,Software Engineer,Senior Desktop Support Engineer,"Docklands VIC, Australia",via Jobs Trabajo.org,Full-time,False,Australia,2023-04-18 11:12:56,True,False,Australia,,,,Equifax,['windows'],{'os': ['windows']}
517892,Senior Data Scientist,Senior data scientist / ML engineer,"Ghent, Belgium",via LinkedIn Belgium,Full-time,False,Belgium,2023-07-14 11:56:17,False,False,Belgium,,,,BioLizard,"['python', 'r']","{'programming': ['python', 'r']}"
584129,Data Engineer,Principal Data Engineer - Analytics Platform,"Amsterdam, Netherlands",via LinkedIn,Full-time,False,Netherlands,2023-10-12 10:12:53,False,False,Netherlands,,,,Booking.com,"['java', 'sql', 'python', 'snowflake', 'aws', ...","{'cloud': ['snowflake', 'aws'], 'libraries': [..."
203700,Data Scientist,Data Scientist & Analyst,Argentina,via LinkedIn,Full-time,False,Argentina,2023-11-14 22:11:57,False,False,Argentina,,,,NCR Atleos,"['sql', 'nosql', 'python', 'tableau']","{'analyst_tools': ['tableau'], 'programming': ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533034,Business Analyst,"Analyst, Master Planning",Hong Kong,via Trabajo.org,Full-time,False,Hong Kong,2023-01-08 19:40:59,False,False,Hong Kong,,,,Unicircuit,"['vba', 'word', 'excel', 'powerpoint']","{'analyst_tools': ['word', 'excel', 'powerpoin..."
503583,Data Engineer,Data Engineer. Job in Brussel My Valley Jobs T...,"Brussels, Belgium",via My Valley Jobs Today,Full-time,False,Belgium,2023-02-25 11:10:49,False,False,Belgium,,,,Datalumen,"['sas', 'sas']","{'analyst_tools': ['sas'], 'programming': ['sa..."
606435,Senior Data Scientist,Senior Data Scientist - Remote,Anywhere,via LinkedIn,Full-time,True,Poland,2023-10-14 09:15:22,False,False,Poland,,,,Akamai Technologies,"['python', 'bash', 'sql']","{'programming': ['python', 'bash', 'sql']}"
745297,Data Scientist,Consultant SME (Principal Data Scientist-Data ...,"Raleigh, NC",via Dice,Contractor,False,"New York, United States",2023-07-10 20:02:53,False,False,United States,,,,Lucid Technologies,"['python', 'perl', 'ruby', 'ruby', 'sas', 'sas...","{'analyst_tools': ['sas'], 'cloud': ['azure', ..."
