# Glassdoor Data Science (DS) Jobs; Data Refining

This project focuses on transforming raw, unprocessed DS jobs dataset into a more refined, accurate, and consistent format, making it suitable for analysis, modeling, or other applications.

### Importing Libraries

In [15]:
# Import neccessary libraries

import pandas as pd
import numpy as np
import re # for regex

In [16]:
# Load Dataset
url = "Uncleaned_DS_jobs.csv"
df = pd.read_csv(url,index_col= "index")

### Understanding the Data Structure

In [17]:
# First 5 rows
df.head()

Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


In [18]:
# Last 5 rows
df.tail()

Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
667,Data Scientist,$105K-$167K (Glassdoor est.),Summary\n\nWe’re looking for a data scientist ...,3.6,TRANZACT\n3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1
668,Data Scientist,$105K-$167K (Glassdoor est.),Job Description\nBecome a thought leader withi...,-1.0,JKGT,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
669,Data Scientist,$105K-$167K (Glassdoor est.),Join a thriving company that is changing the w...,-1.0,AccessHope,"Irwindale, CA",-1,-1,-1,-1,-1,-1,-1,-1
670,Data Scientist,$105K-$167K (Glassdoor est.),100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated\n5.0,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1
671,Data Scientist,$105K-$167K (Glassdoor est.),Description\n\nThe Data Scientist will be part...,2.7,1-800-Flowers\n2.7,"New York, NY","Carle Place, NY",1001 to 5000 employees,1976,Company - Public,Wholesale,Business Services,$1 to $2 billion (USD),-1


In [19]:
# Dataset columns
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors'],
      dtype='object')

In [20]:
# Dimension of dataset
df.shape

(672, 14)

In [21]:
# Data information
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 672 entries, 0 to 671
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          672 non-null    object 
 1   Salary Estimate    672 non-null    object 
 2   Job Description    672 non-null    object 
 3   Rating             672 non-null    float64
 4   Company Name       672 non-null    object 
 5   Location           672 non-null    object 
 6   Headquarters       672 non-null    object 
 7   Size               672 non-null    object 
 8   Founded            672 non-null    int64  
 9   Type of ownership  672 non-null    object 
 10  Industry           672 non-null    object 
 11  Sector             672 non-null    object 
 12  Revenue            672 non-null    object 
 13  Competitors        672 non-null    object 
dtypes: float64(1), int64(1), object(12)
memory usage: 78.8+ KB


In [22]:
# Summary statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,672.0,3.518601,1.410329,-1.0,3.3,3.8,4.3,5.0
Founded,672.0,1635.529762,756.74664,-1.0,1917.75,1995.0,2009.0,2019.0


### Data Cleaning

#### Handling Missing Values

In [23]:
# Check for null values
df.isnull().sum()

Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
dtype: int64

There are no null values in the dataset.

#### Checking for duplicates

In [24]:
# Verifying if there are duplicates
df.duplicated().any()

True

As shown above, there are duplicated rows

In [25]:
# Total of duplicates
df.duplicated().sum()

13

In [26]:
duplicated_rows = df[df.duplicated()]
duplicated_rows

Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
135,Machine Learning Engineer,$90K-$109K (Glassdoor est.),Role Description\nTriplebyte screens and evalu...,3.2,Triplebyte\n3.2,Remote,"San Francisco, CA",51 to 200 employees,2015,Company - Private,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,-1
136,Senior Data Engineer,$90K-$109K (Glassdoor est.),Lendio is looking to fill a position for a Sen...,4.9,Lendio\n4.9,"Lehi, UT","Lehi, UT",201 to 500 employees,2011,Company - Private,Lending,Finance,$50 to $100 million (USD),-1
358,Data Scientist,$122K-$146K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
359,Data Scientist,$122K-$146K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
360,Data Scientist,$122K-$146K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
361,Data Scientist,$122K-$146K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
362,Data Scientist,$122K-$146K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
389,Data Scientist,$110K-$163K (Glassdoor est.),"Job Description\nAs a Data Scientist, you will...",-1.0,HireAi,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
496,Data Scientist,$95K-$119K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
497,Data Scientist,$95K-$119K (Glassdoor est.),Job Overview: The Data Scientist is a key memb...,-1.0,Hatch Data Inc,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1


In [27]:
# Dropping duplicates
df = df.drop_duplicates()

#### Remove unnecessary columns
This enables us to eliminate columns that don't add value, making the data easier to work with.

In [28]:
# Drop unnecessary columns
df = df.drop(columns =["Headquarters", "Founded", "Competitors"], axis = 1)
df.head()

Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Type of ownership,Industry,Sector,Revenue
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY",1001 to 5000 employees,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA",5001 to 10000 employees,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD)
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA",1001 to 5000 employees,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD)
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA",501 to 1000 employees,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD)
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY",51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable


In [30]:
# Datatypes in the dataset
df.dtypes

Job Title             object
Salary Estimate       object
Job Description       object
Rating               float64
Company Name          object
Location              object
Size                  object
Type of ownership     object
Industry              object
Sector                object
Revenue               object
dtype: object

Most of the columns are already in their appropriate data types, eliminating the need for conversion.

### Standardize Data Format
We standardize data formats to ensure consistency, facilitates comaprison, improve data integrity, enhances data usability and supports automation of data processing and analysis tasks.

#### Job Title

In [None]:
# Random values in the column
df["Job Title"].sample(10)

index
166                                  Senior Data Analyst
524                                       Data Scientist
69                     Data Scientist - Machine Learning
599                                       Data Scientist
548          Data Engineer (Analytics, SQL, Python, AWS)
347                                Data Engineer - Kafka
612                                Senior Data Scientist
348                                       Data Scientist
254                                       Data Scientist
391    Software Engineer (Data Scientist, C,C++,Linux...
Name: Job Title, dtype: object

In [31]:
def clean_title(value):
  """
  Clean and format job title

Parameters:
value (str): The raw job title

Returns:
str: The cleaned and formatted job title
"""

  clean_value = re.split('[-/–(,]', value)[0]
  return clean_value.title()

# Apply the function to the column
df["Job Title"] = df["Job Title"].apply(clean_title)

In [33]:
# Verifying the update
df["Job Title"].sample(10)

index
403          Sr. Data Analyst
347            Data Engineer 
363     Senior Data Scientist
204    Senior Data Scientist 
74     Purification Scientist
255            Data Scientist
492            Data Scientist
434            Data Scientist
10             Data Scientist
150            Data Scientist
Name: Job Title, dtype: object

The job titles require further refinement to ensure consistency. Specifically, prefixes such as 'Sr', 'Senior', 'Jr', and 'Junior' need to be standardized. The goal is to uniformly format these prefixes and append them to the corresponding job titles, ensuring a consistent naming convention across the dataset.

In [34]:
def title_category(text):
  """
   Function cleans and categorizes job titles
   by removing unnecessary designations and adding uniform prefixes.

Parameters:
text (str): The raw job title

Returns:
str: The cleaned and categorized job title
  """
  text = text.strip()  # Removes leading/trailing spaces

# Check for and remove Senior or Junior designations
  senior_keywords = ["Sr.", "Sr", "(Sr.)", "Senior"]
  junior_keywords = ["Jr.", "Jr", "(Jr.)", "Junior"]

  is_senior = any(substring in text for substring in senior_keywords)
  is_junior = any(substring in text for substring in junior_keywords)

# Remove the senior/junior designations from the text
  for keyword in senior_keywords + junior_keywords:
      text = re.sub(re.escape(keyword), '', text, flags=re.IGNORECASE)

      text = text.strip()  # Remove any extra spaces after replacements

# Add the Senior or Junior prefix if necessary
  if is_senior:
      return "Senior " + text
  elif is_junior:
      return "Junior " + text
  else:
      return text

# Apply the function to the Job Title column
df["Job Title"] = df["Job Title"].apply(title_category)

In [36]:
# Verify update
df["Job Title"].sample(10)

index
235                          Data Scientist
344                          Data Scientist
307                          Data Scientist
345    Senior Business Intelligence Analyst
379                          Data Scientist
124           Business Intelligence Analyst
374                               Scientist
315                          Data Scientist
212                          Data Scientist
300                          Data Scientist
Name: Job Title, dtype: object

#### Salary Estimate

In [None]:
# Random values in the column
df["Salary Estimate"].sample(5)

index
45     $75K-$131K (Glassdoor est.)
74     $79K-$131K (Glassdoor est.)
621    $87K-$141K (Glassdoor est.)
199    $79K-$106K (Glassdoor est.)
432    $79K-$133K (Glassdoor est.)
Name: Salary Estimate, dtype: object

In [None]:
# Unique values
df["Salary Estimate"].unique()

array(['$137K-$171K (Glassdoor est.)', '$75K-$131K (Glassdoor est.)',
       '$79K-$131K (Glassdoor est.)', '$99K-$132K (Glassdoor est.)',
       '$90K-$109K (Glassdoor est.)', '$101K-$165K (Glassdoor est.)',
       '$56K-$97K (Glassdoor est.)', '$79K-$106K (Glassdoor est.)',
       '$71K-$123K (Glassdoor est.)', '$90K-$124K (Glassdoor est.)',
       '$91K-$150K (Glassdoor est.)', '$141K-$225K (Glassdoor est.)',
       '$145K-$225K(Employer est.)', '$79K-$147K (Glassdoor est.)',
       '$122K-$146K (Glassdoor est.)', '$112K-$116K (Glassdoor est.)',
       '$110K-$163K (Glassdoor est.)', '$124K-$198K (Glassdoor est.)',
       '$79K-$133K (Glassdoor est.)', '$69K-$116K (Glassdoor est.)',
       '$31K-$56K (Glassdoor est.)', '$95K-$119K (Glassdoor est.)',
       '$212K-$331K (Glassdoor est.)', '$66K-$112K (Glassdoor est.)',
       '$128K-$201K (Glassdoor est.)', '$138K-$158K (Glassdoor est.)',
       '$80K-$132K (Glassdoor est.)', '$87K-$141K (Glassdoor est.)',
       '$92K-$155K (Glassdo

Here, we convert salary data to numerical format by:

- Splitting salary ranges into separate minimum and maximum columns
- Replacing 'K' with '000' to represent thousands
- Converting the data type to numerical (int or float) for analysis

In [37]:
# Splitting Salary Estimate Column
df[["Minimum Salary ($)", "Maximum Salary ($)"]] = df["Salary Estimate"].str.split("-", n =2, expand = True)
df.head()

Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Type of ownership,Industry,Sector,Revenue,Minimum Salary ($),Maximum Salary ($)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Senior Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY",1001 to 5000 employees,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,$137K,$171K (Glassdoor est.)
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA",5001 to 10000 employees,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),$137K,$171K (Glassdoor est.)
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA",1001 to 5000 employees,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),$137K,$171K (Glassdoor est.)
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA",501 to 1000 employees,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),$137K,$171K (Glassdoor est.)
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY",51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,$137K,$171K (Glassdoor est.)


In [38]:
# Drop Salary Estimate column to avoid redundancy
df = df.drop(columns= ["Salary Estimate"])

#### Minimum Salary

In [39]:
# Unique values
df["Minimum Salary ($)"].unique()

array(['$137K', '$75K', '$79K', '$99K', '$90K', '$101K', '$56K', '$71K',
       '$91K', '$141K', '$145K', '$122K', '$112K', '$110K', '$124K',
       '$69K', '$31K', '$95K', '$212K', '$66K', '$128K', '$138K', '$80K',
       '$87K', '$92K', '$105K'], dtype=object)

In [40]:
def min_salary(value, k = 1000):
  """
  This function converts minimum salary to a numerical value.

Parameters:
value (str): The raw minimum salary
k (int): The multiplier for 'K' (default: 1000)

Returns:
int: The cleaned and converted minimum salary

  """
  # Remove unwanted characters using regex
  min_col = re.sub(r"\$", "", value)

  # Convert to integer and multiply by k if 'K' is present
  if "K" in value:
    min_col = int(min_col.replace("K", "")) * k
  else:
    min_col = int(min_col)
  return min_col


# Apply the function to the Minimum Salary column
df["Minimum Salary ($)"] = df["Minimum Salary ($)"].apply(min_salary)

# Display the unique values in the Minimum Salary column
df["Minimum Salary ($)"].unique()

array([137000,  75000,  79000,  99000,  90000, 101000,  56000,  71000,
        91000, 141000, 145000, 122000, 112000, 110000, 124000,  69000,
        31000,  95000, 212000,  66000, 128000, 138000,  80000,  87000,
        92000, 105000])

In [41]:
def max_salary(value, k = 1000):
  """
    This function converts maximum salary to a numerical value.

    Parameters:
    value (str): The raw maximum salary
    k (int): The multiplier for 'K' (default: 1000)

    Returns:
    int: The cleaned and converted maximum salary
  """
  pattern = r"\([^)]*\)|[^\dK]"
  max_col = re.sub(pattern, "", value)

  # Convert to integer and multiply by k if 'K' is present
  if "K" in value:
     max_col = int(max_col.replace("K", "")) * k
  else:
    max_col = int(max_col)
  return max_col


# Apply the function to the Maximum Salary column
df["Maximum Salary ($)"] = df["Maximum Salary ($)"].apply(max_salary)

# Display unique values
df["Maximum Salary ($)"].unique()

array([171000, 131000, 132000, 109000, 165000,  97000, 106000, 123000,
       124000, 150000, 225000, 147000, 146000, 116000, 163000, 198000,
       133000,  56000, 119000, 331000, 112000, 201000, 158000, 141000,
       155000, 167000])

#### Job Description

In [None]:
# Random values
df["Job Description"].sample(10)

index
660    Location: Redmond, WA\nClient: Microsoft (Un-M...
520    Role: Data ScientistÂ\n\nLocation: Washington,...
318    Analytics - Business Assurance Data Analyst (C...
514    Please review the job details below.\nWe are l...
201    Company information\n\nCoverent is a specializ...
143    A chance to provide active support to our spon...
452    *Introduction**As a Data Scientist at IBM, you...
303    Role Description\nAs a data scientist at Tripl...
90     Do you have expertise in, and passion for appl...
426    Company information\n\nCoverent is a specializ...
Name: Job Description, dtype: object

In [None]:
# Viewing index 4 before clean up
df["Job Description"].loc[5]

"About Us:\n\nHeadquartered in beautiful Santa Barbara, HG Insights is the global leader in technology intelligence. HG Insights uses advanced data science methodologies to help the world's largest technology firms and the fastest growing companies accelerate their sales, marketing, and strategy efforts.\n\nWe offer a casual yet professional environment. Get your sweat on at one of our fitness classes or go for a run along the beach which is two blocks away. You can find employees riding bikes to lunch in the funk zone or hanging out in one of our collaboration spaces. We are passionate about our jobs with a get-it-done attitude, yet we don't take ourselves too seriously.\n\nWhat You'll Do:\n\nWe are looking for a data scientist with software development or data engineering background to join our research team which reports directly to the CTO. We are a rapidly growing company with small focused engineering teams that deliver innovative features to a fast growing market. We build big-d

In [63]:
def remove_newlines(x):
  """
  This funtion removes all new lines

  Prameter:
  x(str): The raw text
  Returns:
  text(str): cleaned text
  """
  cleaned_col = re.sub(r"^.*?\n", "",x) # removes everything before the first newline
  cleaned_col = re.sub(r"\n", " ",cleaned_col) # removes the second newline and spaces
  return cleaned_col.title()

# Apply function to job description column
df["Job Description"] = df["Job Description"].apply(remove_newlines)

# verifying the change
df["Job Description"].sample(5)

index
117     Client Group Technology Creates And Supports ...
266      The Data Scientist Is A Non-Supervisory Posi...
312    We Are Proud Of Our Passionate, High-Performan...
104    Intuit Is Hiring A Senior Data Scientist To Fo...
548     Solid Reputation, Passionate People And Endle...
Name: Job Description, dtype: object

In [50]:
# Verifying the effectiveness of change in index 4
df["Job Description"][5]

" Headquartered in beautiful Santa Barbara, HG Insights is the global leader in technology intelligence. HG Insights uses advanced data science methodologies to help the world's largest technology firms and the fastest growing companies accelerate their sales, marketing, and strategy efforts.  We offer a casual yet professional environment. Get your sweat on at one of our fitness classes or go for a run along the beach which is two blocks away. You can find employees riding bikes to lunch in the funk zone or hanging out in one of our collaboration spaces. We are passionate about our jobs with a get-it-done attitude, yet we don't take ourselves too seriously.  What You'll Do:  We are looking for a data scientist with software development or data engineering background to join our research team which reports directly to the CTO. We are a rapidly growing company with small focused engineering teams that deliver innovative features to a fast growing market. We build big-data systems utiliz

#### Job Rating

In [None]:
# Unique values
df["Rating"].unique()

array([ 3.1,  4.2,  3.8,  3.5,  2.9,  3.9,  4.4,  3.6,  4.5,  4.7,  3.7,
        3.4,  4.1,  3.2,  4.3,  2.8,  5. ,  4.8,  3.3,  2.7,  2.2,  2.6,
        4. ,  2.5,  4.9,  2.4, -1. ,  2.3,  4.6,  3. ,  2.1,  2. ])

In [64]:
def clean_rating(col_value):
  """
  Cleans and replaces invalid ratings with NaN

Parameters:
col_value (float): The rating value

Returns:
 The cleaned rating value
 """

  if col_value == -1.0:
    return pd.NA
  else:
    return col_value

# Apply function to column
df["Rating"] = df["Rating"].apply(clean_rating)

# Unique values
df["Rating"].unique()

array([3.1, 4.2, 3.8, 3.5, 2.9, 3.9, 4.4, 3.6, 4.5, 4.7, 3.7, 3.4, 4.1,
       3.2, 4.3, 2.8, 5.0, 4.8, 3.3, 2.7, 2.2, 2.6, 4.0, 2.5, 4.9, 2.4,
       <NA>, 2.3, 4.6, 3.0, 2.1, 2.0], dtype=object)

#### Company Name

In [None]:
# Random rows in company name column
df["Company Name"].sample(20)

index
494             Tygart Technology, Inc\n4.7
356                   Nolij Consulting\n3.9
472                    Western Digital\n3.5
137                            Upstart\n4.2
462                       Numeric, LLC\n3.2
69                              CareDx\n2.5
142                   Tempo Automation\n3.3
175                        Tempus Labs\n3.3
279          Underwriters Laboratories\n3.3
232                   BWX Technologies\n3.3
633    Central Business Solutions, Inc\n3.0
294                      Colony Brands\n3.7
185                         MassMutual\n3.7
219                        Tempus Labs\n3.3
559               E3 Federal Solutions\n4.5
148                     GNS Healthcare\n2.9
357                          Hatch Data Inc
365                             Takeda\n3.7
318    GreatAmerica Financial Services\n4.6
324              Intellectual Ventures\n3.3
Name: Company Name, dtype: object

In [65]:
def clean_company_name(name):
  """
  function cleans and formats company names.

  Parameters:
  name (str): The raw company name

  Returns:
  str: The cleaned and formatted company name
  """
  company_name = re.sub(r"\n.*","",name).title()
  return company_name

  # Apply function to column
df["Company Name"] = df["Company Name"].apply(clean_company_name)

# Display first 5 rows
df["Company Name"].head()

index
0           Healthfirst
1               Mantech
2        Analysis Group
3               Inficon
4    Affinity Solutions
Name: Company Name, dtype: object

#### Size

In [None]:
# Unique values in size
df["Size"].unique()

array(['1001 to 5000 employees', '5001 to 10000 employees',
       '501 to 1000 employees', '51 to 200 employees', '10000+ employees',
       '201 to 500 employees', '1 to 50 employees', '-1', 'Unknown'],
      dtype=object)

We are to remove unnecessary words and characters (e.g., "employees", "to")

In [66]:
# Filter -1(negative 1) and unknown rows
df_filter = df[["Size"]][df["Size"].isin(["Unknown", "-1"])]
print("Number of rows:",len(df_filter))

# Display
print(df_filter)

# Alternatively
#df_filter = df[(df["Size"]=="Unknown")|(df["Size"]=="-1")]


Number of rows: 33
          Size
index         
154         -1
158         -1
189    Unknown
193    Unknown
258    Unknown
261    Unknown
274    Unknown
282    Unknown
285    Unknown
308    Unknown
351         -1
357         -1
388         -1
409    Unknown
424    Unknown
430    Unknown
444    Unknown
459         -1
495         -1
513    Unknown
519         -1
524    Unknown
555         -1
568    Unknown
595    Unknown
613         -1
615    Unknown
650         -1
656         -1
657         -1
660         -1
668         -1
669         -1


In [67]:
def clean_size(text):
  """
  function to clean and format company size data.

  Parameters:
  text (str): The raw company size

  Returns:
  clean_col: The cleaned company size or NaN if unknown
  """

  if text in ["Unknown", "-1"]:
    return pd.NA
  else:
    clean_col = text.replace("employees", "").replace("to", "-").replace(" ", "")
    return clean_col

# Applying the function
df["Size"] = df["Size"].apply(clean_size)

# Display first 5 rows
df["Size"].head()

index
0     1001-5000
1    5001-10000
2     1001-5000
3      501-1000
4        51-200
Name: Size, dtype: object

#### Type of Ownership

In [None]:
# Random Values
df["Type of ownership"].sample(5)

index
436    Company - Private
382             Contract
387     Company - Public
225    Company - Private
534    Company - Private
Name: Type of ownership, dtype: object

In [None]:
# Unique values
df["Type of ownership"].unique()

array(['Nonprofit Organization', 'Company - Public',
       'Private Practice / Firm', 'Company - Private', 'Government',
       'Subsidiary or Business Segment', 'Other Organization', '-1',
       'Unknown', 'Hospital', 'Self-employed', 'College / University',
       'Contract'], dtype=object)

We have to filter out rows with (-1) values

In [None]:
# Filter out rows
filter_type = df[["Type of ownership"]][df["Type of ownership"].isin(["-1"])]

# Print
print("Number of rows",len(filter_type))
print(filter_type)

Number of rows 16
      Type of ownership
index                  
154                  -1
158                  -1
351                  -1
357                  -1
388                  -1
459                  -1
495                  -1
519                  -1
555                  -1
613                  -1
650                  -1
656                  -1
657                  -1
660                  -1
668                  -1
669                  -1


In [68]:
def clean_type(type_col):
  """
  function cleans and formats the type of ownership data.

  Parameters:
  type_col (str): The raw type of ownership

  Returns:
 col_strip: The cleaned type of ownership or NaN if unknown
 """

  if type_col in ["-1"]:
    return pd.NA
  else:
    return type_col.strip()

# Apply function to column
df["Type of ownership"] = df["Type of ownership"].apply(clean_type)

# Random values
df["Type of ownership"].sample(5)

index
600    Company - Private
349    Company - Private
661     Company - Public
481    Company - Private
515           Government
Name: Type of ownership, dtype: object

In [69]:
#Unique values
df["Type of ownership"].unique()

array(['Nonprofit Organization', 'Company - Public',
       'Private Practice / Firm', 'Company - Private', 'Government',
       'Subsidiary or Business Segment', 'Other Organization', <NA>,
       'Unknown', 'Hospital', 'Self-employed', 'College / University',
       'Contract'], dtype=object)

#### Industry

In [None]:
# Random values
df["Industry"].sample(10)

index
344    Enterprise Software & Network Solutions
33                          Insurance Carriers
656                                         -1
616                                       Rail
334                  Biotech & Pharmaceuticals
261                    Advertising & Marketing
15                            Federal Agencies
612                     Research & Development
155                  Biotech & Pharmaceuticals
609                        Aerospace & Defense
Name: Industry, dtype: object

In [70]:
# Filter out the negative (-1) values
filter_indust = df[["Industry"]][df["Industry"].isin(["-1"])]

# Print
print("Number of rows:", len(filter_indust))

Number of rows: 60


In [71]:
def clean_industry(value):
  """
  Cleans and formats industry data.

  Parameters:
  value (str): The raw industry value

  Returns:
  value: The cleaned industry value or NaN if unknown
  """
  if value in ["-1"]:
    return pd.NA
  else:
    return value.title()

# Apply function to column
df["Industry"] = df["Industry"].apply(clean_industry)

# Random values to verify update
df["Industry"].sample(5)

index
265                      Consulting
460         Advertising & Marketing
351                            <NA>
385    Computer Hardware & Software
396                     It Services
Name: Industry, dtype: object

#### Sector

In [None]:
# Unique values
df["Sector"].unique()

array(['Insurance', 'Business Services', 'Manufacturing',
       'Information Technology', 'Biotech & Pharmaceuticals', 'Retail',
       'Oil, Gas, Energy & Utilities', 'Government', 'Health Care',
       'Finance', 'Aerospace & Defense', '-1',
       'Transportation & Logistics', 'Media', 'Telecommunications',
       'Real Estate', 'Travel & Tourism', 'Agriculture & Forestry',
       'Education', 'Accounting & Legal', 'Non-Profit',
       'Construction, Repair & Maintenance', 'Consumer Services'],
      dtype=object)

In [72]:
def clean_sector(col_value):
  """
  Cleans and formats sector data.

  Parameters:
  col_value (str): The raw sector value

  Returns:
  col_value: The cleaned sector value or NaN if unknown
  """
  if col_value == "-1":
    return pd.NA
  else:
    return col_value

# Apply function to column
df["Sector"] = df["Sector"].apply(clean_sector)

# Unique values to verify update
df["Sector"].unique()

array(['Insurance', 'Business Services', 'Manufacturing',
       'Information Technology', 'Biotech & Pharmaceuticals', 'Retail',
       'Oil, Gas, Energy & Utilities', 'Government', 'Health Care',
       'Finance', 'Aerospace & Defense', <NA>,
       'Transportation & Logistics', 'Media', 'Telecommunications',
       'Real Estate', 'Travel & Tourism', 'Agriculture & Forestry',
       'Education', 'Accounting & Legal', 'Non-Profit',
       'Construction, Repair & Maintenance', 'Consumer Services'],
      dtype=object)

#### Revenue

In [None]:
# Unique values
df["Revenue"].unique()

array(['Unknown / Non-Applicable', '$1 to $2 billion (USD)',
       '$100 to $500 million (USD)', '$10+ billion (USD)',
       '$2 to $5 billion (USD)', '$500 million to $1 billion (USD)',
       '$5 to $10 billion (USD)', '$10 to $25 million (USD)',
       '$25 to $50 million (USD)', '$50 to $100 million (USD)',
       '$1 to $5 million (USD)', '$5 to $10 million (USD)',
       'Less than $1 million (USD)', '-1'], dtype=object)

In [73]:
def clean_revenue(text):
  """
  Cleans and formats revenue data.

  Parameters:
  text (str): The raw revenue value

  Returns:
 text: The cleaned revenue value or NaN if unknown
 """

  if text in ["Unknown / Non-Applicable","-1"]:
    return pd.NA
  else:
    return text.replace("(USD)","").replace("to", "-").replace("Less than", "<").strip()

# Apply function to column
df["Revenue"] = df["Revenue"].apply(clean_revenue)

# Display unique values to verify change
df["Revenue"].unique()

array([<NA>, '$1 - $2 billion', '$100 - $500 million', '$10+ billion',
       '$2 - $5 billion', '$500 million - $1 billion', '$5 - $10 billion',
       '$10 - $25 million', '$25 - $50 million', '$50 - $100 million',
       '$1 - $5 million', '$5 - $10 million', '< $1 million'],
      dtype=object)

In [74]:
# Rename column
df = df.rename(columns={"Revenue": "Revenue ($)"})

In [75]:
# Viewing the cleaned dataset
df.head()

Unnamed: 0_level_0,Job Title,Job Description,Rating,Company Name,Location,Size,Type of ownership,Industry,Sector,Revenue ($),Minimum Salary ($),Maximum Salary ($)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Senior Data Scientist,The Senior Data Scientist Is Responsible For ...,3.1,Healthfirst,"New York, NY",1001-5000,Nonprofit Organization,Insurance Carriers,Insurance,,137000,171000
1,Data Scientist,Join The Top Information Technology And Analy...,4.2,Mantech,"Chantilly, VA",5001-10000,Company - Public,Research & Development,Business Services,$1 - $2 billion,137000,171000
2,Data Scientist,Analysis Group Is One Of The Largest Interna...,3.8,Analysis Group,"Boston, MA",1001-5000,Private Practice / Firm,Consulting,Business Services,$100 - $500 million,137000,171000
3,Data Scientist,Do You Have A Passion For Data And Machine Le...,3.5,Inficon,"Newton, MA",501-1000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 - $500 million,137000,171000
4,Data Scientist,Affinity Solutions / Marketing Cloud Seeks Sma...,2.9,Affinity Solutions,"New York, NY",51-200,Company - Private,Advertising & Marketing,Business Services,,137000,171000


In [76]:
# Convert dataframe to csv file.
df.to_csv('Cleaned_DS_data.csv', index=False)

In [77]:
#!pip install files

In [78]:
# Download the CSV file
from google.colab import files

# Download the CSV file
files.download('Cleaned_DS_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>