### Mount Google Drive

In [9]:
# Colab-only: mount Google Drive at /content/drive so the cells below can
# read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### List Files in Google Drive

In [10]:
# List the contents of the CleanDatasets folder on the mounted Drive
!ls '/content/drive/MyDrive/CleanDatasets/'

'Copy of cleaned_BusinessAnalyst.csv'  'Copy of cleaned_dataEngineer.csv'
'Copy of cleaned_DataAnalyst.csv'      'Copy of cleaned_dataScientist.csv'


# Loading the datasets

In [11]:
import pandas as pd

# Folder on the mounted Drive that holds the input CSVs
# (the trailing slash matters: filenames are appended by plain concatenation)
file_path = '/content/drive/MyDrive/CleanDatasets/'

# Dataset label -> CSV filename inside `file_path`
files = dict(
    business_analyst="Copy of cleaned_BusinessAnalyst.csv",
    data_engineer="Copy of cleaned_dataEngineer.csv",
    data_analyst="Copy of cleaned_DataAnalyst.csv",
    data_scientist="Copy of cleaned_dataScientist.csv",
)

# Read every CSV once; `dfs` keeps the same key order as `files`
dfs = {}
for label, csv_name in files.items():
    dfs[label] = pd.read_csv(file_path + csv_name)

# Report the size of each loaded frame
print("DataFrames created:")
for name, df in dfs.items():
    n_rows, n_cols = df.shape
    print(f"{name}_df → {n_rows} rows × {n_cols} columns")


DataFrames created:
business_analyst_df → 4092 rows × 7 columns
data_engineer_df → 2528 rows × 7 columns
data_analyst_df → 5629 rows × 7 columns
data_scientist_df → 3909 rows × 7 columns


# Merge all DataFrames into a single one

In [12]:
# Stack the per-dataset frames row-wise. Each dict key is attached as an
# outer index level named "source", so every row remembers its origin.
stacked = pd.concat(
    list(dfs.values()),
    keys=list(dfs.keys()),
    names=["source"],
)

# Promote the "source" level to a regular column, then discard the leftover
# per-file row numbers in favour of a fresh 0..n-1 index.
merged_df = stacked.reset_index(level="source").reset_index(drop=True)

## Display info

In [13]:
# Summarise the merged frame's dimensions
n_rows, n_cols = merged_df.shape
print("✅ All DataFrames merged successfully!")
print(f"Final merged_df shape: {n_rows} rows × {n_cols} columns")

# Rich preview of the top of the merged table
preview = merged_df.head(50)
display(preview)

✅ All DataFrames merged successfully!
Final merged_df shape: 16158 rows × 8 columns


Unnamed: 0,source,Job Title,Job Description,Company Name,Location,Industry,Sector,Rating
0,business_analyst,Business Analyst - Clinical & Logistics Platform,Company Overview At Memorial Sloan Kettering (...,Memorial Sloan-Kettering,"New York, NY",Health Care Services & Hospitals,Health Care,3.9
1,business_analyst,Business Analyst,We are seeking for an energetic and collaborat...,Paine Schwartz Partners,"New York, NY",Venture Capital & Private Equity,Finance,3.8
2,business_analyst,Data Analyst,"For more than a decade, Asembia has been worki...",Asembia,"Florham Park, NJ",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,3.6
3,business_analyst,"Information Security Analyst, Incident Response",Job Description Summary The Information Securi...,BD,"Franklin Lakes, NJ",Health Care Products Manufacturing,Manufacturing,3.6
4,business_analyst,Analyst - FP&A Global Revenue,Magnite is the world's largest independent sel...,Rubicon Project,"New York, NY",Internet,Information Technology,3.4
5,business_analyst,Data Analyst,Sapphire Digital seeks a dynamic and driven mi...,Sapphire Digital,"Lyndhurst, NJ",Internet,Information Technology,3.4
6,business_analyst,Investment Analyst - Graduate,About Swiss Re The Swiss Re Group is one of th...,Swiss Re,"New York, NY",Insurance Agencies & Brokerages,Insurance,3.8
7,business_analyst,IT Business Process Analysis,Bristol-Myers Squibb is a global Biopharma com...,Bristol-Myers Squibb,"Jersey City, NJ",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,3.8
8,business_analyst,Tolling Business Analyst,"At Gannett Fleming, we believe in improving th...",Gannett Fleming,"New York, NY",Architectural & Engineering Services,Business Services,4.2
9,business_analyst,Business Analyst - Risk,You'll work in our Risk practice in New York. ...,McKinsey,"New York, NY",Consulting,Business Services,4.4


# Data Cleaning

## Removing rows where Industry or Sector is '-1'

In [14]:
# Keep a row only when neither Industry nor Sector holds the string '-1'
industry_ok = merged_df['Industry'].ne('-1')
sector_ok = merged_df['Sector'].ne('-1')
merged_df = merged_df.loc[industry_ok & sector_ok]

n_rows, n_cols = merged_df.shape
print("✅ Rows with Industry or Sector as '-1' removed successfully!")
print(f"New merged_df shape: {n_rows} rows × {n_cols} columns")

✅ Rows with Industry or Sector as '-1' removed successfully!
New merged_df shape: 13617 rows × 8 columns


In [15]:
# Preview the first rows of merged_df again, now that the '-1' rows are gone
merged_df.head(50)

Unnamed: 0,source,Job Title,Job Description,Company Name,Location,Industry,Sector,Rating
0,business_analyst,Business Analyst - Clinical & Logistics Platform,Company Overview At Memorial Sloan Kettering (...,Memorial Sloan-Kettering,"New York, NY",Health Care Services & Hospitals,Health Care,3.9
1,business_analyst,Business Analyst,We are seeking for an energetic and collaborat...,Paine Schwartz Partners,"New York, NY",Venture Capital & Private Equity,Finance,3.8
2,business_analyst,Data Analyst,"For more than a decade, Asembia has been worki...",Asembia,"Florham Park, NJ",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,3.6
3,business_analyst,"Information Security Analyst, Incident Response",Job Description Summary The Information Securi...,BD,"Franklin Lakes, NJ",Health Care Products Manufacturing,Manufacturing,3.6
4,business_analyst,Analyst - FP&A Global Revenue,Magnite is the world's largest independent sel...,Rubicon Project,"New York, NY",Internet,Information Technology,3.4
5,business_analyst,Data Analyst,Sapphire Digital seeks a dynamic and driven mi...,Sapphire Digital,"Lyndhurst, NJ",Internet,Information Technology,3.4
6,business_analyst,Investment Analyst - Graduate,About Swiss Re The Swiss Re Group is one of th...,Swiss Re,"New York, NY",Insurance Agencies & Brokerages,Insurance,3.8
7,business_analyst,IT Business Process Analysis,Bristol-Myers Squibb is a global Biopharma com...,Bristol-Myers Squibb,"Jersey City, NJ",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,3.8
8,business_analyst,Tolling Business Analyst,"At Gannett Fleming, we believe in improving th...",Gannett Fleming,"New York, NY",Architectural & Engineering Services,Business Services,4.2
9,business_analyst,Business Analyst - Risk,You'll work in our Risk practice in New York. ...,McKinsey,"New York, NY",Consulting,Business Services,4.4


In [16]:
# Save the merged dataframe to a new CSV file in Google Drive.
# The target sits in the same CleanDatasets folder the input CSVs were read from.
output_path = '/content/drive/MyDrive/CleanDatasets/merged_data.csv'
# index=False: write only the data columns, not the DataFrame's row index
merged_df.to_csv(output_path, index=False)
print(f"✅ Merged dataset saved to: {output_path}")

✅ Merged dataset saved to: /content/drive/MyDrive/CleanDatasets/merged_data.csv
