Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import ast
from collections import Counter

Load dataset

In [None]:
# Input CSV, resolved relative to the current working directory.
jobs_file = "real_dataset_android.csv"
df = pd.read_csv(jobs_file)
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None) 

# Data Exploration

Display basic information about the dataset

In [None]:
df.head()  # Preview first few rows

In [None]:
df.info()  # Overview of dataset structure

In [None]:
df.isna().sum() # Count of missing values per column
# df.isnull().sum()

In [None]:
df['client_location'].unique()

In [None]:
df.work_hours.value_counts()

In [None]:
df.duration.value_counts()

In [None]:
df.proposals.value_counts()

In [None]:
df.experience_level.value_counts()

In [None]:
df.job_type.value_counts()

In [None]:
df.location.value_counts()

# Data Preprocessing

In [None]:
# Report the row count before and after removing rows that have no title
# or that repeat an earlier title (the first occurrence is kept).
records_before = df.shape[0]
print(f'Number of Records: {records_before}')
df = df.dropna(subset='title').drop_duplicates(subset='title', keep='first')
print(f'Number of Records (after dropping NaNs and duplicates): {df.shape[0]}')

Drop unnecessary columns

In [None]:
# Remove columns that are not used in the rest of the preprocessing.
df = df.drop(columns=['location', 'client_industry', 'client_type'])

Drop duplicates and clean missing values

In [None]:
# Columns that must be present for a record to be kept.
required_columns = [
    'min_budget', 'client_location', 'proposals',
    'interviewing', 'invites_sent', 'unanswered_invites',
]

df = (
    df.dropna(subset=["title"])            # rows without a title
      .drop_duplicates(subset=["title"])   # repeated titles (first occurrence kept)
      .dropna(subset=required_columns)     # rows missing any required column
      .reset_index(drop=True)
)

Filter out inconsistent job records

In [None]:
# Rows that have a value in `work_hours` are always kept. A row whose
# `work_hours` is missing is kept only when it is a 'Fixed-price' job whose
# `max_budget` and `duration` are missing as well; every other row with a
# missing `work_hours` is dropped.
df = df[~((df['work_hours'].isna()) & 
          ~(df['max_budget'].isna() & df['duration'].isna() & df['work_hours'].isna() & 
            (df['job_type'] == 'Fixed-price')))]

Drop rows where every column in a group is missing, then drop rows with a missing `interviewing` value

In [None]:
df = (
    df
    # no budget information at all
    .dropna(subset=['min_budget', 'fixed_price'], how='all')
    # no activity information at all
    .dropna(subset=['proposals', 'interviewing', 'invites_sent', 'unanswered_invites'], how='all')
    # `interviewing` must always be present
    .dropna(subset=['interviewing'])
)

Convert budget columns from string to numeric

In [None]:
# Strip the currency symbol and thousands separators (e.g. "$5,000.00" -> "5000.00")
# and cast to float. The pattern is a raw string so that `\$` reaches the regex
# engine as written; in a normal string literal `\$` is an invalid escape
# sequence, which Python reports as a warning (SyntaxWarning from 3.12 on).
for budget_column in ('min_budget', 'max_budget'):
    df[budget_column] = (
        df[budget_column]
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )

Function to calculate average budget

In [None]:
def calculate_average(row):
    """
    Return the representative budget of one job post.

    Args:
        row (pd.Series): A DataFrame row exposing 'min_budget' and 'max_budget'.

    Returns:
        float: The midpoint of min_budget and max_budget when max_budget is
        present; min_budget alone when max_budget is missing.
    """
    upper = row['max_budget']
    lower = row['min_budget']
    # Without an upper bound there is nothing to average: fall back to the minimum.
    return lower if pd.isna(upper) else (lower + upper) / 2


In [None]:
# Apply the function to compute the average budget
df['average_budget'] = df.apply(calculate_average, axis=1)

In [None]:
# On 'Fixed-price' rows, replace a missing `duration` with 'Not Defined' and a
# missing `work_hours` with 'Flexible'; all other values are left untouched.
# Done with a vectorised mask instead of a row-wise `apply`, which is faster
# and also behaves correctly when the frame is empty.
is_fixed_price = df['job_type'] == 'Fixed-price'
df['duration'] = df['duration'].mask(df['duration'].isna() & is_fixed_price, 'Not Defined')
df['work_hours'] = df['work_hours'].mask(df['work_hours'].isna() & is_fixed_price, 'Flexible')

Map categorical values to ordinal codes, and define normalised labels for the work-hours categories

In [None]:
# Short, identifier-friendly labels for the `work_hours` categories.
# NOTE(review): this mapping is not applied anywhere in the visible notebook;
# `work_hours` is one-hot encoded further down using its original labels.
# Confirm whether `df['work_hours'].map(work_hours_map)` was meant to run
# before the encoding step.
work_hours_map = {
    'Less than 30 hrs/week': 'less_than_30',
    'More than 30 hrs/week': 'more_than_30',
    'Flexible': 'flexible'
}

In [None]:
# Ordinal encoding of project duration: a label's position in the list is its
# code (0 = 'Not Defined' ... 4 = '6+ months'). Labels that are not listed
# become NaN after `.map`.
duration_labels = ['Not Defined', '< 1 month', '1-3 months', '3-6 months', '6+ months']
duration_map = {label: code for code, label in enumerate(duration_labels)}

df['duration'] = df['duration'].map(duration_map)

In [None]:
# Ordinal encoding of the proposal-count buckets, numbered from 1 in
# increasing order of volume. Unlisted labels become NaN after `.map`.
proposal_buckets = ['Less than 5', '5 to 10', '10 to 15', '15 to 20', '20 to 50', '50+']
proposal_mapping = {bucket: rank for rank, bucket in enumerate(proposal_buckets, start=1)}
df['proposals'] = df['proposals'].map(proposal_mapping)

In [None]:
# Ordinal encoding of the required experience level (1 = Entry ... 3 = Expert).
experience_levels = ['Entry', 'Intermediate', 'Expert']
experience_mapping = {level: rank for rank, level in enumerate(experience_levels, start=1)}
df['experience_level'] = df['experience_level'].map(experience_mapping)


In [None]:
# Binary encoding of the contract type: 0 = 'Hourly', 1 = 'Fixed-price'.
job_types = ['Hourly', 'Fixed-price']
job_type_mapping = {job_type: code for code, job_type in enumerate(job_types)}
df['job_type'] = df['job_type'].map(job_type_mapping)


In [None]:
# Three-letter country codes that appear in `client_location`, mapped to the
# full country names used by the rest of the column. Values that are not a
# key of this dict (including names that are already spelled out) are left
# unchanged by `.replace`.
country_map = {
    'USA': 'United States', 'GBR': 'United Kingdom', 'CAN': 'Canada', 'TUN': 'Tunisia',
    'ITA': 'Italy', 'FRA': 'France', 'IND': 'India', 'AUS': 'Australia', 'ARE': 'United Arab Emirates',
    'IDN': 'Indonesia', 'SGP': 'Singapore', 'PAK': 'Pakistan', 'PRT': 'Portugal', 'MEX': 'Mexico',
    'GRC': 'Greece', 'BEL': 'Belgium', 'COL': 'Colombia', 'ISR': 'Israel', 'MKD': 'North Macedonia',
    'NGA': 'Nigeria', 'ZAF': 'South Africa', 'MLT': 'Malta', 'LKA': 'Sri Lanka', 'KWT': 'Kuwait',
    'IRL': 'Ireland', 'DEU': 'Germany', 'JOR': 'Jordan', 'MYS': 'Malaysia', 'CHE': 'Switzerland',
    'CYP': 'Cyprus', 'KOR': 'South Korea', 'BRA': 'Brazil', 'SRB': 'Serbia', 'SWE': 'Sweden',
    'PRI': 'Puerto Rico', 'BHR': 'Bahrain', 'TUR': 'Turkey',
    'BOL': 'Bolivia', 'PHL': 'Philippines', 'LUX': 'Luxembourg', 'ARG': 'Argentina',
    # 'NEP' is kept because the original mapping used it; the ISO 3166-1
    # alpha-3 code for Nepal is 'NPL', so both spellings are accepted.
    'NEP': 'Nepal', 'NPL': 'Nepal',
}

df['client_location'] = df['client_location'].replace(country_map)

One-hot encoding for work hours category

In [None]:
# Replace `work_hours` with one integer (0/1) indicator column per category;
# the new columns are named `work_hours_<category label>`.
df = pd.get_dummies(df, columns=['work_hours'], prefix='work_hours', dtype='int')

Function to convert money values (K, M notation) to numerical format

In [None]:
def convert_money(value):
    """
    Converts a monetary value such as "$202", "$1.4K" or "$2M" to a float.

    Besides the plain "K"/"M" notation, the parser tolerates surrounding
    whitespace, thousands separators ("$1,234"), a lower-case suffix ("5k")
    and a trailing plus sign ("$1M+"). Values that are already numeric are
    returned as floats, so applying the function to an already converted
    column is harmless.

    Args:
        value (str | float | int | None): The monetary value to convert.

    Returns:
        float | None: The converted amount, or None when the value is missing.

    Raises:
        ValueError: If a string value cannot be parsed as a number.
    """
    if pd.isna(value):
        return None
    if not isinstance(value, str):
        # Already numeric (e.g. the cell was run twice): nothing to parse.
        return float(value)

    text = value.strip().replace("$", "").replace(",", "").rstrip("+").upper()
    multipliers = {"K": 1_000, "M": 1_000_000}
    suffix = text[-1:]
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)

In [None]:
# Apply money conversion function to the client spending column
df['client_total_spent'] = df['client_total_spent'].apply(convert_money)

In [None]:
# Drop unnecessary budget columns
df.drop(columns=['min_budget', 'max_budget', 'fixed_price'], inplace=True)

Plot histogram for client_total_spent

In [None]:
# Distribution of the raw client spend, drawn in the left half of a 1x2 grid.
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(1, 2, 1)

sns.histplot(df["client_total_spent"], bins=30, kde=True, color='blue', ax=ax)
ax.set_title("Histogram of client_total_spent")

plt.show()

In [None]:
# Natural log of the client spend. Missing values stay NaN; np.log returns
# -inf for a spend of exactly 0 and NaN for negative input.
log_client_total_spent = np.log(df['client_total_spent'])

Plot histogram for log_client_total_spent

In [None]:
# Distribution of the log-transformed client spend, drawn in the left half of a 1x2 grid.
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(1, 2, 1)

sns.histplot(log_client_total_spent, bins=30, kde=True, color='blue', ax=ax)
ax.set_title("Histogram of log_client_total_spent")

plt.show()

In [None]:
# Impute missing log-spend with the mean of the observed log values, then map
# everything back to the original scale.
# Only finite values enter the mean: a spend of 0 has log == -inf, and a single
# -inf would drag the mean to -inf, turning every imputed value into
# exp(-inf) == 0. Rows whose log is -inf are not NaN, so they are not imputed
# and still map back to 0.
finite_log_spent = log_client_total_spent[np.isfinite(log_client_total_spent)]
log_client_total_spent = log_client_total_spent.fillna(finite_log_spent.mean())
df['reversed_log_client_total_spent'] = np.exp(log_client_total_spent)

Plot histogram for reversed_log_client_total_spent

In [None]:
# Distribution of the imputed client spend on the original scale, drawn in the
# left half of a 1x2 grid.
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(1, 2, 1)

sns.histplot(df["reversed_log_client_total_spent"], bins=30, kde=True, color='blue', ax=ax)
ax.set_title("Histogram of reversed_log_client_total_spent")

plt.show()

Trim the dataset towards 600 records by dropping rows that have neither `hires` nor `active`

In [None]:
# Shrink the dataset towards 600 rows by discarding rows that have neither
# `hires` nor `active`, taking them in their current order.
null_rows = df[(df['hires'].isnull()) & (df['active'].isnull())]
# Clamp at 0: when the frame already has fewer than 600 rows the difference is
# negative, and `head(-k)` means "all rows except the last k", which would
# discard almost every null row instead of none.
rows_to_drop = max(df.shape[0] - 600, 0)
# `head` returns at most len(null_rows) rows, so the result can still exceed
# 600 rows when there are not enough null rows to remove.
df = df.drop(null_rows.head(rows_to_drop).index).reset_index(drop=True)

Preprocess Skills Column

In [None]:
# `skills` holds the string form of a Python list (e.g. "['Flutter', 'MySQL']").
# Only string cells are parsed, so the cell can be run again after the column
# already contains lists (ast.literal_eval raises ValueError for a list object).
df['skills'] = df['skills'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Count how often each skill occurs across all job posts.
skill_counts = Counter(skill for skills in df['skills'] for skill in skills)

# Most frequent skills first.
skill_counts_df = (
    pd.DataFrame(skill_counts.items(), columns=['Skill', 'Count'])
    .sort_values(by='Count', ascending=False)
)

skill_counts_df.head()

In [None]:
skill_counts_df.to_csv('skill_count.csv', index=False)

Insert a new column for job category (track)

In [None]:
# Tag every record with its job track as the first column.
# NOTE(review): DataFrame.insert raises ValueError when the column already
# exists, so this cell fails if it is run twice on the same frame.
track_name = "android_developer"
df.insert(0, "track_name", track_name)

Final Check of Dataset

In [None]:
df.head()

In [None]:
df.info(verbose=True)

In [None]:
df.isnull().sum()

Save CSV File

In [None]:
track = 'AD'
df.to_csv(f'preprocessed_{track}.csv', index=False)

In [None]:
df.shape

# Trial of Combining DFs

In [3]:
# Folder holding the per-track CSV exports; change this one line to run the
# notebook on another machine.
# Raw strings keep the Windows backslashes literal: in a normal string `\D`,
# `\A` and `\J` are invalid escape sequences (a SyntaxWarning from Python 3.12
# on). The resulting paths are identical to the original ones.
SCRAPED_DATA_DIR = r"D:\DEP-Project-main"

df1 = pd.read_csv(rf"{SCRAPED_DATA_DIR}\Android_Developer_Scrapped_Data.csv")
df2 = pd.read_csv(rf"{SCRAPED_DATA_DIR}\Artificial_Intelligence_Scrapped_Data.csv")
df3 = pd.read_csv(rf"{SCRAPED_DATA_DIR}\Data_Analyst_Scrapped_Data.csv")
df4 = pd.read_csv(rf"{SCRAPED_DATA_DIR}\Javascript_Developer_Scrapped_Data.csv")

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_name          999 non-null    object 
 1   title               860 non-null    object 
 2   url                 999 non-null    object 
 3   job_type            860 non-null    object 
 4   client_location     860 non-null    object 
 5   description         860 non-null    object 
 6   min_budget          717 non-null    object 
 7   max_budget          290 non-null    object 
 8   fixed_price         860 non-null    object 
 9   experience_level    996 non-null    object 
 10  skills              999 non-null    object 
 11  proposals           858 non-null    object 
 12  interviewing        853 non-null    float64
 13  invites_sent        858 non-null    float64
 14  unanswered_invites  858 non-null    float64
 15  client_total_spent  441 non-null    object 
 16  duration

In [13]:
df1 = df1.drop(columns=['client_industry','client_type'])

In [14]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_name          999 non-null    object 
 1   title               860 non-null    object 
 2   url                 999 non-null    object 
 3   job_type            860 non-null    object 
 4   client_location     860 non-null    object 
 5   description         860 non-null    object 
 6   min_budget          717 non-null    object 
 7   max_budget          290 non-null    object 
 8   fixed_price         860 non-null    object 
 9   experience_level    996 non-null    object 
 10  skills              999 non-null    object 
 11  proposals           858 non-null    object 
 12  interviewing        853 non-null    float64
 13  invites_sent        858 non-null    float64
 14  unanswered_invites  858 non-null    float64
 15  client_total_spent  441 non-null    object 
 16  duration

In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               874 non-null    object 
 1   url                 1000 non-null   object 
 2   job_type            874 non-null    object 
 3   location            874 non-null    object 
 4   description         874 non-null    object 
 5   min_budget          705 non-null    object 
 6   max_budget          381 non-null    object 
 7   fixed_price         705 non-null    object 
 8   experience_level    997 non-null    object 
 9   skills              1000 non-null   object 
 10  duration            558 non-null    object 
 11  work_hours          512 non-null    object 
 12  proposals           869 non-null    object 
 13  interviewing        868 non-null    float64
 14  invites_sent        869 non-null    float64
 15  unanswered_invites  869 non-null    float64
 16  client_

In [16]:
df2 = df2.drop(columns=['location'])

In [18]:
track_name = "artificial_intelligence"
df2.insert(0, "track_name", track_name)

In [20]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_name          1000 non-null   object 
 1   title               874 non-null    object 
 2   url                 1000 non-null   object 
 3   job_type            874 non-null    object 
 4   description         874 non-null    object 
 5   min_budget          705 non-null    object 
 6   max_budget          381 non-null    object 
 7   fixed_price         705 non-null    object 
 8   experience_level    997 non-null    object 
 9   skills              1000 non-null   object 
 10  duration            558 non-null    object 
 11  work_hours          512 non-null    object 
 12  proposals           869 non-null    object 
 13  interviewing        868 non-null    float64
 14  invites_sent        869 non-null    float64
 15  unanswered_invites  869 non-null    float64
 16  client_

In [10]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 657 entries, 0 to 656
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          657 non-null    int64  
 1   track_name          657 non-null    object 
 2   title               657 non-null    object 
 3   url                 657 non-null    object 
 4   job_type            657 non-null    object 
 5   description         657 non-null    object 
 6   experience_level    657 non-null    object 
 7   skills              657 non-null    object 
 8   proposals           657 non-null    object 
 9   interviewing        657 non-null    float64
 10  invites_sent        657 non-null    float64
 11  unanswered_invites  657 non-null    float64
 12  client_total_spent  657 non-null    object 
 13  duration            657 non-null    int64  
 14  client_location     657 non-null    object 
 15  active              508 non-null    float64
 16  hire    

In [11]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               893 non-null    object 
 1   url                 1000 non-null   object 
 2   job_type            893 non-null    object 
 3   location            893 non-null    object 
 4   description         893 non-null    object 
 5   min_budget          745 non-null    object 
 6   max_budget          377 non-null    object 
 7   fixed_price         745 non-null    object 
 8   experience_level    1000 non-null   object 
 9   skills              1000 non-null   object 
 10  proposals           890 non-null    object 
 11  interviewing        890 non-null    float64
 12  invites_sent        890 non-null    float64
 13  unanswered_invites  890 non-null    float64
 14  client_total_spent  570 non-null    object 
 15  duration            531 non-null    object 
 16  work_ho

In [21]:
df4 = df4.drop(columns=['client_industry','client_type'])

In [23]:
# Tag every JavaScript record with its track as the first column. The value is
# taken from `track_name` (as in the artificial_intelligence cell) so the label
# is written in one place only.
track_name = "javascript_developer"
df4.insert(0, "track_name", track_name)

In [26]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_name          1000 non-null   object 
 1   title               893 non-null    object 
 2   url                 1000 non-null   object 
 3   job_type            893 non-null    object 
 4   description         893 non-null    object 
 5   min_budget          745 non-null    object 
 6   max_budget          377 non-null    object 
 7   fixed_price         745 non-null    object 
 8   experience_level    1000 non-null   object 
 9   skills              1000 non-null   object 
 10  proposals           890 non-null    object 
 11  interviewing        890 non-null    float64
 12  invites_sent        890 non-null    float64
 13  unanswered_invites  890 non-null    float64
 14  client_total_spent  570 non-null    object 
 15  duration            531 non-null    object 
 16  work_ho

In [25]:
df4 = df4.drop(columns=['location'])

In [27]:
df_combined = pd.concat([df1, df2,df4], ignore_index=True)

In [28]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_name          2999 non-null   object 
 1   title               2627 non-null   object 
 2   url                 2999 non-null   object 
 3   job_type            2627 non-null   object 
 4   client_location     2625 non-null   object 
 5   description         2627 non-null   object 
 6   min_budget          2167 non-null   object 
 7   max_budget          1048 non-null   object 
 8   fixed_price         2310 non-null   object 
 9   experience_level    2993 non-null   object 
 10  skills              2999 non-null   object 
 11  proposals           2617 non-null   object 
 12  interviewing        2611 non-null   float64
 13  invites_sent        2617 non-null   float64
 14  unanswered_invites  2617 non-null   float64
 15  client_total_spent  1574 non-null   object 
 16  durati

In [29]:
df_combined.head()

Unnamed: 0,track_name,title,url,job_type,client_location,description,min_budget,max_budget,fixed_price,experience_level,skills,proposals,interviewing,invites_sent,unanswered_invites,client_total_spent,duration,work_hours,hires,active
0,android_developer,Deploy Flutter Application with Laravel Backen...,https://www.upwork.com/jobs/Deploy-Flutter-App...,Fixed-price,Morocco,We are looking for an experienced developer to...,$15.00,,['$15.00'],Expert,"['Flutter', 'MySQL', 'Laravel', 'PHP']",,,,,,,,1.0,1.0
1,android_developer,,https://www.upwork.com/jobs/Hybrid-mobile-apps...,,,,,,,Expert,[],,,,,,,,,
2,android_developer,Create simple Web App for taxi business for cu...,https://www.upwork.com/jobs/Create-simple-Web-...,Fixed-price,Australia,Web App Development: Online Taxi Booking Syste...,$50.00,,"['$50.00', '']",Expert,"['Web Application', 'C#', 'JavaScript', 'PHP',...",5 to 10,2.0,0.0,0.0,$1.4K,,,43.0,19.0
3,android_developer,Mobile App Developer,https://www.upwork.com/jobs/Mobile-App-span-cl...,Fixed-price,Switzerland,to build a fully functional mobile application...,"$5,000.00",,"['$5,000.00', '']",Expert,"['Google Play', 'AI Mobile App Development', '...",50+,0.0,0.0,0.0,$6.4K,,,5.0,0.0
4,android_developer,UI Design for Mobile App,https://www.upwork.com/jobs/Design-for-Mobile-...,Fixed-price,India,We are seeking a talented UI designer to creat...,$100.00,,"['$100.00', '']",Intermediate,"['Mobile UI Design', 'Figma', 'Responsive Desi...",5 to 10,0.0,0.0,0.0,,,,,


In [32]:
df_combined.tail()

Unnamed: 0,track_name,title,url,job_type,client_location,description,min_budget,max_budget,fixed_price,experience_level,skills,proposals,interviewing,invites_sent,unanswered_invites,client_total_spent,duration,work_hours,hires,active
2994,javascript_developer,Bubble.io Webapp Developer,https://www.upwork.com/jobs/Bubble-Webapp-span...,Hourly,United States,Need a bubble.io developer to continue build o...,$8.00,$10.00,"['$8.00', '$10.00']",Intermediate,"['CSS', 'Database Design', 'MySQL', 'Web Devel...",15 to 20,0.0,0.0,0.0,$30K,1-3 months,Less than 30 hrs/week,69.0,8.0
2995,javascript_developer,QuickBooks API Integration and Automated Payme...,https://www.upwork.com/jobs/QuickBooks-API-Int...,Fixed-price,PAK,We are looking for a skilled professional to i...,"$5,000.00",,"['$5,000.00']",Expert,"['Web Crawling', 'Automation', 'PHP', 'API', '...",15 to 20,16.0,13.0,4.0,$3.1K,,,42.0,1.0
2996,javascript_developer,Absorb LMS & Prototype Expert Needed for Project,https://www.upwork.com/jobs/Absorb-LMS-Prototy...,Hourly,Australia,We are seeking an experienced professional wit...,$10.00,$20.00,"['$10.00', '$20.00']",Expert,"['Game Development', 'Web Development', 'Web D...",10 to 15,2.0,0.0,0.0,$50K,1-3 months,Less than 30 hrs/week,56.0,16.0
2997,javascript_developer,Laravel & Next.js Developer Needed to Fix UI a...,https://www.upwork.com/jobs/Laravel-amp-Next-s...,Fixed-price,United States,I'm looking for a developer experienced in Lar...,$30.00,,['$30.00'],Intermediate,"['CSS', 'Database Design', 'Web Development', ...",10 to 15,1.0,0.0,0.0,$202,,,7.0,0.0
2998,javascript_developer,Softr.io project,https://www.upwork.com/jobs/Softr-project_~021...,Hourly,Belgium,"Hi,\n\nWe need to make an easy client portal b...",$3.00,$20.00,"['$3.00', '$20.00']",Intermediate,"['softr', 'CSS', 'Database Design', 'Web Devel...",5 to 10,6.0,7.0,1.0,$63K,1-3 months,Less than 30 hrs/week,62.0,5.0


In [33]:
# Save the combined dataset without the RangeIndex, matching how the
# preprocessed per-track file is saved. Writing the index would add an unnamed
# first column that comes back as `Unnamed: 0` when the file is read again.
# NOTE(review): the file name has no `.csv` extension; it is kept unchanged so
# existing consumers of this path keep working. Confirm whether to rename.
df_combined.to_csv("Upwork_Scrapped_Dataset", index=False)