In [14]:
import pandas as pd

# Read the input CSV into a DataFrame
file_path = 'Preprocessed&dropped_dataset.csv'
df = pd.read_csv(file_path)

# Remove four columns: 'Notebook', 'Hardware', 'Course_platform' and 'Media_sources'
columns_to_remove = ['Notebook', 'Hardware', 'Course_platform', 'Media_sources']
df_dropped_columns = df.drop(columns=columns_to_remove)

# Keep only the rows that have a 'Yearly_compensation' value
df_dropped_rows = df_dropped_columns.dropna(subset=['Yearly_compensation'])

# From those, keep only the rows that also have a 'Years_of_machine_learning' value
df_dropped_ml_years = df_dropped_rows.dropna(subset=['Years_of_machine_learning'])


In [15]:
# Fill every remaining missing value with the literal string 'Other'
df_filled = df_dropped_ml_years.fillna('Other')

# Merge some answer options together.
# Each entry is (column, value or values to look for, replacement). The entries
# are applied strictly in this order: the second 'Education' step renames the
# label produced by the first one.
category_merges = [
    # NOTE(review): 'MachineLearningEngineer' inside a years column looks like an
    # artifact of an earlier processing step — confirm that '0' is the right value.
    ('Years_of_machine_learning', 'MachineLearningEngineer', '0'),
    ('Years_of_programming', '< 1 years', '0'),
    ('Gender', ['Nonbinary', 'Prefer not to say', 'Prefer to self-describe'], 'Other'),
    ('Education', 'Some college/university study without earning a bachelor’s degree', 'No formal education past high school'),
    ('Education', 'No formal education past high school', 'high school'),
    ('Education', 'Professional degree', 'Bachelor’s degree'),
    ('Education', 'Professional doctorate', 'Doctoral degree'),
    # Merge some jobs
    ('Job', ['DBA/Database Engineer', 'Data Administrator', 'Data Architect'], 'Data Engineer'),
    ('Job', ['Research Scientist', 'Teacher / professor'], 'Academic/Research Role'),
    ('Job', 'Statistician', 'Data Scientist'),
    ('Job', ['Engineer (non-software)', 'Developer Relations/Advocacy', 'Developer Advocate'], 'Other Technical Roles'),
]
for merge_column, merge_from, merge_to in category_merges:
    df_filled[merge_column] = df_filled[merge_column].replace(merge_from, merge_to)
# TODO: try dropping some jobs

In [16]:
# Take the 10 most frequent values of 'Country'
top_countries = df_filled['Country'].value_counts().nlargest(10).index

# Relabel every value outside that set as 'Other'; values inside it are kept as they are
is_top_country = df_filled['Country'].isin(top_countries)
df_filled['Country'] = df_filled['Country'].where(is_top_country, 'Other')

In [17]:
import re

def convert_range_to_average(compensation_range):
    """Turn a compensation label into a single integer.

    Commas and the symbols ``$``, ``<`` and ``>`` are stripped first.

    * A label containing ``-`` is split on it into two integer bounds and the
      result is ``round((low + high + 1) / 2)``.
    * Any other label has its spaces removed and is parsed as one integer.

    Raises ``ValueError`` when a piece is not an integer, or when splitting on
    ``-`` does not give exactly two pieces.
    """
    cleaned = compensation_range
    for symbol in (",", "$", "<", ">"):
        cleaned = cleaned.replace(symbol, "")

    # Single value such as "> $500,000": nothing to average
    if "-" not in cleaned:
        return int(cleaned.replace(" ", ""))

    low, high = (int(bound) for bound in cleaned.split("-"))
    return round((low + high + 1) / 2)

def extract_and_calculate_middle(s):
    """Pull the digit runs out of a string and reduce them to one integer.

    Commas, ``$``, ``<``, ``>`` and spaces are deleted before searching, so
    ``"10,000"`` is read as the single number 10000.

    * exactly two numbers -> their floor midpoint, ``(first + second) // 2``
    * exactly one number  -> that number
    * anything else       -> ``None``
    """
    stripped = s.translate(str.maketrans("", "", ",$<> "))
    found = re.findall(r'\d+', stripped)

    if len(found) == 1:
        return int(found[0])
    if len(found) != 2:
        return None

    low, high = map(int, found)
    return (low + high) // 2
def remove_parentheses(text):
    """Return ``text`` with every ``( ... )`` group deleted.

    A group runs from ``(`` to the first following ``)``. Whitespace around a
    removed group is left untouched.
    """
    parenthesised_group = re.compile(r'\([^)]*\)')
    return parenthesised_group.sub('', text)


In [18]:
# Run each column through its converter, in this order.
# The converters operate on the string answers, so this cell expects
# df_filled as produced by the preceding cells (not already converted).
column_converters = [
    ('Yearly_compensation', convert_range_to_average),
    ('Years_of_programming', extract_and_calculate_middle),
    ('Years_of_machine_learning', extract_and_calculate_middle),
    ('Company_size', extract_and_calculate_middle),
    ('Age', extract_and_calculate_middle),
    # Strip the "( ... )" groups from the algorithm labels
    ('ML_algorithm', remove_parentheses),
]
for target_column, converter in column_converters:
    df_filled[target_column] = df_filled[target_column].apply(converter)

In [19]:
# Write the cleaned frame to a CSV file; index=False leaves the row index out of the file
output_file_path = 'Preprocessed_final_dataset.csv'
df_filled.to_csv(output_file_path, index=False)

In [217]:
# Preview the first rows of the cleaned frame
df_filled.head()

Unnamed: 0,Age,Gender,Country,Education,Job,Years_of_programming,Years_of_machine_learning,Company_size,Yearly_compensation,Language,IDE,Visualization_tool,ML_Framework,ML_algorithm,Related_activities
0,32,Man,United States of America,Master’s degree,Data Engineer,7,1,10000,112500,"Python, R, SQL","Visual Studio, PyCharm , Sublime Text","Matplotlib , Seaborn , Ggplot / ggplot2 , ...","Scikit-learn , TensorFlow , Keras , PyTo...","Linear or Logistic Regression, Convolutional N...",Analyze and understand data to influence produ...
1,37,Man,Other,Bachelor’s degree,Software Engineer,15,0,5499,17500,"Java, Javascript, Bash","Visual Studio Code (VSCode), Notepad++ , ...",D3 js,Other,Other,None of these activities are an important part...
2,32,Man,United States of America,Master’s degree,Data Scientist,7,3,624,137500,"Python, SQL, Bash",PyCharm,"Matplotlib , Seaborn , Plotly / Plotly Expr...","Scikit-learn , TensorFlow , Keras , Xgbo...","Linear or Logistic Regression, Decision Trees ...",Analyze and understand data to influence produ...
5,37,Man,Other,Doctoral degree,Data Scientist,7,2,5499,75000,"Python, SQL, Bash","Jupyter, PyCharm , Sublime Text , Vim / ...","Matplotlib , Seaborn , Altair , Bokeh","Scikit-learn , TensorFlow , Keras , PyTo...","Gradient Boosting Machines , Convolutional Neu...","MachineLearningEngineer, MachineLearningEngine..."
6,37,Man,United States of America,Doctoral degree,Academic/Research Role,1,1,24,35000,R,RStudio,Ggplot / ggplot2,Tidymodels,Linear or Logistic Regression,Analyze and understand data to influence produ...


In [226]:
# Number of rows per 'Education' label
df_filled['Education'].value_counts()

Master’s degree           12762
Bachelor’s degree          8330
Doctoral degree            5061
high school                1483
I prefer not to answer      707
Name: Education, dtype: int64

In [225]:
# Distinct 'Education' labels present in the frame
df_filled['Education'].unique()

array(['Master’s degree', 'Bachelor’s degree', 'Doctoral degree',
       'high school', 'I prefer not to answer'], dtype=object)

In [220]:
# Number of distinct 'Country' values in the frame
df_filled['Country'].nunique()

10