In [50]:
import pandas as pd

# Location of the preprocessed dataset, relative to the notebook's working directory
file_path = 'Preprocessed_final_dataset.csv'

# Read the CSV into `df` (later cells start from `df.copy()`), then preview the first rows
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

Unnamed: 0,Age,Gender,Country,Education,Job,Years_of_programming,Years_of_machine_learning,Company_size,Yearly_compensation,Language,IDE,Visualization_tool,ML_Framework,ML_algorithm,Related_activities
0,32,Man,United States of America,Master’s degree,Data Engineer,7,1,10000,112500,"Python, R, SQL","Visual Studio, PyCharm , Sublime Text","Matplotlib , Seaborn , Ggplot / ggplot2 , ...","Scikit-learn , TensorFlow , Keras , PyTo...","Linear or Logistic Regression, Convolutional N...",Analyze and understand data to influence produ...
1,37,Man,Other,Bachelor’s degree,Software Engineer,15,0,5499,17500,"Java, Javascript, Bash","Visual Studio Code (VSCode), Notepad++ , ...",D3 js,Other,Other,None of these activities are an important part...
2,32,Man,United States of America,Master’s degree,Data Scientist,7,3,624,137500,"Python, SQL, Bash",PyCharm,"Matplotlib , Seaborn , Plotly / Plotly Expr...","Scikit-learn , TensorFlow , Keras , Xgbo...","Linear or Logistic Regression, Decision Trees ...",Analyze and understand data to influence produ...
3,37,Man,Other,Doctoral degree,Data Scientist,7,2,5499,75000,"Python, SQL, Bash","Jupyter, PyCharm , Sublime Text , Vim / ...","Matplotlib , Seaborn , Altair , Bokeh","Scikit-learn , TensorFlow , Keras , PyTo...","Gradient Boosting Machines , Convolutional Neu...","MachineLearningEngineer, MachineLearningEngine..."
4,37,Man,United States of America,Doctoral degree,Academic/Research Role,1,1,24,35000,R,RStudio,Ggplot / ggplot2,Tidymodels,Linear or Logistic Regression,Analyze and understand data to influence produ...


In [43]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Baseline experiment: one-hot encode the multi-answer columns, min-max scale the
# numeric columns, then fit and evaluate a RandomForestClassifier on an 80/20 split.
data = df.copy()

# Columns whose cells hold one or more answers separated by ', ', and purely numeric columns
categorical_features = ['Gender', 'Country', 'Education', 'Language', 'IDE', 'Visualization_tool', 'ML_Framework', 'ML_algorithm', 'Related_activities']
numerical_features = ['Age', 'Years_of_programming', 'Years_of_machine_learning', 'Company_size', 'Yearly_compensation']

# One-Hot Encoding for Multi-Answer Categorical Features
mlb = MultiLabelBinarizer()
encoded_blocks = []
for feature in categorical_features:
    # Split each cell into its individual answers and strip stray whitespace
    answers = data[feature].str.split(', ').apply(lambda items: [item.strip() for item in items])
    # Prefix every indicator with its source feature: values such as 'Other' occur in
    # several features and would otherwise produce duplicate column labels.
    encoded_blocks.append(
        pd.DataFrame(
            mlb.fit_transform(answers),
            columns=[f"{feature}_{class_}" for class_ in mlb.classes_],
            index=data.index,  # keep rows aligned with `data` whatever its index is
        )
    )

# Swap the raw text columns for their indicator columns in a single concat
data = pd.concat([data.drop(columns=categorical_features), *encoded_blocks], axis=1)

# Normalize Numerical Features
# NOTE(review): the scaler is fitted on all rows before the split; min-max scaling is
# monotonic, so this should not matter for a tree ensemble, but fit on the training
# rows only if a scale-sensitive model is swapped in.
scaler = MinMaxScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Splitting the Dataset: 'Job' is the target, every other column is a feature
X = data.drop(columns=['Job'])
y = data['Job']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model Evaluation on the held-out 20%
y_pred = model.predict(X_test)
evaluation_report = classification_report(y_test, y_pred)
print(evaluation_report)

                         precision    recall  f1-score   support

 Academic/Research Role       0.57      0.54      0.55       719
       Business Analyst       0.24      0.01      0.03       285
          Data Engineer       0.36      0.09      0.14       300
         Data Scientist       0.46      0.73      0.57      1541
            DataAnalyst       0.42      0.46      0.44       909
MachineLearningEngineer       0.45      0.20      0.28       524
  Other Technical Roles       0.14      0.01      0.02        83
 Product/ProjectManager       0.34      0.09      0.14       414
      Software Engineer       0.53      0.68      0.60       894

               accuracy                           0.48      5669
              macro avg       0.39      0.31      0.31      5669
           weighted avg       0.45      0.48      0.43      5669



In [44]:
# with resample
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample

# Experiment: same preprocessing as the baseline, a hand-picked set of indicator
# columns removed, training classes balanced by random oversampling, XGBoost classifier.
data = df.copy()
# To run without Country, remove 'Country' from categorical_features and drop the raw
# column first: data = data.drop(columns=['Country'])

# Columns whose cells hold one or more answers separated by ', ', and purely numeric columns
categorical_features = ['Gender', 'Country', 'Education', 'Language', 'IDE', 'Visualization_tool', 'ML_Framework', 'ML_algorithm', 'Related_activities']
numerical_features = ['Age', 'Years_of_programming', 'Years_of_machine_learning', 'Company_size', 'Yearly_compensation']

# One-Hot Encoding for Multi-Answer Categorical Features
mlb = MultiLabelBinarizer()
encoded_blocks = []
for feature in categorical_features:
    # Split each cell into its individual answers and strip stray whitespace
    answers = data[feature].str.split(', ').apply(lambda items: [item.strip() for item in items])
    # One indicator column per distinct answer, named "<feature>_<answer>"
    encoded_blocks.append(
        pd.DataFrame(
            mlb.fit_transform(answers),
            columns=[f"{feature}_{class_}" for class_ in mlb.classes_],
            index=data.index,  # keep rows aligned with `data` whatever its index is
        )
    )

# Swap the raw text columns for their indicator columns in a single concat
data = pd.concat([data.drop(columns=categorical_features), *encoded_blocks], axis=1)

# Normalize Numerical Features
scaler = MinMaxScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Encode Target Variable as integer class ids (label_encoder.classes_ maps ids back to names)
label_encoder = LabelEncoder()
data['Job'] = label_encoder.fit_transform(data['Job'])

# Discard any column that is still of object dtype, then snapshot the full encoded table
data = data.drop(columns=data.select_dtypes(include=['object']).columns)
data_processed = data.copy()

# Indicator columns excluded from the feature set.
# NOTE(review): the selection criterion is not shown in this notebook. The last entry
# appears to be a fragment created by splitting a comma-containing answer on ', '.
columns_to_drop = ['Education_I prefer not to answer',
 'Language_C#',
 'Language_Go',
 'Language_Julia',
 'Language_PHP',
 'Language_Swift',
 'IDE_IntelliJ',
 'Visualization_tool_Altair',
 'Visualization_tool_Dygraphs',
 'Visualization_tool_Highcharter',
 'Visualization_tool_Pygal',
 'ML_Framework_H2O 3',
 'ML_Framework_JAX',
 'ML_Framework_MXNet',
 'ML_algorithm_Autoencoder Networks',
 'ML_algorithm_Graph Neural Networks','Related_activities_and operationalizing data']
data = data.drop(columns=columns_to_drop)

# Fold "Visual Studio Code" into "Visual Studio": the merged flag is 1 if either was selected
data['IDE_Visual Studio'] = data[['IDE_Visual Studio', 'IDE_Visual Studio Code (VSCode)']].max(axis=1)
data = data.drop(columns=['IDE_Visual Studio Code (VSCode)'])

# Splitting the Dataset: 'Job' is the target, every other column is a feature
X = data.drop(columns=['Job'])
y = data['Job']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every class is oversampled (with replacement) to the size of the largest training class
sample_size = y_train.value_counts().max()

# Resample features and labels together so rows, labels and index stay aligned and the
# label dtype stays integer (an empty dtype-less pd.Series() seed triggers a pandas warning).
X_parts, y_parts = [], []
for job_class in y_train.unique():
    class_mask = y_train == job_class
    X_sample, y_sample = resample(
        X_train[class_mask],
        y_train[class_mask],
        replace=True,
        n_samples=sample_size,
        random_state=42,
    )
    X_parts.append(X_sample)
    y_parts.append(y_sample)

# Build the balanced training set with one concat per object instead of growing it in the loop
X_train_balanced = pd.concat(X_parts)
y_train_balanced = pd.concat(y_parts)

# Train the model on the balanced training dataset
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_balanced, y_train_balanced)

# Evaluate on the untouched (imbalanced) test split
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))

  y_train_balanced = pd.Series()


                         precision    recall  f1-score   support

 Academic/Research Role       0.51      0.59      0.55       719
       Business Analyst       0.18      0.23      0.21       285
          Data Engineer       0.21      0.24      0.22       300
         Data Scientist       0.57      0.41      0.48      1541
            DataAnalyst       0.41      0.36      0.38       909
MachineLearningEngineer       0.29      0.40      0.34       524
  Other Technical Roles       0.11      0.19      0.14        83
 Product/ProjectManager       0.23      0.27      0.25       414
      Software Engineer       0.57      0.56      0.57       894

               accuracy                           0.42      5669
              macro avg       0.34      0.36      0.35      5669
           weighted avg       0.44      0.42      0.42      5669



In [63]:
# resample (role merging is disabled: the replace() call below is commented out)
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample

# Experiment: random oversampling + XGBoost.
data = df.copy()
# NOTE: role merging is disabled here, so every original Job class is kept.
# Uncomment the next line to merge the three business/management roles into one class.
# data['Job'] = data['Job'].replace(['Business Analyst', 'Other Technical Roles', 'Product/ProjectManager'], 'Business and Project Management Roles')

# Columns whose cells hold one or more answers separated by ', ', and purely numeric columns.
# To run without Country, remove 'Country' from categorical_features and drop the raw column.
categorical_features = ['Gender', 'Country', 'Education', 'Language', 'IDE', 'Visualization_tool', 'ML_Framework', 'ML_algorithm', 'Related_activities']
numerical_features = ['Age', 'Years_of_programming', 'Years_of_machine_learning', 'Company_size', 'Yearly_compensation']

# One-Hot Encoding for Multi-Answer Categorical Features
mlb = MultiLabelBinarizer()
encoded_blocks = []
for feature in categorical_features:
    # Split each cell into its individual answers and strip stray whitespace
    answers = data[feature].str.split(', ').apply(lambda items: [item.strip() for item in items])
    # One indicator column per distinct answer, named "<feature>_<answer>"
    encoded_blocks.append(
        pd.DataFrame(
            mlb.fit_transform(answers),
            columns=[f"{feature}_{class_}" for class_ in mlb.classes_],
            index=data.index,  # keep rows aligned with `data` whatever its index is
        )
    )

# Swap the raw text columns for their indicator columns in a single concat
data = pd.concat([data.drop(columns=categorical_features), *encoded_blocks], axis=1)

# Normalize Numerical Features
scaler = MinMaxScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Encode Target Variable as integer class ids (label_encoder.classes_ maps ids back to names)
label_encoder = LabelEncoder()
data['Job'] = label_encoder.fit_transform(data['Job'])

# Discard any column that is still of object dtype, then snapshot the full encoded table
data = data.drop(columns=data.select_dtypes(include=['object']).columns)
data_processed = data.copy()

# Indicator columns excluded from the feature set.
# NOTE(review): the selection criterion is not shown in this notebook. The last entry
# appears to be a fragment created by splitting a comma-containing answer on ', '.
columns_to_drop = ['Education_I prefer not to answer',
 'Language_C#',
 'Language_Go',
 'Language_Julia',
 'Language_PHP',
 'Language_Swift',
 'IDE_IntelliJ',
 'Visualization_tool_Altair',
 'Visualization_tool_Dygraphs',
 'Visualization_tool_Highcharter',
 'Visualization_tool_Pygal',
 'ML_Framework_H2O 3',
 'ML_Framework_JAX',
 'ML_Framework_MXNet',
 'ML_algorithm_Autoencoder Networks',
 'ML_algorithm_Graph Neural Networks','Related_activities_and operationalizing data']
data = data.drop(columns=columns_to_drop)

# Fold "Visual Studio Code" into "Visual Studio": the merged flag is 1 if either was selected
data['IDE_Visual Studio'] = data[['IDE_Visual Studio', 'IDE_Visual Studio Code (VSCode)']].max(axis=1)
data = data.drop(columns=['IDE_Visual Studio Code (VSCode)'])

# Splitting the Dataset: 'Job' is the target, every other column is a feature
X = data.drop(columns=['Job'])
y = data['Job']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every class is oversampled (with replacement) to the size of the largest training class
sample_size = y_train.value_counts().max()

# Resample features and labels together so rows, labels and index stay aligned and the
# label dtype stays integer (an empty dtype-less pd.Series() seed triggers a pandas warning).
X_parts, y_parts = [], []
for job_class in y_train.unique():
    class_mask = y_train == job_class
    X_sample, y_sample = resample(
        X_train[class_mask],
        y_train[class_mask],
        replace=True,
        n_samples=sample_size,
        random_state=42,
    )
    X_parts.append(X_sample)
    y_parts.append(y_sample)

# Build the balanced training set with one concat per object instead of growing it in the loop
X_train_balanced = pd.concat(X_parts)
y_train_balanced = pd.concat(y_parts)

# Train the model on the balanced training dataset
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_balanced, y_train_balanced)

# Evaluate on the untouched (imbalanced) test split
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))

  y_train_balanced = pd.Series()


                         precision    recall  f1-score   support

 Academic/Research Role       0.51      0.59      0.55       719
       Business Analyst       0.18      0.23      0.21       285
          Data Engineer       0.21      0.24      0.22       300
         Data Scientist       0.57      0.41      0.48      1541
            DataAnalyst       0.41      0.36      0.38       909
MachineLearningEngineer       0.29      0.40      0.34       524
  Other Technical Roles       0.11      0.19      0.14        83
 Product/ProjectManager       0.23      0.27      0.25       414
      Software Engineer       0.57      0.56      0.57       894

               accuracy                           0.42      5669
              macro avg       0.34      0.36      0.35      5669
           weighted avg       0.44      0.42      0.42      5669



In [17]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE


# Experiment: three business/management roles merged into one class, rarely-set indicator
# columns removed, training classes balanced with SMOTE, XGBoost classifier.
data = df.copy()
# Merge the three listed roles into a single target class
data['Job'] = data['Job'].replace(['Business Analyst', 'Other Technical Roles', 'Product/ProjectManager'], 'Business and Project Management Roles')
# To run without Country, remove 'Country' from categorical_features and drop the raw
# column first: data = data.drop(columns=['Country'])

# Columns whose cells hold one or more answers separated by ', ', and purely numeric columns
categorical_features = ['Gender', 'Country', 'Education', 'Language', 'IDE', 'Visualization_tool', 'ML_Framework', 'ML_algorithm', 'Related_activities']
numerical_features = ['Age', 'Years_of_programming', 'Years_of_machine_learning', 'Company_size', 'Yearly_compensation']

# One-Hot Encoding for Multi-Answer Categorical Features
mlb = MultiLabelBinarizer()
encoded_blocks = []
for feature in categorical_features:
    # Split each cell into its individual answers and strip stray whitespace
    answers = data[feature].str.split(', ').apply(lambda items: [item.strip() for item in items])
    # One indicator column per distinct answer, named "<feature>_<answer>"
    encoded_blocks.append(
        pd.DataFrame(
            mlb.fit_transform(answers),
            columns=[f"{feature}_{class_}" for class_ in mlb.classes_],
            index=data.index,  # keep rows aligned with `data` whatever its index is
        )
    )

# Swap the raw text columns for their indicator columns in a single concat
data = pd.concat([data.drop(columns=categorical_features), *encoded_blocks], axis=1)

# Normalize Numerical Features
scaler = MinMaxScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Encode Target Variable as integer class ids (label_encoder.classes_ maps ids back to names)
label_encoder = LabelEncoder()
data['Job'] = label_encoder.fit_transform(data['Job'])

# Discard any column that is still of object dtype, then snapshot the full encoded table
data = data.drop(columns=data.select_dtypes(include=['object']).columns)
data_processed = data.copy()

# Indicator columns excluded from the feature set.
# NOTE(review): the selection criterion is not shown in this notebook. The last entry
# appears to be a fragment created by splitting a comma-containing answer on ', '.
columns_to_drop = ['Education_I prefer not to answer',
 'Language_C#',
 'Language_Go',
 'Language_Julia',
 'Language_PHP',
 'Language_Swift',
 'IDE_IntelliJ',
 'Visualization_tool_Altair',
 'Visualization_tool_Dygraphs',
 'Visualization_tool_Highcharter',
 'Visualization_tool_Pygal',
 'ML_Framework_H2O 3',
 'ML_Framework_JAX',
 'ML_Framework_MXNet',
 'ML_algorithm_Autoencoder Networks',
 'ML_algorithm_Graph Neural Networks','Related_activities_and operationalizing data']
data = data.drop(columns=columns_to_drop)

# Fold "Visual Studio Code" into "Visual Studio": the merged flag is 1 if either was selected
data['IDE_Visual Studio'] = data[['IDE_Visual Studio', 'IDE_Visual Studio Code (VSCode)']].max(axis=1)
data = data.drop(columns=['IDE_Visual Studio Code (VSCode)'])

# Drop indicator columns that are set (== 1) in fewer than 2000 rows.
# The target and the scaled numeric columns are exempt from this count-based filter.
min_rows_set = 2000
protected_columns = set(numerical_features) | {'Job'}
ones_per_column = (data == 1).sum()
rare_columns = [
    col for col, n_ones in ones_per_column.items()
    if n_ones < min_rows_set and col not in protected_columns
]
data = data.drop(columns=rare_columns)

# Splitting the Dataset: 'Job' is the target, every other column is a feature
X = data.drop(columns=['Job'])
y = data['Job']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training split only; seeded so the synthetic rows (and therefore
# the report below) are reproducible, matching the seed used by the split.
# NOTE(review): SMOTE interpolates between neighbours, so synthetic rows can carry
# non-binary values in the indicator columns; SMOTENC may be a better fit.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Model Training using XGBoost with SMOTE-applied data
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_smote, y_train_smote)

# Model Evaluation on the untouched (imbalanced) test split
y_pred_xgb = xgb_model.predict(X_test)
xgb_evaluation_report = classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_)
print(xgb_evaluation_report)

                                       precision    recall  f1-score   support

               Academic/Research Role       0.55      0.54      0.55       719
Business and Project Management Roles       0.38      0.32      0.35       782
                        Data Engineer       0.23      0.15      0.18       300
                       Data Scientist       0.50      0.60      0.55      1541
                          DataAnalyst       0.43      0.40      0.41       909
              MachineLearningEngineer       0.34      0.29      0.31       524
                    Software Engineer       0.56      0.62      0.59       894

                             accuracy                           0.47      5669
                            macro avg       0.43      0.42      0.42      5669
                         weighted avg       0.46      0.47      0.46      5669



In [14]:
# For every column of the current `data` frame, count the rows whose value is exactly 1
counts_of_ones = dict(
    (col, (data[col] == 1).sum())
    for col in data.columns
)
counts_of_ones

{'Age': 151,
 'Job': 1343,
 'Years_of_programming': 3317,
 'Years_of_machine_learning': 294,
 'Company_size': 5997,
 'Yearly_compensation': 67,
 'Gender_Man': 23504,
 'Gender_Woman': 4369,
 'Country_India': 6561,
 'Country_Other': 12326,
 'Country_United States of America': 3851,
 'Education_Bachelor’s degree': 8330,
 'Education_Doctoral degree': 5061,
 'Education_Master’s degree': 12762,
 'Language_Bash': 3632,
 'Language_C': 3870,
 'Language_C++': 4839,
 'Language_Java': 4835,
 'Language_Javascript': 5094,
 'Language_MATLAB': 3036,
 'Language_Other': 3617,
 'Language_Python': 24663,
 'Language_R': 7094,
 'Language_SQL': 14611,
 'IDE_Jupyter': 19969,
 'IDE_MATLAB': 2235,
 'IDE_Notepad++': 5783,
 'IDE_Other': 2360,
 'IDE_PyCharm': 8338,
 'IDE_RStudio': 6233,
 'IDE_Spyder': 4600,
 'IDE_Sublime Text': 3323,
 'IDE_Vim / Emacs': 2780,
 'IDE_Visual Studio': 13755,
 'Visualization_tool_Ggplot / ggplot2': 7240,
 'Visualization_tool_Matplotlib': 20831,
 'Visualization_tool_Other': 4312,
 'Visu

In [23]:
# Print the column names, non-null counts and dtypes of the current `data` frame
# (`data` is reassigned by each experiment cell, so this reflects whichever ran last)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28343 entries, 0 to 28342
Data columns (total 85 columns):
 #   Column                                                                                         Non-Null Count  Dtype  
---  ------                                                                                         --------------  -----  
 0   Age                                                                                            28343 non-null  float64
 1   Job                                                                                            28343 non-null  int32  
 2   Years_of_programming                                                                           28343 non-null  float64
 3   Years_of_machine_learning                                                                      28343 non-null  float64
 4   Company_size                                                                                   28343 non-null  float64
 5   Yearly_compensatio

In [40]:
# Show how many rows fall into each Job class of the loaded dataset.
# NOTE(review): the saved output lists 16 raw job titles, while the preview of `df` and
# the classification reports use grouped roles such as 'Academic/Research Role' —
# this output appears to predate the current `df`; re-run the cell to refresh it.
df['Job'].value_counts()

Data Scientist                  7207
Software Engineer               4539
DataAnalyst                     4273
Research Scientist              2813
MachineLearningEngineer         2682
Product/ProjectManager          2144
Business Analyst                1343
Data Engineer                   1229
Teacher / professor              678
Statistician                     580
Engineer (non-software)          337
DBA/Database Engineer            251
Developer Relations/Advocacy      81
Data Architect                    81
Data Administrator                53
Developer Advocate                52
Name: Job, dtype: int64

In [54]:
# Preview the first rows of the current encoded/scaled `data` frame
data.head()

Unnamed: 0,Age,Job,Years_of_programming,Years_of_machine_learning,Company_size,Yearly_compensation,Gender_Man,Gender_Other,Gender_Woman,Country_Brazil,...,ML_algorithm_Other,ML_algorithm_Recurrent Neural Networks,ML_algorithm_Transformer Networks,Related_activities_Analyze and understand data to influence product or business decisions,Related_activities_Build and/or run the data infrastructure that my business uses for storing,Related_activities_Experimentation and iteration to improve existing ML models,Related_activities_MachineLearningEngineer,Related_activities_None of these activities are an important part of my role at work,Related_activities_Other,Related_activities_analyzing
0,0.254902,2,0.35,0.05,1.0,0.112056,1,0,0,0,...,0,0,1,1,0,0,1,0,0,0
1,0.352941,6,0.75,0.0,0.548817,0.017009,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0.254902,3,0.35,0.15,0.060144,0.137069,1,0,0,0,...,0,0,0,1,0,1,1,0,0,0
3,0.352941,3,0.35,0.1,0.548817,0.074537,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0.352941,0,0.05,0.05,0.0,0.034517,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
