###Import Library

In [1]:
!pip install tensorflow_decision_forests

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.0 kB)
Collecting tensorflow==2.18.0 (from tensorflow_decision_forests)
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting ydf (from tensorflow_decision_forests)
  Downloading ydf-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow==2.18.0->tensorflow_decision_forests)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.
Collecting tf-keras~=2.17 (from tensorflow_decision_forests)
  Downloading tf_keras-2.

In [2]:
import pandas as pd
import tensorflow_decision_forests as tfdf
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

###Load Dataset

In [3]:
file_url= 'https://drive.google.com/uc?id=15Mr-H1hx7PhQIbvAMzU1vkwJPaBIahHG'
df = pd.read_csv(file_url)

In [4]:
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


###Data Preprocessing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


Identify columns with NaN

In [6]:
print("Number of NaN values ​​before preprocessing:")
print(df.isnull().sum())

Number of NaN values ​​before preprocessing:
Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64


Filling NaN in numeric column with average

In [7]:
if 'Age' in df.columns:
    df['Age'] = df['Age'].fillna(df['Age'].mean())

# Filling NaN in categorical column with mode (most frequent value)
if 'Gender' in df.columns:
    df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

if 'Education Level' in df.columns:
    df['Education Level'] = df['Education Level'].fillna(df['Education Level'].mode()[0])

if 'Job Title' in df.columns:
    df['Job Title'] = df['Job Title'].fillna(df['Job Title'].mode()[0])

Fill NaN in column with average

In [8]:
feature_columns = ["Years of Experience"]
label_column = "Salary"
df[feature_columns] = df[feature_columns].fillna(df[feature_columns].mean())

Fill NaN in the label column with default value (e.g., 0)

In [9]:
df[label_column] = df[label_column].fillna(0)

print("Number of NaN values ​​after preprocessing:")
print(df.isnull().sum())

Number of NaN values ​​after preprocessing:
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64


In [10]:
df[feature_columns] = df[feature_columns].astype(float)
df[label_column] = df[label_column].astype(float)

Feature engineering

In [11]:
if 'Age' in df.columns and 'Years of Experience' in df.columns:
    df['Years of Experience_per_Age'] = df['Years of Experience'] / (df['Age'] + 1e-5)
    feature_columns.append('Years of Experience_per_Age')

Splitting the data into training and test sets

In [14]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
label_column = 'Salary'

Save data to TFRecord format for efficiency

In [15]:
train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(train_data, label=label_column, task=tfdf.keras.Task.REGRESSION)
test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(test_data, label=label_column, task=tfdf.keras.Task.REGRESSION)









###Creating and training a model

Random Forest Model

In [16]:
model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, num_trees=200, max_depth=15)

Use /tmp/tmp03k4wyxi as temporary training directory


In [17]:
model.fit(train_dataset)

Reading training dataset...
Training dataset read in 0:00:05.623433. Found 300 examples.
Training model...
Model trained in 0:00:00.193244
Compiling model...
Model compiled.


<tf_keras.src.callbacks.History at 0x7cbce942ee30>

Evaluating model performance

In [18]:
print("Evaluation on test data:")
evaluation = model.evaluate(test_dataset)
print(evaluation)

Evaluation on test data:
0.0


In [19]:
predictions = model.predict(test_dataset).flatten()
y_true = test_data[label_column].values
mae = mean_absolute_error(y_true, predictions)
r2 = r2_score(y_true, predictions)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error (MAE): 11535.52390625
R² Score: 0.8591540246964026


In [20]:
print(model.summary())

Model: "random_forest_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (6):
	Age
	Education_Level
	Gender
	Job_Title
	Years_of_Experience
	Years_of_Experience_per_Age

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.                         "Age"  0.349844 ################
    2. "Years_of_Experience_per_Age"  0.311498 ############
    3.         "Years_of_Experience"  0.309904 ############
    4.             "Education_Level"  0.195878 ###
    5.                   "Job_Title"  0.181846 ##
    6.                      "Gender"  0.155793 

Variable Importance: NUM_AS_ROOT:
    1.                         "Age" 84.000000 ################
    2.         "Years