In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras import layers, models

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/MiltonDavila123/CSV/refs/heads/main/ai_job_dataset.csv")
df = df.dropna()

In [6]:
print(df)

        job_id                    job_title  salary_usd salary_currency  \
0      AI00001        AI Research Scientist       90376             USD   
1      AI00002         AI Software Engineer       61895             USD   
2      AI00003                AI Specialist      152626             USD   
3      AI00004                 NLP Engineer       80215             USD   
4      AI00005                AI Consultant       54624             EUR   
...        ...                          ...         ...             ...   
14995  AI14996            Robotics Engineer       38604             USD   
14996  AI14997  Machine Learning Researcher       57811             GBP   
14997  AI14998                 NLP Engineer      189490             USD   
14998  AI14999                   Head of AI       79461             EUR   
14999  AI15000     Computer Vision Engineer       56481             USD   

      experience_level employment_type company_location company_size  \
0                   SE     

In [7]:
X = df[['years_experience', 'remote_ratio','benefits_score','experience_level','employment_type','company_size','education_required','company_location',
        'industry']]

In [8]:
print(X)

       years_experience  remote_ratio  benefits_score experience_level  \
0                     9            50             5.9               SE   
1                     1           100             5.2               EN   
2                     2             0             9.4               MI   
3                     7            50             8.6               SE   
4                     0           100             6.6               EN   
...                 ...           ...             ...              ...   
14995                 1            50             7.9               EN   
14996                 0             0             8.2               EN   
14997                17            50             7.4               EX   
14998                 1             0             5.6               EN   
14999                 2            50             7.6               MI   

      employment_type company_size education_required company_location  \
0                  CT            M   

In [9]:
y = df['salary_usd']

In [10]:
print(y)

0         90376
1         61895
2        152626
3         80215
4         54624
          ...  
14995     38604
14996     57811
14997    189490
14998     79461
14999     56481
Name: salary_usd, Length: 15000, dtype: int64


In [11]:
variables_categoricas = ['experience_level','employment_type','company_size','education_required','company_location',
        'industry']
variable_numericas= ['years_experience', 'remote_ratio','benefits_score']

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), variable_numericas),
        ('cat', OneHotEncoder(handle_unknown='ignore'), variables_categoricas)
    ])

X_processed = preprocessor.fit_transform(X).toarray()

In [13]:
print(X_processed)

[[ 0.49531313  0.01265988 -1.10576909 ...  0.          0.
   0.        ]
 [-0.94727643  1.23780922 -1.58825443 ...  0.          0.
   0.        ]
 [-0.76695274 -1.21248946  1.3066576  ...  0.          0.
   0.        ]
 ...
 [ 1.93790268  0.01265988 -0.07187194 ...  0.          0.
   0.        ]
 [-0.94727643 -1.21248946 -1.31254852 ...  0.          0.
   0.        ]
 [-0.76695274  0.01265988  0.06598102 ...  1.          0.
   0.        ]]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=0)

In [15]:
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

print("Entrenando la red neuronal...")
history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2)

loss, mae = model.evaluate(X_test, y_test)
print(f"\nError Medio Absoluto en el test: ${mae:.2f}")

predicciones = model.predict(X_test[:5])
print("\nComparación de Predicciones:")
for i in range(5):
    print(f"Real: {y_test.values[i]} - Predicho: {predicciones[i][0]:.2f}")

Entrenando la red neuronal...
Epoch 1/200
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 16488583168.0000 - mae: 113533.1328 - val_loss: 15999350784.0000 - val_mae: 111471.5547
Epoch 2/200
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 10092120064.0000 - mae: 80639.8125 - val_loss: 4046910464.0000 - val_mae: 43610.0781
Epoch 3/200
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2260114432.0000 - mae: 33825.1992 - val_loss: 1684868864.0000 - val_mae: 30453.0059
Epoch 4/200
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1442409728.0000 - mae: 27945.4609 - val_loss: 1249035776.0000 - val_mae: 25644.3730
Epoch 5/200
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1121869824.0000 - mae: 23617.7559 - val_loss: 1023470336.0000 - val_mae: 22066.0527
Epoch 6/200
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [19]:
import joblib
joblib.dump(preprocessor, 'preprocesador.joblib')
model.save('modelo_sueldos.keras')