In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the dataset
file_path = 'My_Data.csv'
df = pd.read_csv(file_path)

In [3]:
df.describe()

Unnamed: 0,Tasks,AI models,AI_Workload_Ratio
count,4706.0,4706.0,4706.0
mean,400.708032,1817.678071,inf
std,311.564781,1086.853037,
min,1.0,0.0,0.036585
25%,161.0,1085.25,0.137271
50%,270.0,1577.5,0.199281
75%,608.75,2273.0,0.260572
max,1387.0,5666.0,inf


Replace infinite values with NaNs in the numeric columns

In [4]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

In [5]:
df.isnull().sum()

Job titiles          0
AI Impact            0
Tasks                0
AI models            0
AI_Workload_Ratio    7
Domain               0
dtype: int64

Drop rows with NaNs

In [6]:
df.dropna(inplace=True)

In [7]:
df.isnull().sum()

Job titiles          0
AI Impact            0
Tasks                0
AI models            0
AI_Workload_Ratio    0
Domain               0
dtype: int64

Encode categorical columns and changed column name Job titiles to Job titles

In [8]:
label_encoder = LabelEncoder()
df['Job titles'] = label_encoder.fit_transform(df['Job titiles'].astype(str))
df['Domain'] = label_encoder.fit_transform(df['Domain'].astype(str))
df['Tasks'] = df['Tasks'].astype(str)  # Convert to string if not already

Normalize the numerical columns

In [9]:
scaler = StandardScaler()
df[['AI_Workload_Ratio']] = scaler.fit_transform(df[['AI_Workload_Ratio']])


Some data Preprocessing

In [10]:
#Ensure the AI Impact column is numeric (assuming it's continuous)
df['AI Impact'] = pd.to_numeric(df['AI Impact'].str.replace('%', ''), errors='coerce')

In [11]:
# Prepare the input and output
X = df[['Job titles', 'Tasks', 'AI models', 'AI_Workload_Ratio', 'Domain']]
y = df['AI Impact']

In [12]:
# Convert inputs to sequences
X = X.values
y = y.values


In [13]:
 # Pad sequences for equal length input to RNN
X = pad_sequences(X, padding='post')

In [14]:

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Build the RNN model

In [15]:
model = Sequential()
model.add(Embedding(input_dim=len(label_encoder.classes_), output_dim=128, input_length=X.shape[1]))
model.add(SimpleRNN(units=64, activation='relu', return_sequences=True))
model.add(SimpleRNN(units=64, activation='relu'))
model.add(Dense(1, activation='linear'))  # Assuming 'AI Impact' is a continuous value
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae','mse'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 128)            1280      
                                                                 
 simple_rnn (SimpleRNN)      (None, 5, 64)             12352     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 21953 (85.75 KB)
Trainable params: 21953 (85.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Remove accuracy as it is not relevant, as of it is regression task


In [16]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
# When compiling the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Then evaluate
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'Test MAE: {mae}')

Test MAE: 13.381660461425781


In [20]:
# sample prediction hv to use modal.predict

# Make predictions on the test data
predictions = model.predict(X_test)

# Display the predictions for the first 5 samples
for i in range(5):
    print(f"Sample {i+1}:")
    print(f"Predicted value: {predictions[i][0]}")
    print(f"Actual value: {y_test[i]}")
    print()


Sample 1:
Predicted value: 25.05259132385254
Actual value: 15

Sample 2:
Predicted value: 29.277006149291992
Actual value: 40

Sample 3:
Predicted value: 25.820913314819336
Actual value: 60

Sample 4:
Predicted value: 34.32492446899414
Actual value: 45

Sample 5:
Predicted value: 25.05259132385254
Actual value: 10

