<a href="https://colab.research.google.com/github/SeanMuInCa/learn_python/blob/master/groupassignment2025retry2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Data Preprocessing

import pandas as pd
import numpy as np

# Load the dataset (ensure 'train.csv' is in the working directory)
df = pd.read_csv('train.csv')

# Print the initial shape of the dataset
print("Initial shape of train data:", df.shape)

# Drop any rows with missing values
df = df.dropna()
print("Shape after dropping missing values:", df.shape)

# The target must be strictly positive; fail loudly if the data violates that.
if (df['force_meas'] <= 0).any():
    raise ValueError("There are non-positive values in force_meas!")
print("All force_meas values are positive.")

# Display basic information and the first 5 rows of the dataset.
print("Dataset info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
df.info()
print("First 5 rows:")
print(df.head())


Initial shape of train data: (14994, 35)
Shape after dropping missing values: (14993, 35)
All force_meas values are positive.
Dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 14993 entries, 0 to 14992
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   tappingsteelgrade     14993 non-null  object 
 1   force_meas            14993 non-null  float64
 2   speed                 14993 non-null  float64
 3   entrytemperature      14993 non-null  float64
 4   entrytemperaturebot   14993 non-null  float64
 5   entrytemperaturecore  14993 non-null  float64
 6   entrytemperaturetop   14993 non-null  float64
 7   entrythickness        14993 non-null  float64
 8   entrywidth            14993 non-null  float64
 9   exitthickness         14993 non-null  float64
 10  zeropoint             14993 non-null  float64
 11  radius                14993 non-null  float64
 12  pctal                 14993 non-null 

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the regression target from the predictor columns.
target = df['force_meas']
features = df.drop(columns=['force_meas'])

# One-hot encode the steel-grade column when it is present; the first level
# is dropped so it acts as the implicit baseline category.
categorical_cols = [col for col in ('tappingsteelgrade',) if col in features.columns]
if categorical_cols:
    features = pd.get_dummies(features, columns=categorical_cols, drop_first=True)

print("Features shape after encoding:", features.shape)

# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows here, before the
# train/validation split performed further down in the notebook.
scaler = StandardScaler().fit(features)
X_scaled = scaler.transform(features)

print("Feature scaling completed. Sample of scaled features:")
print(X_scaled[:5])


Features shape after encoding: (14993, 56)
Feature scaling completed. Sample of scaled features:
[[-1.20815045e+00  1.62317851e+00  1.10172575e+00  1.76238659e+00
   9.87948550e-01  1.45623695e+00  3.61472968e-01  1.28464984e+00
  -2.83133583e-01  1.73464961e+00 -8.51661554e-01 -1.95072152e+00
   1.28237237e+00  2.64962246e-01  0.00000000e+00 -4.82318706e-01
  -3.95594767e-01  3.58462510e-01  0.00000000e+00 -2.04614134e+00
  -8.06154473e-01 -8.24254455e-01 -1.19837786e+00 -3.61909645e-01
  -1.46499377e+00  1.43656069e+00  1.66689327e+00 -1.48847140e+00
  -7.13871890e-01 -9.52200096e-01 -1.07879267e+00 -1.65922425e+00
   4.48053053e-01 -4.08684453e-02  3.14958465e+00 -3.00138110e-01
  -6.74989919e-02 -1.63422278e-01 -3.16459922e-02 -1.63359226e-02
  -1.25915578e-01 -1.50482932e-01 -3.93355942e-01 -3.45032780e-01
  -3.10709877e-01 -2.82072849e-01 -2.28342617e-01 -3.46699198e-02
  -1.09612270e-01 -1.63359226e-02 -3.65477450e-02 -1.15504710e-02
  -2.73717255e-01 -1.82647259e-02 -3.97865908

In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Split the encoded but *unscaled* features first, so the scaler never sees
# validation rows. test_size / random_state are unchanged, so the same rows
# end up in each split as before.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    features, df['force_meas'], test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and apply it to both splits.
# `scaler` is rebound on purpose: it is the object pickled later in the
# notebook, and it now matches exactly what the models were trained on.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Define 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)


def run_grid_search(label, estimator, param_grid):
    """Grid-search `estimator` on the training split and report the best result.

    Parameters
    ----------
    label : str
        Human-readable model name used in the printed report.
    estimator : sklearn regressor
        Unfitted estimator to tune.
    param_grid : dict
        Hyperparameter grid passed to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search object (scored by negative MAE on `cv`).
    """
    search = GridSearchCV(estimator, param_grid, cv=cv,
                          scoring='neg_mean_absolute_error', n_jobs=-1)
    search.fit(X_train, y_train)
    print("{} best params:".format(label), search.best_params_)
    print("{} CV MAE: {:.4f}".format(label, -search.best_score_))
    return search


# -------------------------------
# Linear Regression (no hyperparameters to tune)
# -------------------------------
lr = LinearRegression()
lr_cv_scores = cross_val_score(lr, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv)
print("Linear Regression CV MAE: {:.4f}".format(-np.mean(lr_cv_scores)))

# -------------------------------
# Decision Tree Regressor with Grid Search
# -------------------------------
dt = DecisionTreeRegressor(random_state=42)
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
dt_grid = run_grid_search("Decision Tree", dt, dt_param_grid)

# -------------------------------
# Random Forest Regressor with Grid Search
# -------------------------------
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf_grid = run_grid_search("Random Forest", rf, rf_param_grid)

# -------------------------------
# Gradient Boosting Regressor with Grid Search
# -------------------------------
gb = GradientBoostingRegressor(random_state=42)
gb_param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}
gb_grid = run_grid_search("Gradient Boosting", gb, gb_param_grid)

# -------------------------------
# Evaluate each model on the validation set
# -------------------------------
# cross_val_score only fits clones, so the linear model still needs one fit.
# The grid-search winners are already refitted on the full training split
# (GridSearchCV refit=True default), so fitting them again would be wasted work.
lr.fit(X_train, y_train)

models = {
    'Linear Regression': lr,
    'Decision Tree': dt_grid.best_estimator_,
    'Random Forest': rf_grid.best_estimator_,
    'Gradient Boosting': gb_grid.best_estimator_
}

for name, model in models.items():
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"{name} Validation MAE: {mae:.4f}")
    print(f"{name} Validation RMSE: {rmse:.4f}")


Linear Regression CV MAE: 5565531.4078
Decision Tree best params: {'max_depth': 10, 'min_samples_split': 10}
Decision Tree CV MAE: 6599853.9875
Random Forest best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest CV MAE: 5252093.7735
Gradient Boosting best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Gradient Boosting CV MAE: 4989130.7414
Linear Regression Validation MAE: 5567526.7685
Linear Regression Validation RMSE: 7214382.6856
Decision Tree Validation MAE: 6520268.7094
Decision Tree Validation RMSE: 8655189.8080
Random Forest Validation MAE: 5127791.6150
Random Forest Validation RMSE: 6684445.4882
Gradient Boosting Validation MAE: 5006398.8995
Gradient Boosting Validation RMSE: 6425564.8506


In [None]:
import pickle

# Column names of the preprocessed frame that was used for training; saved so
# that later data can be aligned to the same feature layout.
training_columns = features.columns.tolist()

# Everything needed at inference time, keyed by the file it is written to:
# the fitted scaler, the two best tree ensembles found by GridSearchCV, and
# the training column list.
artifacts = {
    'scaler.pkl': scaler,
    'best_rf_model.pkl': rf_grid.best_estimator_,
    'best_gb_model.pkl': gb_grid.best_estimator_,
    'training_columns.pkl': training_columns,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
print("Training columns have been saved successfully.")

Training columns have been saved successfully.


In [None]:
# ===============================
# Step 11: Final Evaluation on Test Data
# ===============================

import pandas as pd
import numpy as np
import pickle
import time
from sklearn.metrics import mean_absolute_error, mean_squared_error


def load_pickle(path):
    """Load one pickled artifact from `path`.

    pickle can execute arbitrary code while loading, so this must only be
    pointed at files written by this notebook's own training cells.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def regression_metrics(y_true, y_pred):
    """Return (MAE, RMSE, MSE) of `y_pred` against `y_true`."""
    mse = mean_squared_error(y_true, y_pred)
    return mean_absolute_error(y_true, y_pred), np.sqrt(mse), mse


def timed_predict(model, X):
    """Predict on `X` and return (predictions, average seconds per sample)."""
    start_time = time.perf_counter()
    predictions = model.predict(X)
    return predictions, (time.perf_counter() - start_time) / len(X)


# Load test data (ensure test.csv is in your working directory)
test_df = pd.read_csv('test.csv')

# Drop missing values if any (you may adjust based on your preprocessing strategy)
test_df = test_df.dropna()

# 'force_pre' is the baseline prediction and must not be used as a feature;
# 'force_meas' is the ground truth needed for evaluation.
baseline_force_pre = test_df['force_pre']
y_test = test_df['force_meas']

# ===============================
# Preprocessing: Feature Selection & Encoding
# ===============================
# Exclude the target 'force_meas' and baseline column 'force_pre'
features_test = test_df.drop(columns=['force_meas', 'force_pre'])

# One-hot encode *every* steel grade present in the test data. Using
# drop_first=True here would drop whichever grade sorts first in the test
# set, which is not necessarily the baseline grade dropped during training;
# rows of that grade would then silently be encoded as the baseline.
features_test = pd.get_dummies(features_test, columns=['tappingsteelgrade'])

# Align with the training layout: the training baseline grade and any grade
# unseen during training are discarded, and grades missing from the test set
# are added as all-zero columns.
training_columns = load_pickle('training_columns.pkl')
features_test = features_test.reindex(columns=training_columns, fill_value=0)

# Scale the test features using the scaler saved during training
scaler = load_pickle('scaler.pkl')
X_test_scaled = scaler.transform(features_test)

# ===============================
# Load the saved models
# ===============================
best_rf_model = load_pickle('best_rf_model.pkl')
best_gb_model = load_pickle('best_gb_model.pkl')

# ===============================
# Evaluate Models on Test Data
# ===============================
rf_predictions, rf_runtime = timed_predict(best_rf_model, X_test_scaled)
rf_mae, rf_rmse, rf_mse = regression_metrics(y_test, rf_predictions)

gb_predictions, gb_runtime = timed_predict(best_gb_model, X_test_scaled)
gb_mae, gb_rmse, gb_mse = regression_metrics(y_test, gb_predictions)

# Print evaluation metrics
print("Final Evaluation Metrics:")
print("Random Forest - MAE: {:.4f}, RMSE: {:.4f}, MSE: {:.4f}, Avg Runtime per sample: {:.6f} sec".format(
    rf_mae, rf_rmse, rf_mse, rf_runtime))
print("Gradient Boosting - MAE: {:.4f}, RMSE: {:.4f}, MSE: {:.4f}, Avg Runtime per sample: {:.6f} sec".format(
    gb_mae, gb_rmse, gb_mse, gb_runtime))

# Compare with the baseline predictions stored in the force_pre column
baseline_mae, baseline_rmse, baseline_mse = regression_metrics(y_test, baseline_force_pre)

print("Baseline (force_pre) - MAE: {:.4f}, RMSE: {:.4f}, MSE: {:.4f}".format(
    baseline_mae, baseline_rmse, baseline_mse))


Final Evaluation Metrics:
Random Forest - MAE: 5055189.8127, RMSE: 6575285.0083, MSE: 43234372940812.3203, Avg Runtime per sample: 0.000040 sec
Gradient Boosting - MAE: 4952478.7807, RMSE: 6351928.0018, MSE: 40346989340435.9062, Avg Runtime per sample: 0.000004 sec
Baseline (force_pre) - MAE: 2898684.1133, RMSE: 3775600.3207, MSE: 14255157781834.8516


In [None]:
# Install Keras Tuner if not already installed
!pip install keras-tuner

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import keras_tuner as kt
import numpy as np

# Define a model-building function for the tuner
def build_model(hp):
    """
    Construct and compile a feed-forward regression network for the tuner.

    The search space covers the number of hidden blocks (1-3), and per block
    the Dense width, dropout rate and L2 penalty, plus the Adam learning rate.
    The input width is read from the global `X_train`, which must be defined
    before the tuner calls this function.
    """
    n_features = X_train.shape[1]

    network = keras.Sequential()
    network.add(keras.Input(shape=(n_features,)))

    # Each hidden block is Dense(relu, L2) -> BatchNormalization -> Dropout.
    depth = hp.Int('num_layers', 1, 3, default=2)
    for layer_idx in range(depth):
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=256, step=32, default=64)
        drop = hp.Float(f'dropout_{layer_idx}', 0.0, 0.5, step=0.1, default=0.2)
        weight_decay = hp.Float(f'l2_reg_{layer_idx}', 1e-5, 1e-3, sampling='LOG', default=1e-4)

        hidden_block = (
            layers.Dense(width, activation='relu',
                         kernel_regularizer=regularizers.l2(weight_decay)),
            layers.BatchNormalization(),
            layers.Dropout(drop),
        )
        for block_layer in hidden_block:
            network.add(block_layer)

    # Single linear unit for the regression output.
    network.add(layers.Dense(1))

    step_size = hp.Float('learning_rate', 1e-4, 1e-2, sampling='LOG', default=1e-3)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=step_size),
        loss='mse',
        metrics=['mae'],
    )
    return network

# Set up the tuner: Here we use Hyperband search algorithm
tuner = kt.Hyperband(
    build_model,
    objective='val_mae',
    max_epochs=50,
    factor=3,
    directory='dnn_tuner_dir',
    project_name='force_prediction_tuning'
)

# Early stopping for the search trials: abandon a trial once val_loss has not
# improved for 5 consecutive epochs.
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Perform hyperparameter search
tuner.search(X_train, y_train,
             epochs=50,
             validation_split=0.2,
             callbacks=[stop_early],
             verbose=1)

# Retrieve the best hyperparameters and build the best model
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters found:", best_hp.values)

# Build the best model and train it.
# For the final fit the callback also restores the best weights; otherwise
# the saved model would be the one from `patience` epochs *after* the best
# validation loss was reached.
best_model = tuner.hypermodel.build(best_hp)
final_stop_early = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True)
history = best_model.fit(X_train, y_train,
                         epochs=50,
                         validation_split=0.2,
                         callbacks=[final_stop_early],
                         verbose=1)


best_model.save('optimized_dnn_model.h5')
print("Optimized DNN model has been saved successfully.")



Trial 90 Complete [00h 01m 18s]
val_mae: 50100368.0

Best val_mae So Far: 43296880.0
Total elapsed time: 00h 31m 35s
Best hyperparameters found: {'num_layers': 2, 'units_0': 192, 'dropout_0': 0.1, 'l2_reg_0': 0.0009807310652433297, 'units_1': 224, 'dropout_1': 0.1, 'l2_reg_1': 1.164331764071064e-05, 'learning_rate': 0.009925530546395422, 'units_2': 224, 'dropout_2': 0.1, 'l2_reg_2': 0.000458502351256499, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0076'}
Epoch 1/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 2701617107304448.0000 - mae: 50178972.0000 - val_loss: 2727466367975424.0000 - val_mae: 50450088.0000
Epoch 2/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2717974490251264.0000 - mae: 50307476.0000 - val_loss: 2725658723614720.0000 - val_mae: 50435756.0000
Epoch 3/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - l



Optimized DNN model has been saved successfully.


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import keras_tuner as kt
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a dedicated scaler on the training split, apply it to both splits and
# persist it to disk.
scaler_dnn_log = StandardScaler().fit(X_train)
X_train_scaled = scaler_dnn_log.transform(X_train)
X_val_scaled = scaler_dnn_log.transform(X_val)
with open('scaler_dnn_log.pkl', 'wb') as f:
    pickle.dump(scaler_dnn_log, f)

# X_train, y_train, X_val and y_val come from the earlier preprocessing cells.
# The targets are log-transformed here so the network regresses log(force_meas).
y_train_log, y_val_log = np.log(y_train), np.log(y_val)

# Width of the network's input layer = number of feature columns.
input_dim = X_train.shape[1]

# Define the model-building function for the tuner
def build_advanced_model(hp):
    """Construct and compile the deeper network that predicts log(force_meas).

    Search space: 4-6 hidden blocks; per block the Dense width (32-512),
    activation (ReLU or LeakyReLU), dropout rate, L2 penalty and whether a
    BatchNormalization layer follows; plus the Adam learning rate. The input
    width is read from the global `input_dim`.
    """
    net = keras.Sequential()
    net.add(layers.Input(shape=(input_dim,)))

    n_hidden = hp.Int('num_layers', 4, 6)
    for idx in range(n_hidden):
        # Hyperparameters of this block (registered in a fixed order).
        width = hp.Int(f'units_{idx}', min_value=32, max_value=512, step=32)
        act_name = hp.Choice(f'activation_{idx}', values=['relu', 'leaky_relu'])
        drop = hp.Float(f'dropout_{idx}', min_value=0.0, max_value=0.5, step=0.1)
        weight_decay = hp.Float(f'l2_reg_{idx}', min_value=1e-6, max_value=1e-3, sampling='LOG')

        # Linear Dense layer with L2 penalty; the non-linearity is a separate
        # layer so LeakyReLU can be swapped in.
        net.add(layers.Dense(width, kernel_regularizer=regularizers.l2(weight_decay)))
        non_linearity = (
            layers.LeakyReLU(alpha=0.1) if act_name == 'leaky_relu'
            else layers.Activation('relu')
        )
        net.add(non_linearity)
        net.add(layers.Dropout(drop))

        # Batch normalization after the block is itself a tunable switch.
        use_batchnorm = hp.Boolean(f'batchnorm_{idx}')
        if use_batchnorm:
            net.add(layers.BatchNormalization())

    # Single linear unit: the predicted log(force_meas).
    net.add(layers.Dense(1))

    step_size = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=step_size),
        loss='mse',
        metrics=['mae'],
    )
    return net

# Use Keras Tuner for hyperparameter tuning with RandomSearch
tuner = kt.RandomSearch(
    build_advanced_model,
    objective='val_mae',
    max_trials=30,
    executions_per_trial=1,
    directory='advanced_dnn_tuning',
    project_name='rolling_force_advanced'
)

# Search, train and evaluate on the arrays produced by scaler_dnn_log.
# Previously X_train_scaled / X_val_scaled were computed (and the scaler was
# saved to scaler_dnn_log.pkl) but the network was fed X_train / X_val, so the
# persisted scaler did not correspond to the inputs the model was trained on.
tuner.search(X_train_scaled, y_train_log, epochs=50,
             validation_data=(X_val_scaled, y_val_log))

# Retrieve the best hyperparameters
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters found:", best_hp.values)

# Build the best model using the best hyperparameters
best_advanced_model = tuner.hypermodel.build(best_hp)

# Halve the learning rate after 5 stagnant epochs; stop after 10 and roll
# back to the weights with the best validation loss.
lr_reducer = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the best model
history = best_advanced_model.fit(X_train_scaled, y_train_log,
                                  epochs=100,
                                  validation_data=(X_val_scaled, y_val_log),
                                  callbacks=[lr_reducer, early_stopper])

# Save the optimized model (using HDF5 format; you can also use the native Keras format)
best_advanced_model.save("optimized_dnn_model_advanced_log.h5")

# -------------------------------
# Evaluate the optimized model on the validation set
# -------------------------------
# The network predicts log(force_meas); exponentiate to get back to the
# original scale before computing the error metrics.
y_val_pred_log = best_advanced_model.predict(X_val_scaled).flatten()
y_val_pred = np.exp(y_val_pred_log)
# Compare against the untransformed targets directly instead of exp(log(y)).
y_val_orig = y_val

# Calculate evaluation metrics on the original scale
val_mae = mean_absolute_error(y_val_orig, y_val_pred)
val_rmse = np.sqrt(mean_squared_error(y_val_orig, y_val_pred))
print("Advanced Optimized DNN (with log transformation) Validation MAE: {:.4f}".format(val_mae))
print("Advanced Optimized DNN (with log transformation) Validation RMSE: {:.4f}".format(val_rmse))


Reloading Tuner from advanced_dnn_tuning/rolling_force_advanced/tuner0.json
Best hyperparameters found: {'num_layers': 5, 'units_0': 256, 'activation_0': 'leaky_relu', 'dropout_0': 0.0, 'l2_reg_0': 2.5597162009390816e-05, 'batchnorm_0': True, 'units_1': 448, 'activation_1': 'relu', 'dropout_1': 0.2, 'l2_reg_1': 3.555769929080814e-06, 'batchnorm_1': False, 'units_2': 160, 'activation_2': 'leaky_relu', 'dropout_2': 0.30000000000000004, 'l2_reg_2': 8.068735178418917e-06, 'batchnorm_2': False, 'units_3': 384, 'activation_3': 'leaky_relu', 'dropout_3': 0.0, 'l2_reg_3': 0.0009514167995360483, 'batchnorm_3': True, 'learning_rate': 0.000582968539368285, 'units_4': 64, 'activation_4': 'relu', 'dropout_4': 0.2, 'l2_reg_4': 1.1015492294935071e-05, 'batchnorm_4': True, 'units_5': 352, 'activation_5': 'leaky_relu', 'dropout_5': 0.2, 'l2_reg_5': 1.760110534391205e-05, 'batchnorm_5': False}




Epoch 1/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - loss: 281.5579 - mae: 16.6825 - val_loss: 103.0264 - val_mae: 10.1177 - learning_rate: 5.8297e-04
Epoch 2/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 65.7511 - mae: 7.5580 - val_loss: 1.7259 - val_mae: 1.1360 - learning_rate: 5.8297e-04
Epoch 3/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 4.9721 - mae: 1.5615 - val_loss: 0.3162 - val_mae: 0.2441 - learning_rate: 5.8297e-04
Epoch 4/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 3.2626 - mae: 1.1531 - val_loss: 0.3056 - val_mae: 0.2308 - learning_rate: 5.8297e-04
Epoch 5/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 2.6963 - mae: 1.0162 - val_loss: 0.2782 - val_mae: 0.2154 - learning_rate: 5.8297e-04
Epoch 6/100
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/



[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Advanced Optimized DNN (with log transformation) Validation MAE: 2182302.7720
Advanced Optimized DNN (with log transformation) Validation RMSE: 2923777.2524


In [None]:
# -*- coding: utf-8 -*-
"""
1) Load train/test
2) Preprocess (one-hot, align columns, scale)
3) Log-transform y
4) Tune + train on scaled data
5) Final evaluate on test.csv
"""

import pandas as pd
import numpy as np
import pickle
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import keras_tuner as kt

# ----------------------------
# 1. Load data
# ----------------------------
# Drop incomplete rows up front (as the earlier cells do): NaN features
# propagate through scaling into the network and turn the loss into NaN.
df = pd.read_csv('train.csv').dropna()
df_test = pd.read_csv('test.csv').dropna()

# The target is log-transformed below, so it must be strictly positive.
if (df['force_meas'] <= 0).any():
    raise ValueError("There are non-positive values in force_meas!")

# ----------------------------
# 2. Split train/val
# ----------------------------
X = df.drop(columns=['force_meas'])
y = df['force_meas'].values
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Prepare test: 'force_pre' is the baseline prediction, not a feature.
X_test = df_test.drop(columns=['force_meas','force_pre'])
y_test = df_test['force_meas'].values
baseline_pre = df_test['force_pre'].values

# ----------------------------
# 3. One-hot encode
# ----------------------------
def onehot(df_in, drop_first=True):
    """One-hot encode the 'tappingsteelgrade' column of `df_in`.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing a 'tappingsteelgrade' column.
    drop_first : bool, default True
        Forwarded to pandas.get_dummies; when True the first level found in
        `df_in` is dropped and acts as the baseline category.

    Returns
    -------
    pandas.DataFrame
        Copy of `df_in` with the column replaced by indicator columns.
    """
    return pd.get_dummies(df_in, columns=['tappingsteelgrade'], drop_first=drop_first)

# Only the training split decides which grade is the dropped baseline.
# Validation/test keep every level; the reindex below then discards the
# training baseline column. Dropping the first level per split could remove
# a different grade in each split and mis-encode those rows as the baseline.
X_train = onehot(X_train)
X_val   = onehot(X_val, drop_first=False)
X_test  = onehot(X_test, drop_first=False)

# ----------------------------
# 4. Align columns
# ----------------------------
cols = X_train.columns.tolist()
with open('training_columns.pkl','wb') as f:
    pickle.dump(cols, f)

# Columns unknown to training are dropped; missing ones are added as zeros.
X_val   = X_val.reindex(columns=cols, fill_value=0)
X_test  = X_test.reindex(columns=cols, fill_value=0)

# ----------------------------
# 5. Scale features
# ----------------------------
# The scaler is fitted on the training split only and reused for val/test.
scaler = StandardScaler().fit(X_train.values)
X_train_s = scaler.transform(X_train.values)
X_val_s   = scaler.transform(X_val.values)
X_test_s  = scaler.transform(X_test.values)

with open('scaler_dnn_log.pkl','wb') as f:
    pickle.dump(scaler, f)

# ----------------------------
# 6. Log-transform y
# ----------------------------
y_train_log = np.log(y_train)
y_val_log   = np.log(y_val)

# ----------------------------
# 7. Build tuner
# ----------------------------
# Number of feature columns after encoding and scaling.
input_dim = X_train_s.shape[1]

def build_model(hp):
    """Construct and compile the tunable network that predicts log(force_meas).

    Search space: 4-6 hidden blocks; per block the Dense width, L2 penalty,
    activation (ReLU or LeakyReLU), dropout rate and an optional
    BatchNormalization layer; plus the Adam learning rate.
    """
    net = keras.Sequential([layers.Input(shape=(input_dim,))])

    n_hidden = hp.Int('num_layers', 4, 6)
    for idx in range(n_hidden):
        width = hp.Int(f'units_{idx}', min_value=32, max_value=512, step=32)
        weight_decay = hp.Float(f'l2_{idx}', 1e-6, 1e-3, sampling='LOG')
        net.add(layers.Dense(width, kernel_regularizer=regularizers.l2(weight_decay)))

        # The non-linearity is its own layer so LeakyReLU can be selected.
        act_name = hp.Choice(f'act_{idx}', ['relu', 'leaky_relu'])
        net.add(layers.LeakyReLU() if act_name == 'leaky_relu'
                else layers.Activation('relu'))

        drop = hp.Float(f'drop_{idx}', min_value=0, max_value=0.5, step=0.1)
        net.add(layers.Dropout(drop))

        use_bn = hp.Boolean(f'bn_{idx}')
        if use_bn:
            net.add(layers.BatchNormalization())

    # Single linear output unit.
    net.add(layers.Dense(1))

    step_size = hp.Float('lr', 1e-4, 1e-2, sampling='LOG')
    net.compile(
        optimizer=keras.optimizers.Adam(step_size),
        loss='mse',
        metrics=['mae'],
    )
    return net

# Random search over the space defined by build_model, ranked by validation MAE.
tuner = kt.RandomSearch(
    build_model,
    objective='val_mae',
    max_trials=30,
    executions_per_trial=1,
    directory='advanced_dnn_tuning',
    project_name='rolling_force',
)

# Each trial stops once val_mae has not improved for 5 epochs.
search_stopper = keras.callbacks.EarlyStopping(monitor='val_mae', patience=5)
tuner.search(
    X_train_s,
    y_train_log,
    validation_data=(X_val_s, y_val_log),
    epochs=50,
    callbacks=[search_stopper],
)

best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best HP:", best_hp.values)

# ----------------------------
# 8. Train best model
# ----------------------------
model = tuner.hypermodel.build(best_hp)

# Halve the learning rate after 5 stagnant epochs; stop after 10 and restore
# the weights with the best validation loss.
callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
]
model.fit(
    X_train_s,
    y_train_log,
    validation_data=(X_val_s, y_val_log),
    epochs=100,
    batch_size=256,
    callbacks=callbacks,
)

model.save('optimized_dnn_model_advanced_log.h5')

# ----------------------------
# 9. Final evaluation on test.csv
# ----------------------------
def _regression_metrics(y_true, y_hat):
    """Return (MAE, RMSE, MSE) of `y_hat` against `y_true`."""
    return (
        mean_absolute_error(y_true, y_hat),
        np.sqrt(mean_squared_error(y_true, y_hat)),
        mean_squared_error(y_true, y_hat),
    )

# Time the forward pass over the whole test set and average per sample.
start = time.time()
y_pred_log = model.predict(X_test_s).flatten()
runtime = (time.time() - start) / len(X_test_s)

# The model outputs log(force_meas); map back to the original scale.
y_pred = np.exp(y_pred_log)

mae, rmse, mse = _regression_metrics(y_test, y_pred)
print(f"DNN Test MAE: {mae:.4f}, RMSE: {rmse:.4f}, MSE: {mse:.4f}, time/sample: {runtime:.6f}s")

# Same metrics for the baseline predictions from the force_pre column.
b_mae, b_rmse, b_mse = _regression_metrics(y_test, baseline_pre)
print(f"Baseline MAE: {b_mae:.4f}, RMSE: {b_rmse:.4f}, MSE: {b_mse:.4f}")


Trial 30 Complete [00h 11m 18s]
val_mae: 0.07284444570541382

Best val_mae So Far: 0.05394996702671051
Total elapsed time: 03h 26m 06s
Best HP: {'num_layers': 4, 'units_0': 64, 'l2_0': 2.1646574771386216e-06, 'act_0': 'leaky_relu', 'drop_0': 0.1, 'bn_0': False, 'units_1': 128, 'l2_1': 3.997743842873214e-05, 'act_1': 'relu', 'drop_1': 0.0, 'bn_1': False, 'units_2': 480, 'l2_2': 0.0004621378949210946, 'act_2': 'relu', 'drop_2': 0.2, 'bn_2': True, 'units_3': 192, 'l2_3': 2.1648531537582487e-06, 'act_3': 'leaky_relu', 'drop_3': 0.1, 'bn_3': True, 'lr': 0.0007459622916446641, 'units_4': 192, 'l2_4': 0.00010475693111475635, 'act_4': 'relu', 'drop_4': 0.2, 'bn_4': True, 'units_5': 256, 'l2_5': 3.714400064260753e-06, 'act_5': 'relu', 'drop_5': 0.30000000000000004, 'bn_5': True}
Epoch 1/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 272.5034 - mae: 16.3876 - val_loss: 67.3738 - val_mae: 8.1851 - learning_rate: 7.4596e-04
Epoch 2/100
[1m200/200[0m [32m



[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
DNN Test MAE: 2012362.2768, RMSE: 2691417.4755, MSE: 7243728027306.5898, time/sample: 0.000081s
Baseline MAE: 2898684.1133, RMSE: 3775600.3207, MSE: 14255157781834.8516


In [None]:
import json
import os
from getpass import getpass

from openai import OpenAI

# Never commit API keys to a notebook: read the key from the environment and
# fall back to an interactive prompt. The key that was previously hardcoded
# here has been exposed and should be revoked.
api_key = os.environ.get("DEEPSEEK_API_KEY") or getpass("DeepSeek API key: ")

# DeepSeek is reached through the OpenAI client pointed at its base URL.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com",
)

# System message sent with every request: it tells the model to act as a
# resume parser and to answer with a single JSON object containing the
# listed fields.
system_prompt = """
You are a resume parser. The user will provide raw resume text.
Please extract the following fields and output as a JSON object:
- name
- email
- phone
- education (list of strings)
- skills (list of strings)

Return ONLY valid JSON.
"""

user_prompt = """TAI CONG CHI NGUYEN (JOSEPH)  Saskatoon,  Saskatchewan | 6399983636 | taincc@gmail.c o m | linkedin.com/in/joseph - tainguyen  I  will be graduating  from  Artificial Intelligence and Data Analytics  Post - Graduate Certificate at Saskatchewan Polytechnic and seeking a  Full - time opportunity in Canada. I have been  working  for over 10 years. About 3 years ago, I became interested in the field of  machine  learning,  data engineering  and  business intelligence  then I started focusing my skills on data as well. I am passionate about  understanding user problems and solving them with available resources in the organization.  PROFESSIONAL  WORK  EXPERIENCE  Digital Integration Centre of Excellence (DICE)  ●  Research Assistant  June 2024  -  Present  o  Leverage  machine learning and  geospatial data  to  identify  potential gravel pits  o  Support data collection, cleaning, and preprocessing to prepare for analysis and model development  o  Train  and  test  machine learning models, such as ResNet, VGG16, and Random Forest, using satellite images, hillshade images,  and numerical datasets  o  Contribute to the creation and maintenance  of  PostgreSQL  database  o  Assist in designing visualizations and presenting key findings to partners  TMAM  Project  ●  Part - time Freelancer  January 2024  –  March 2024  o  Develop and maintain a web crawler using Python  o  ETL and  store crawled data  to  MongoDB  and  PostgreSQL  Geniebook  ●  Senior Engineer  April 2022  –  July 2023  o  Design and implement a real - time data pipeline to process structured data from many data sources using Kafka, ksqlDB,  Python and stored processed data in Cassandra  o  Migrate data from MySQL to PostgreSQL for Microservice and Odoo databases  o  Develop Google Forms and use Apps Script to automatically notification from Google Sheets  Amaris Consulting  -  Bolloré Transport & Logistics  -  Mizuho Bank  -  FPT Information System  ●  Experienced Consultant  –  Senior 
Database Developer  –  Senior Officer  –  Software Developer  September  20 09  –  April 2022  o  Confer with system analysts, developers, and other teams to translate business requirements into data modeling, database  design, efficient SQL for fast application performance  o  Enhancement of existing data migration and consolidation between various databases  o  Troubleshoot database performance related and tune complex database queries  o  Responsible for development and  maintenance of the  core banking system and operating database  o  Participated in development RPA using UIPath  o  Have knowledge of SQL Data Warehouse, ETL by SSIS and Power BI Desktop  o  Expertise in development using  MS SQL Server  and  Oracle PL/SQL  EDUCATION  Saskatchewan Polytechnic  ●  Postgraduate in Artificial Intelligence and Data Analytics  Expected  April 202 5  ●  Postgraduate in Cloud  Computing & Blockchain  April 2024  o  Award:  Saskatchewan Polytechnic Dean ’ s Honour List  University of Greenwich  ●  Bachelor of Science in Computing  20 20  National Institute of IT  ●  Diploma in Software Engineering  2009  CERTIFICATIONS  ●  Machine Learning, Deep Learning + AWS Sagemaker , Udemy  January 2025  ●  RPA  -  Process Automation using UIPATH  -  Beginner to Expert ,  Udemy  April 2022  ●  Master in Microsoft Power BI Desktop and Service ,  Udemy  March 2022  ●  Data Warehouse Developer - SQL Server/ETL/SSIS/SSAS/SSRS/T - SQL , Udemy  March 2022  A DDITIONAL  INFORMATION  ●  Languages:  Vietnamese (native),  English  (conversational proficiency)  ●  Programming Languages:  Python ,  .Net, SQL  ●  Databases:  Oracle, MS SQL, MySQL, PostgreSQL, MongoDB, Cassandra  ●  Stream processing:  Apache Kafka  ●  Cloud Services:  AWS
"""

# Conversation for the request: parsing instructions first, then the resume text.
messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}]

# Ask for JSON mode so the reply is constrained to a JSON object.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=messages,
    response_format={
        'type': 'json_object'
    }
)

# The message content can be None or empty; fail with a clear error instead
# of an opaque TypeError / JSONDecodeError from json.loads.
raw_content = response.choices[0].message.content
if not raw_content:
    raise ValueError("The model returned an empty response; no JSON to parse.")

print(json.loads(raw_content))

{'name': 'TAI CONG CHI NGUYEN (JOSEPH)', 'email': 'taincc@gmail.com', 'phone': '6399983636', 'education': ['Postgraduate in Artificial Intelligence and Data Analytics, Saskatchewan Polytechnic, Expected April 2025', 'Postgraduate in Cloud Computing & Blockchain, Saskatchewan Polytechnic, April 2024', 'Bachelor of Science in Computing, University of Greenwich, 2020', 'Diploma in Software Engineering, National Institute of IT, 2009'], 'skills': ['Machine Learning', 'Data Engineering', 'Business Intelligence', 'PostgreSQL', 'Python', 'MongoDB', 'Kafka', 'ksqlDB', 'Cassandra', 'MySQL', 'Google Apps Script', 'SQL', 'Oracle PL/SQL', 'MS SQL Server', 'ETL', 'SSIS', 'Power BI', 'UIPath', 'AWS Sagemaker', 'Apache Kafka', 'AWS', '.Net', 'Vietnamese', 'English']}
