# Train

This notebook trains a lightgbm model on the 3W dataset.

In [1]:
from hydrate.utils import get_config_path, DotConfig
config_path = get_config_path()
config = DotConfig(config_path)

In [2]:
from databricks.connect import DatabricksSession as SparkSession
spark = SparkSession.builder.serverless(True).getOrCreate()

In [None]:
df = spark.table(f"{config.catalog}.{config.schema}.{config.process.table}").toPandas()

In [6]:
config.train.tags

{1: {'tag': 'P-PDG', 'name': 'Pressure at the PDG', 'unit': 'Pa'},
 2: {'tag': 'P-TPT', 'name': 'Pressure at the TPT', 'unit': 'Pa'},
 3: {'tag': 'T-TPT', 'name': 'Temperature at the TPT', 'unit': 'degC'},
 4: {'tag': 'P-MON-CKP', 'name': 'Pressure upstream of the PCK', 'unit': 'Pa'},
 5: {'tag': 'T-JUS-CKP',
  'name': 'Temperature downstream of the PCK',
  'unit': 'degC'},
 6: {'tag': 'P-JUS-CKGL',
  'name': 'Pressure downstream of the GLCK',
  'unit': 'Pa'},
 7: {'tag': 'QGL', 'name': 'Gas lift flow rate', 'unit': 'sm^3/s'}}

In [5]:
indices = ['timestamp', 'well_number']
tags = [x['tag'] for x in tag_info.values()]
target = ['state']

ml_df = (
    df[tags + indices + target]
    .sort_values(['well_number','timestamp'])
    .ffill()
    .dropna()
    .copy()
)

X_df = ml_df[tags]
y_df = ml_df[target]

X_df.head(100)

NameError: name 'tag_info' is not defined

In [None]:
# Create binary hydrate target (state 8 = hydrate)
ml_df['is_hydrate'] = (ml_df['state'] == 8).astype(int)

print("Target distribution:")
print(ml_df['is_hydrate'].value_counts())
print(f"Hydrate prevalence: {ml_df['is_hydrate'].mean():.3f}")

# Check data distribution by well and state
print("\nStates present by well:")
state_by_well = ml_df.groupby('well_number')['state'].unique().apply(sorted)
for well, states in state_by_well.items():
    state_names_list = [state_names[s] for s in states]
    print(f"Well {well}: states {states} ({state_names_list})")

ml_df.head()


Unnamed: 0,state
0,0
1,0
2,0
3,0
4,0
...,...
95,0
96,0
97,0
98,0


In [None]:
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Prepare data for TSFresh - need well_number as id and timestamp sorted
tsfresh_df = ml_df[['well_number', 'timestamp'] + tags + ['state']].copy()
tsfresh_df = tsfresh_df.sort_values(['well_number', 'timestamp']).reset_index(drop=True)

print(f"Data shape for feature extraction: {tsfresh_df.shape}")
print(f"Wells: {tsfresh_df['well_number'].nunique()}")
print(f"Time range: {tsfresh_df['timestamp'].min()} to {tsfresh_df['timestamp'].max()}")

Data shape for feature extraction: (100, 5)
Wells: 1
Time range: 2017-08-09 16:00:26 to 2017-08-09 16:02:05


In [None]:
# Extract time series features using TSFresh with minimal feature set
# Use minimal parameters for speed - we can expand later
feature_extraction_settings = MinimalFCParameters()

# Extract features for each tag
print("Extracting time series features...")
features = extract_features(
    tsfresh_df[['well_number', 'timestamp'] + tags],
    column_id='well_number',
    column_sort='timestamp',
    default_fc_parameters=feature_extraction_settings,
    disable_progressbar=False
)

print(f"Extracted features shape: {features.shape}")
print(f"Number of features per tag: {features.shape[1] // len(tags)}")

# Create multiclass target series aligned with features (one value per well)
# Use the most frequent state for each well (mode)
target_by_well = tsfresh_df.groupby('well_number')['state'].agg(lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[-1])
y = target_by_well.loc[features.index]

print(f"Target distribution: {y.value_counts().sort_index().to_dict()}")
print(f"Target classes: {sorted(y.unique())}")
features.head()



Extracting time series features...


  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
Feature Extraction: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]

Extracted features shape: (1, 20)
Number of features per tag: 10





In [None]:
# Handle missing values and split data
X = features.fillna(0)  # Simple imputation - can improve later
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training classes: {sorted(y_train.unique())}")
print(f"Test classes: {sorted(y_test.unique())}")

# Train LightGBM model for multiclass classification
print("\nTraining LightGBM model...")
lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    metric='multi_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42
)

lgb_model.fit(X_train, y_train)
print("Model training completed!")


ValueError: With n_samples=1, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Evaluate model performance
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

y_pred = lgb_model.predict(X_test)
y_pred_proba = lgb_model.predict_proba(X_test)

print("=== Model Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=[f"State_{i}" for i in sorted(y.unique())]))

# Multiclass ROC-AUC (one-vs-rest)
try:
    from sklearn.metrics import roc_auc_score
    auc_score = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
    print(f"\nWeighted ROC-AUC Score (OvR): {auc_score:.4f}")
except:
    print("\nROC-AUC not calculated (insufficient classes in test set)")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n=== Top 10 Most Important Features ===")
print(feature_importance.head(10))

# Show some predictions with state names
print("\n=== Sample Predictions ===")
state_names = {
    0: 'Normal', 1: 'Abrupt Increase of BSW', 2: 'Spurious Closure of DHSV',
    3: 'Severe Slugging', 4: 'Flow Instability', 5: 'Rapid Productivity Loss',
    6: 'Quick Restriction in PCK', 7: 'Scaling in PCK', 8: 'Hydrate in Production Line'
}

sample_results = pd.DataFrame({
    'actual': y_test.iloc[:10],
    'predicted': y_pred[:10],
    'actual_name': y_test.iloc[:10].map(state_names),
    'predicted_name': pd.Series(y_pred[:10]).map(state_names),
    'max_probability': np.max(y_pred_proba[:10], axis=1)
})
print(sample_results)


NameError: name 'lgb_model' is not defined