In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Preprocess Data

In [2]:
# Read the training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv('CO2_Injection_rate train.csv'),
    pd.read_csv('CO2_Injection_rate test.csv'),
)

In [3]:
# Preview the first five rows of the raw training frame
train_df.head(n=5)

Unnamed: 0,Rand,inj_diff,Date Time,Avg_PLT_CO2VentRate_TPH,Avg_CCS1_WHCO2InjPs_psi,Avg_CCS1_WHCO2InjTp_F,Avg_CCS1_ANPs_psi,Avg_CCS1_DH6325Ps_psi,Avg_CCS1_DH6325Tp_F,Avg_VW1_WBTbgPs_psi,...,Avg_VW1_Z04D6837Ps_psi,Avg_VW1_Z04D6837Tp_F,Avg_VW1_Z03D6945Ps_psi,Avg_VW1_Z03D6945Tp_F,Avg_VW1_Z02D6982Ps_psi,Avg_VW1_Z02D6982Tp_F,Avg_VW1_Z01D7061Ps_psi,Avg_VW1_Z01D7061Tp_F,Avg_VW1_Z0910D5482Ps_psi,Avg_VW1_Z0910D5482Tp_F
0,13088,0.110069,11/22/24 4:00,0.0,1105.445866,71.965004,591.478453,3320.663664,107.016855,2232.331462,...,3111.267204,119.735317,3226.319375,120.417687,3244.126398,121.219358,3230.166178,122.555278,2399.203387,112.251281
1,22365,-0.356148,11/22/24 5:00,0.0,1096.617687,72.497477,570.684185,3318.985713,106.693605,2232.33335,...,3111.254573,119.743216,3227.475139,120.42536,3245.334563,121.228096,3230.438356,122.564759,2399.141585,112.255129
2,64032,-0.153921,11/22/24 6:00,0.0,1096.485427,73.214516,565.32236,3322.9288,106.402567,2232.451115,...,3111.326764,119.735317,3228.699017,120.425651,3246.502317,121.227157,3230.686926,122.555278,2399.120473,112.254176
3,83725,-0.435417,11/22/24 7:00,0.0,1105.19265,76.607459,620.087081,3315.569511,106.411401,2232.515324,...,3111.366945,119.737957,3229.784178,120.427391,3247.668833,121.223377,3230.968732,122.573637,2399.065426,112.258511
4,13827,0.639583,11/22/24 8:00,0.0,1114.396364,76.466836,686.325817,3315.742818,106.941482,2232.518034,...,3111.368201,119.732648,3230.823012,120.429071,3248.77557,121.221532,3231.24846,122.585576,2398.982796,112.277292


In [4]:
# List the raw column names; note that the target header is 'inj_diff ' with a
# trailing space at this point (it is stripped later during preprocessing).
train_df.columns

Index(['Rand', 'inj_diff ', 'Date Time', 'Avg_PLT_CO2VentRate_TPH',
       'Avg_CCS1_WHCO2InjPs_psi', 'Avg_CCS1_WHCO2InjTp_F', 'Avg_CCS1_ANPs_psi',
       'Avg_CCS1_DH6325Ps_psi', 'Avg_CCS1_DH6325Tp_F', 'Avg_VW1_WBTbgPs_psi',
       'Avg_VW1_WBTbgTp_F', 'Avg_VW1_ANPs_psi', 'Avg_VW1_Z11D4917Ps_psi',
       'Avg_VW1_Z11D4917Tp_F', 'Avg_VW1_Z10D5001Ps_psi',
       'Avg_VW1_Z10D5001Tp_F', 'Avg_VW1_Z09D5653Ps_psi',
       'Avg_VW1_Z09D5653Tp_F', 'Avg_VW1_Z08D5840Ps_psi',
       'Avg_VW1_Z08D5840Tp_F', 'Avg_VW1_Z07D6416Ps_psi',
       'Avg_VW1_Z07D6416Tp_F', 'Avg_VW1_Z06D6632Ps_psi',
       'Avg_VW1_Z06D6632Tp_F', 'Avg_VW1_Z04D6837Ps_psi',
       'Avg_VW1_Z04D6837Tp_F', 'Avg_VW1_Z03D6945Ps_psi',
       'Avg_VW1_Z03D6945Tp_F', 'Avg_VW1_Z02D6982Ps_psi',
       'Avg_VW1_Z02D6982Tp_F', 'Avg_VW1_Z01D7061Ps_psi',
       'Avg_VW1_Z01D7061Tp_F', 'Avg_VW1_Z0910D5482Ps_psi',
       'Avg_VW1_Z0910D5482Tp_F'],
      dtype='object')

In [5]:
# Count the missing values in each column of the training frame
train_df.isna().sum()

Rand                         0
inj_diff                     0
Date Time                    0
Avg_PLT_CO2VentRate_TPH      0
Avg_CCS1_WHCO2InjPs_psi      0
Avg_CCS1_WHCO2InjTp_F        0
Avg_CCS1_ANPs_psi            0
Avg_CCS1_DH6325Ps_psi        0
Avg_CCS1_DH6325Tp_F          0
Avg_VW1_WBTbgPs_psi          0
Avg_VW1_WBTbgTp_F            0
Avg_VW1_ANPs_psi             2
Avg_VW1_Z11D4917Ps_psi       0
Avg_VW1_Z11D4917Tp_F         0
Avg_VW1_Z10D5001Ps_psi       0
Avg_VW1_Z10D5001Tp_F         0
Avg_VW1_Z09D5653Ps_psi       0
Avg_VW1_Z09D5653Tp_F         0
Avg_VW1_Z08D5840Ps_psi       0
Avg_VW1_Z08D5840Tp_F         0
Avg_VW1_Z07D6416Ps_psi      30
Avg_VW1_Z07D6416Tp_F        30
Avg_VW1_Z06D6632Ps_psi      30
Avg_VW1_Z06D6632Tp_F        30
Avg_VW1_Z04D6837Ps_psi      32
Avg_VW1_Z04D6837Tp_F        32
Avg_VW1_Z03D6945Ps_psi      32
Avg_VW1_Z03D6945Tp_F        32
Avg_VW1_Z02D6982Ps_psi      32
Avg_VW1_Z02D6982Tp_F        32
Avg_VW1_Z01D7061Ps_psi      32
Avg_VW1_Z01D7061Tp_F        32
Avg_VW1_

In [6]:
def preprocess_datetime_features_interactions(df, rolling_window=24, ema_span=24):
    """Add calendar, interaction, rolling and group-mean features to ``df``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Rand', 'Date Time' (strings in
        ``%m/%d/%y %H:%M`` format), 'Avg_PLT_CO2VentRate_TPH',
        'Avg_CCS1_WHCO2InjPs_psi' and 'Avg_CCS1_WHCO2InjTp_F'. Header names
        may carry leading/trailing whitespace; it is stripped first.
    rolling_window : int, default 24
        Number of rows in the rolling mean of the vent rate.
    ema_span : int, default 24
        Span of the exponential moving average of the vent rate.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the new feature columns added and the 'Rand' and
        'Date Time' columns removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    * The forward fill, rolling mean, EMA and ``diff`` all operate on the
      frame's current row order; nothing is sorted here.
      NOTE(review): these features are only meaningful when the rows are in
      chronological order -- confirm the caller's frames are sorted.
    * The first ``rolling_window - 1`` rows of 'rolling_vent_rate' and the
      first row of 'pressure_diff' are NaN, and leading NaNs in the input
      cannot be forward-filled.
    * The group means are computed from the frame that is passed in, so train
      and test frames get different values for the same hour / weekend flag.
      NOTE(review): consider computing them on the training data only.
    """
    # Clean column names first (remove extra spaces) so every lookup below
    # works even when a header such as 'Date Time ' is padded.
    df.columns = df.columns.str.strip()

    required = (
        'Rand',
        'Date Time',
        'Avg_PLT_CO2VentRate_TPH',
        'Avg_CCS1_WHCO2InjPs_psi',
        'Avg_CCS1_WHCO2InjTp_F',
    )
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Convert to datetime and extract time features
    df['Date Time'] = pd.to_datetime(df['Date Time'], format='%m/%d/%y %H:%M')
    timestamps = df['Date Time'].dt
    df['hour'] = timestamps.hour
    df['day'] = timestamps.day
    df['month'] = timestamps.month
    df['year'] = timestamps.year
    df['day_of_week'] = timestamps.dayofweek  # 0=Monday, 6=Sunday
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Forward-fill missing values. DataFrame.ffill replaces the deprecated
    # fillna(method='ffill') spelling.
    df.ffill(inplace=True)

    # Create an interaction feature (wellhead pressure * temperature)
    df['pressure_temp_interaction'] = df['Avg_CCS1_WHCO2InjPs_psi'] * df['Avg_CCS1_WHCO2InjTp_F']

    # Rolling mean of CO2 vent rate
    df['rolling_vent_rate'] = df['Avg_PLT_CO2VentRate_TPH'].rolling(window=rolling_window).mean()

    # Mean pressure on weekends vs weekdays
    df['mean_pressure_by_weekend'] = df.groupby('is_weekend')['Avg_CCS1_WHCO2InjPs_psi'].transform('mean')

    # Exponential moving average of CO2 vent rate
    df['Vent_rate_EMA'] = df['Avg_PLT_CO2VentRate_TPH'].ewm(span=ema_span).mean()

    # Row-to-row change in wellhead pressure
    df['pressure_diff'] = df['Avg_CCS1_WHCO2InjPs_psi'].diff()

    # Mean pressure by hour
    df['mean_pressure_by_hour'] = df.groupby('hour')['Avg_CCS1_WHCO2InjPs_psi'].transform('mean')

    # Drop columns not used for modeling
    df.drop(columns=['Rand', 'Date Time'], inplace=True)

    return df

In [7]:
# Run the shared preprocessing routine on the training frame, then on the test frame
train_df, test_df = (
    preprocess_datetime_features_interactions(train_df),
    preprocess_datetime_features_interactions(test_df),
)

  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)


Drop Rows with Missing Values (Train Data)

In [8]:
# Discard, in place, every training row that still holds a NaN in any column
# (for example the leading rows of the rolling-mean and diff features)
train_df.dropna(axis=0, how='any', inplace=True)

Prepare Features and Target

In [9]:
# Separate the 'inj_diff' target column from the predictor columns
y = train_df.loc[:, 'inj_diff']
X = train_df.drop('inj_diff', axis=1)

Model Tuning with Time Series Split and GridSearchCV

In [10]:
# Base regressor whose hyper-parameters are tuned below
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Candidate values for each hyper-parameter (2 * 3 * 3 * 3 * 3 = 162 combinations)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}

# Time series-aware cross-validation splitter with five splits
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search over the grid, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X, y)

print("Best parameters found:", grid_search.best_params_)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters found: {'colsample_bytree': 0.9, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}


Train Final Model on Full Training Data

In [11]:
# Take the best estimator found by the grid search.
# NOTE(review): with GridSearchCV's default refit=True this estimator has
# presumably been refit on all of X, y -- confirm against the scikit-learn docs.
best_model = grid_search.best_estimator_
# In-sample predictions on the training features
y_pred_train = best_model.predict(X)

In [12]:
# In-sample MSE, MAE and R² (caution: training metrics may be optimistic)
mse_train, mae_train, r2_train = (
    score_fn(y, y_pred_train)
    for score_fn in (mean_squared_error, mean_absolute_error, r2_score)
)

In [13]:
# Report the three training metrics, one per line
print(
    f"Train MSE: {mse_train}",
    f"Train MAE: {mae_train}",
    f"Train R²: {r2_train}",
    sep="\n",
)

Train MSE: 4.792992435595749
Train MAE: 0.9514331360966088
Train R²: 0.7314093589358028


 Prepare Test Data and Predict

In [14]:
# Preview the first five rows of the preprocessed test frame
test_df.head(n=5)

Unnamed: 0,Avg_PLT_CO2VentRate_TPH,Avg_CCS1_WHCO2InjPs_psi,Avg_CCS1_WHCO2InjTp_F,Avg_CCS1_ANPs_psi,Avg_CCS1_DH6325Ps_psi,Avg_CCS1_DH6325Tp_F,Avg_VW1_WBTbgPs_psi,Avg_VW1_WBTbgTp_F,Avg_VW1_ANPs_psi,Avg_VW1_Z11D4917Ps_psi,...,month,year,day_of_week,is_weekend,pressure_temp_interaction,rolling_vent_rate,mean_pressure_by_weekend,Vent_rate_EMA,pressure_diff,mean_pressure_by_hour
0,0.0,774.488037,38.82145,549.814482,2988.571802,115.4866,2231.816545,104.111751,2.602844,2073.531816,...,12,2024,1,0,30066.748468,,1135.726042,0.0,,1091.33189
1,0.0,1107.391891,80.355995,582.318649,3240.670603,116.430401,2229.016689,104.330812,0.567504,2073.734361,...,1,2025,5,1,88985.577666,,1167.803427,0.0,332.903854,1106.589741
2,0.0,1325.434039,95.65519,662.075168,3282.452722,127.412783,2231.374359,104.182041,2.672424,2073.576147,...,12,2024,0,0,126784.645204,,1135.726042,0.0,218.042148,1106.589741
3,0.0,1226.120394,88.92643,562.333082,3259.085547,122.714026,2232.036442,104.105361,2.88208,2073.62243,...,12,2024,6,1,109034.509241,,1167.803427,0.0,-99.313645,1194.941688
4,0.0,1155.092736,86.224892,621.546616,3190.583272,122.441536,2229.860781,104.325892,6.20166,2073.723975,...,1,2025,2,0,99597.746238,,1135.726042,0.0,-71.027658,1172.014652


In [15]:
# Work on an independent deep copy so the preprocessed test frame stays untouched
X_test_final = test_df.copy(deep=True)

In [16]:
# Predict the target for every row of the test feature frame with the tuned model
y_pred_test = best_model.predict(X_test_final)

In [17]:
# Print the full array of test-set predictions
print(y_pred_test)

[ 4.34518433e+00  6.23599577e+00  5.19375610e+00 -1.69891701e+01
 -9.95290470e+00  8.10373497e+00 -1.49220028e+01  4.53550136e-03
  3.00159025e+00  8.61289120e+00 -1.38324051e+01  1.14018059e+01
 -3.10645962e+00  5.47529078e+00 -1.40905733e+01  1.12798119e+01
  6.65827617e-02  2.27627730e+00 -9.24185216e-02 -1.40731764e+01
 -8.66785526e+00  5.17742062e+00  6.15509748e-02  8.40087712e-01
 -1.77126005e-01 -7.85887763e-02 -1.54609404e+01  5.79975545e-01
  1.21427011e+01 -1.33335142e+01 -2.76779819e+00  9.69252968e+00
 -1.37557096e+01 -3.55393744e+00  7.61467695e+00  5.47303915e+00
 -2.04000130e+01  8.69861698e+00 -3.78594661e+00 -9.53008747e+00
 -1.03237514e+01  7.85242510e+00 -1.09769486e-01 -5.92011499e+00
  2.75221896e+00  4.85751152e+00  2.52343202e+00 -2.08054805e+00
  7.71066236e+00 -1.36168152e-01 -5.63966894e+00  5.49199724e+00
 -4.08469486e+00  8.81302357e+00 -1.09769486e-01 -5.79617471e-02
  7.10433578e+00  1.05801359e-01 -1.91071868e+00  9.20093632e+00
  5.39468288e+00 -1.28548