In [None]:
# Core libraries: data handling, numerics and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color_pal = sns.color_palette()  # seaborn's current palette; reused for plot colors below
plt.style.use('fivethirtyeight')  # apply matplotlib's 'fivethirtyeight' style sheet to all figures

import xgboost as xgb  # gradient-boosted tree model used for forecasting

In [None]:
# Load the training data and use the raw 'timestamp' column as the row index.
df = pd.read_csv("train.csv").set_index('timestamp')

In [None]:
# Parse the ISO-8601 timestamp strings into datetimes, then order rows by time.
parsed_index = pd.to_datetime(df.index, format='ISO8601')
df = df.set_axis(parsed_index, axis=0).sort_index()
df.head()

In [None]:
# Inspect the last rows of the time-sorted frame.
df.tail()

In [None]:
# List the columns available in the raw training data.
df.columns

In [None]:
# Hourly mean of the target, drawn as points to show its overall shape over time.
df_resampled = df['bk_level'].resample('1h').mean()

plot_options = dict(
    style='.',
    figsize=(15, 5),
    color=color_pal[0],
    title='bk_level',
)
df_resampled.plot(**plot_options)

plt.show()

# Feature Creation

In [None]:
# Inspect the parsed datetime index (dtype / timezone) before deriving time features.
df.index

In [None]:
# Column list before feature creation.
df.columns

In [None]:
def create_features(df):
    """Return a copy of ``df`` with process-timing features added.

    The ``starttime`` and ``endtime`` columns are parsed as ISO-8601 strings
    into UTC datetimes. The frame's index is used as the row timestamp; when
    it carries no timezone it is localized to UTC so it can be compared with
    the parsed columns.

    Added columns (all in seconds):
      * ``elapsed_time``  - row timestamp minus ``starttime``, floored at 0.
      * ``remaining_sec`` - ``endtime`` minus row timestamp, floored at 0.
      * ``proc_dur``      - ``endtime`` minus ``starttime`` (not clipped).

    The input frame is not modified.
    """
    out = df.copy()
    for time_col in ('starttime', 'endtime'):
        out[time_col] = pd.to_datetime(out[time_col], format='ISO8601', utc=True)

    # The index holds each row's timestamp; make it tz-aware (UTC) if it is not.
    stamps = out.index
    if getattr(stamps, 'tz', None) is None:
        stamps = stamps.tz_localize('UTC')

    start = out['starttime']
    end = out['endtime']

    # Seconds since the process started; changes with every row.
    since_start = (stamps - start).dt.total_seconds()
    out['elapsed_time'] = since_start.clip(lower=0)

    # Seconds left until the process ends.
    until_end = (end - stamps).dt.total_seconds()
    out['remaining_sec'] = until_end.clip(lower=0)

    # Total process duration; depends only on starttime/endtime, not on the row timestamp.
    out['proc_dur'] = (end - start).dt.total_seconds()

    return out

df = create_features(df)

# Print the first three values of each new timing column as a quick sanity check.
for banner, column in (
    ('elapsed_time ornekleri (ilk 3, farkli olmalı):', 'elapsed_time'),
    ('remaining_sec ornekleri (ilk 3):', 'remaining_sec'),
    ('proc_dur (sabit olmali):', 'proc_dur'),
):
    print(banner)
    print(df[column].head(3).values)

In [None]:
# Column list after feature creation (timing columns should now be present).
df.columns

# Visualize our Feature / Target Relationship

In [None]:
# Distribution of the target per batch.
# NOTE(review): the 150x50 inch canvas is very large - presumably to keep every
# batchkey box legible; confirm it renders acceptably in your environment.
fig, ax = plt.subplots(figsize=(150, 50))
sns.boxplot(x='batchkey', y='bk_level', data=df, ax=ax)
plt.show()

# Cross Validation Setup (GroupKFold)

In [None]:
from sklearn.model_selection import GroupKFold

In [None]:
# Load the test set for a first look; it is re-read and indexed in the test section further down.
test_csv = pd.read_csv("test.csv")
test_csv

In [None]:
# Keep rows in chronological order: the lag features built next depend on row order.
# `df` already carries the timing features from the feature-creation cell, and the
# GroupKFold splitter is created in the training cell, so neither is rebuilt here
# (the previous unused full-frame copy was pure overhead).
df = df.sort_index()

# Lag Features

In [None]:
# Inspect bk_target_level, the column the lag features are derived from.
df['bk_target_level']

In [None]:
# Inspect the bk_dosage_valve column.
df['bk_dosage_valve']

In [None]:
# Inspect the dosage_curve_type column.
df['dosage_curve_type']

In [None]:
def add_lags(df):
    """Return a copy of ``df`` with lag features of ``bk_target_level`` added.

    Rows are grouped by ``(machineid, commandno)`` and the target level is
    shifted by three rows inside each group, so a lag never crosses a group
    boundary. The shift follows the frame's current row order, so the caller
    must have the rows in chronological order.

    Added columns:
      * ``target_lag3``      - ``bk_target_level`` three rows earlier in the
        same group (NaN for the first three rows of each group).
      * ``target_lag3_diff`` - current ``bk_target_level`` minus ``target_lag3``.

    The input frame is left untouched (consistent with ``create_features``);
    use the returned frame.
    """
    # Work on a copy so the caller's frame is not silently mutated.
    df = df.copy()
    target_by_process = df.groupby(['machineid', 'commandno'])['bk_target_level']
    df['target_lag3'] = target_by_process.shift(3)
    df['target_lag3_diff'] = df['bk_target_level'] - df['target_lag3']
    return df

# Add the per-(machineid, commandno) lag columns to the training frame.
df = add_lags(df)

In [None]:
# Drop rows without a lag value (the first three rows of every group have no 3-step history).
df = df.dropna(subset=["target_lag3", "target_lag3_diff"])

In [None]:
# --- FIX: rows with bk_level=0 are not scored, so they are filtered out ---
# Only rows with a strictly positive bk_level are kept.
n_before = df.shape[0]
has_positive_level = df['bk_level'].gt(0)
df = df.loc[has_positive_level].copy()
n_after = df.shape[0]
print(f'bk_level=0 filtrelemesi: {n_before:,} -> {n_after:,} satir')
print(f'Cikarilan satir: {n_before - n_after:,} ({(n_before-n_after)/n_before*100:.1f}%)')
print(f'Kalan satirlar: {n_after:,}')

In [None]:
# First rows after lag creation and filtering.
df.head()

In [None]:
# Last rows after lag creation and filtering.
df.tail()

In [None]:
# Column list before the process id is added.
df.columns

In [None]:
# Build one id per contiguous run of `commandno` on each machine; used as the
# GroupKFold group so a process never lands in both train and validation.
#
# The change flag is computed per machine, so the running count must be per
# machine too. A single cumsum over the whole time-sorted frame would bump the
# id of machine B's ongoing process every time machine A switched command,
# splitting one process across several groups and leaking it across folds.
machine_ids = df['machineid'].to_numpy()
command_changed = df.groupby('machineid')['commandno'].diff().ne(0)  # True on a machine's first row too
run_in_machine = command_changed.groupby(machine_ids).cumsum().to_numpy()
# Collapse (machine, run number) into a single integer label per process.
df['proses_id'] = df.groupby([machine_ids, run_in_machine]).ngroup().to_numpy()

# Train Using Cross Validation

In [None]:
from sklearn.metrics import mean_absolute_error

# Model inputs; the order is kept fixed so feature names line up across fits.
FEATURES = [
    'target_lag3', 'kk_target_level',
    'commandno', 'slow_dosage_valve', 'elapsed_time', 'stepno',
    'kk_irtibat_valve', 'kk_dosage_valve', 'bk_target_level', 'target_lag3_diff',
    'ak_level', 'fabric_weight', 'bk_irtibat_valve',
    'remaining_sec', 'proc_dur',
]

TARGET = 'bk_level'

# Five folds, split so that all rows sharing a proses_id stay in the same fold.
gkf = GroupKFold(n_splits=5)
fold = 0
scores = []

# Hyper-parameters shared by every fold's model.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    n_estimators=5000,
    objective='reg:absoluteerror',
    max_depth=3,
    learning_rate=0.01,
    tree_method='hist',
    device='cuda',  # GPU
)

for train_idx, val_idx in gkf.split(df, groups=df['proses_id']):
    train_part = df.iloc[train_idx]
    val_part = df.iloc[val_idx]

    X_train, y_train = train_part[FEATURES], train_part[TARGET]
    X_test, y_test = val_part[FEATURES], val_part[TARGET]

    print(f"fold: {fold}")

    reg = xgb.XGBRegressor(**xgb_params)
    reg.fit(X_train, y_train, verbose=True)
    y_pred = reg.predict(X_test)

    # Mean absolute error on the held-out processes of this fold.
    score = mean_absolute_error(y_test, y_pred)
    scores.append(score)
    fold += 1

In [None]:
# Summarize cross-validation: mean of the per-fold MAE values, then each fold's score.
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')

# Feature Importance

In [None]:
# One-column table of importances from `reg` (the model fitted in the last CV fold),
# indexed by feature name.
fi = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=reg.feature_names_in_,
)

In [None]:
# Horizontal bars, sorted ascending so the most important feature ends up on top.
fi_sorted = fi.sort_values('importance')
fi_sorted.plot.barh(title='Feature Importance')
plt.show()

In [None]:
# Keep the importances sorted ascending and display the table.
fi = fi.sort_values('importance')
fi

In [None]:
# Column list of the frame used for the final fit.
df.columns

In [None]:
# Retraining: fit the final model on all remaining training rows.
#
# `df` already carries the timing and lag features used during cross-validation.
# They are deliberately NOT recomputed here: running add_lags again on the frame
# after the dropna / bk_level filters would make shift(3) skip over the removed
# rows and put NaNs back into the first three rows of every group, so the final
# model would be trained on different lag features than the ones validated.

FEATURES = ['target_lag3', 'kk_target_level', 
            'commandno', 'slow_dosage_valve', 'elapsed_time', 'stepno', 
            'kk_irtibat_valve', 'kk_dosage_valve', 'bk_target_level', 'target_lag3_diff', 
            'ak_level', 'fabric_weight', 'bk_irtibat_valve',
            'remaining_sec', 'proc_dur']
TARGET = 'bk_level'

X_all = df[FEATURES]
y_all = df[TARGET]

# Same hyper-parameters as the cross-validation models.
reg_final = xgb.XGBRegressor(base_score=0.5, 
                             booster='gbtree',    
                             n_estimators=5000, 
                             objective='reg:absoluteerror',
                             max_depth=3,
                             learning_rate=0.01,
                             tree_method='hist',
                             device='cuda') 
reg_final.fit(X_all, y_all, verbose=100)

# Test Set Prediction

In [None]:
# Load the test set, index it by its timestamp column and order it by time.
# NOTE(review): the training file is indexed on 'timestamp' but this reads
# 'ztimestamp' - confirm that is the real column name in test.csv.
test_csv = pd.read_csv("test.csv").set_index('ztimestamp')
test_times = pd.to_datetime(test_csv.index, format='ISO8601')
test_csv = test_csv.set_axis(test_times, axis=0).sort_index()

In [None]:
# First rows of the time-sorted test set.
test_csv.head()

In [None]:
# Lag history for the test rows.
# add_lags shifts by 3 rows inside each (machineid, commandno) group, so every
# group needs its own last 3 training rows. The global last 5 rows of `df`
# would only cover whichever groups happen to end the training data.
LAG_GROUP_KEYS = ['machineid', 'commandno']
N_LAG_ROWS = 3
history_tail = df.groupby(LAG_GROUP_KEYS).tail(N_LAG_ROWS)

# Mark the origin of each row so test rows can be picked out again after sorting.
test_and_history = pd.concat([
    history_tail.assign(_is_test=False),
    test_csv.assign(_is_test=True),
])

# The shift follows row order, so put history and test rows on one timeline.
# A stable sort keeps the test rows in the same relative order as `test_csv`.
test_and_history = test_and_history.sort_index(kind='stable')

test_and_history = create_features(test_and_history)
test_and_history = add_lags(test_and_history)

# Select exactly the test rows (in `test_csv` order) and predict.
is_test_row = test_and_history['_is_test'].to_numpy(dtype=bool)
X_test_final = test_and_history.loc[is_test_row, FEATURES]
test_predictions = reg_final.predict(X_test_final)

In [None]:
# Load the sample submission file to see the expected layout.
sample_csv = pd.read_csv("sample_submission_sample.csv")
sample_csv.head()

In [None]:
# Attach the predictions, rename to the submission headers and write the file.
# NOTE(review): predictions are assigned by position and follow the time-sorted
# order of `test_csv`; confirm the sample file's rows are in that same order.
submission_columns = {'row_id': 'Id', 'bk_level': 'Predicted'}
sample_csv = sample_csv.assign(bk_level=test_predictions).rename(columns=submission_columns)
sample_csv.to_csv('sample_submission.csv', index=False)

In [None]:
# Read the written submission back and display it as a sanity check.
df1 = pd.read_csv("sample_submission.csv")
df1