In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from pandasql import sqldf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Load the raw sales workbook into a DataFrame.
# The path is written as a raw string: the previous literal mixed doubled
# backslashes with a bare "\p", which is an invalid escape sequence and
# triggers a warning on current Python versions. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific location — consider a
# configurable data directory so the notebook runs on other machines.
SALES_XLSX_PATH = r'C:\Users\osungar\Desktop\projects\sales_forecasting\sales_project\data\satis_new.xlsx'
df = pd.read_excel(SALES_XLSX_PATH)


In [4]:
# Keep only the columns used downstream: the stock id, the two quantity
# totals with their unit codes, and the sale date.
# Plain column selection replaces the earlier pandasql query. That query did
# no filtering or aggregation, yet it copied the whole frame through SQLite
# and, in the recorded output, returned SATIS_TARIHI as text.
# .copy() makes df_clean independent of df, so later column assignments on
# df_clean do not touch the raw frame.
selected_columns = [
    'STOK_NO',
    'TOPLAM_BIRINCI_OB',
    'OB1',
    'TOPLAM_IKINCI_OB',
    'OB2',
    'SATIS_TARIHI',
]
df_clean = df[selected_columns].copy()
df_clean.head(50)

Unnamed: 0,STOK_NO,TOPLAM_BIRINCI_OB,OB1,TOPLAM_IKINCI_OB,OB2,SATIS_TARIHI
0,1743,303.75,KG,7.0,AD,2015-04-01 00:00:00.000000
1,743090,198.0,KG,1.564,B3,2015-04-01 00:00:00.000000
2,1737,1558.5,KG,30.0,AD,2015-04-01 00:00:00.000000
3,1979,268.0,KG,1.0,B3,2015-04-01 00:00:00.000000
4,1736,762.0,KG,23.0,AD,2015-04-01 00:00:00.000000
5,1988,132.0,KG,0.989,B3,2015-04-01 00:00:00.000000
6,1725,1623.15,KG,32.0,AD,2015-04-01 00:00:00.000000
7,1753,301.8,KG,9.0,AD,2015-04-01 00:00:00.000000
8,1942,657.0,KG,4.954,B3,2015-04-01 00:00:00.000000
9,1744,152.85,KG,3.0,AD,2015-04-01 00:00:00.000000


In [5]:
# Parse SATIS_TARIHI into pandas datetimes so the .dt accessor can be used on it.
df_clean = df_clean.assign(SATIS_TARIHI=pd.to_datetime(df_clean['SATIS_TARIHI']))

In [6]:
# Work on an independent copy so df_clean keeps its SATIS_TARIHI column.
df_date = df_clean.copy(deep=True)

In [7]:
# Derive the calendar year (YIL) and month (AY) of each sale from its date.
df_date = df_date.assign(
    YIL=df_clean['SATIS_TARIHI'].dt.year,
    AY=df_clean['SATIS_TARIHI'].dt.month,
)

In [8]:
# The full date is no longer needed once YIL and AY exist.
df_date = df_date.drop('SATIS_TARIHI', axis='columns')

In [9]:
# Keep only the rows whose first unit code (OB1) is 'KG'.
df_date_kg = df_date.loc[df_date['OB1'].eq('KG')]

In [10]:
# OB1 is constant ('KG') after the filter, so the column carries no information.
df_date_kg = df_date_kg.drop('OB1', axis='columns')

In [11]:
# Keep only the rows whose second unit code (OB2) is 'AD'.
df_date_kg_ad = df_date_kg.loc[df_date_kg['OB2'].eq('AD')]

In [12]:
# OB2 is constant ('AD') after the filter, so the column carries no information.
df_date_kg_ad = df_date_kg_ad.drop('OB2', axis='columns')

In [13]:
# Inspect the rows left after keeping OB1 == 'KG' and OB2 == 'AD'
# (both unit-code columns have already been dropped).
df_date_kg_ad

Unnamed: 0,STOK_NO,TOPLAM_BIRINCI_OB,TOPLAM_IKINCI_OB,YIL,AY
0,1743,303.75,7.0,2015,4
2,1737,1558.50,30.0,2015,4
4,1736,762.00,23.0,2015,4
6,1725,1623.15,32.0,2015,4
7,1753,301.80,9.0,2015,4
...,...,...,...,...,...
243843,754392,2062.80,40.0,2023,10
243847,2897,51.25,1.0,2023,10
243848,2966,249.85,5.0,2023,10
243849,2980,115.95,5.0,2023,10


In [14]:
# Aggregate the filtered sales to one row per (STOK_NO, YIL, AY):
# the mean of TOPLAM_IKINCI_OB becomes OB2_AYLIK_ORTALAMA and the mean of
# TOPLAM_BIRINCI_OB becomes OB1_AYLIK_ORTALAMA. Rows are ordered by YIL, then AY.
# NOTE(review): the order of rows within the same (YIL, AY) is not specified
# by the query, and the row order feeds the shuffled K-fold split used later.
query = """
SELECT STOK_NO,AY,YIL,AVG(TOPLAM_IKINCI_OB) as OB2_AYLIK_ORTALAMA,AVG(TOPLAM_BIRINCI_OB) as OB1_AYLIK_ORTALAMA        
FROM df_date_kg_ad
group by STOK_NO,YIL,AY
order by YIL,AY ASC
"""

# Run the query; locals() is passed so sqldf can find df_date_kg_ad by name.
df_month_kg_ad = sqldf(query, locals())
# Preview the first 10 aggregated rows.
df_month_kg_ad.head(10)

Unnamed: 0,STOK_NO,AY,YIL,OB2_AYLIK_ORTALAMA,OB1_AYLIK_ORTALAMA
0,1385,4,2015,3.5,75.35
1,1387,4,2015,1.0,25.4
2,1388,4,2015,1.0,24.5
3,1391,4,2015,13.0,418.8
4,1392,4,2015,1.333333,46.866667
5,1394,4,2015,1.0,21.5
6,1407,4,2015,15.0,344.6
7,1408,4,2015,10.0,229.8
8,1410,4,2015,10.0,287.5
9,1412,4,2015,20.0,628.6


In [15]:
# Only the OB1 monthly mean is kept; the OB2 mean is discarded here.
df_month_kg_ad = df_month_kg_ad.drop('OB2_AYLIK_ORTALAMA', axis='columns')

In [16]:
# Preview the last five aggregated rows.
df_month_kg_ad.tail(n=5)

Unnamed: 0,STOK_NO,AY,YIL,OB1_AYLIK_ORTALAMA
35362,758384,10,2023,35.6
35363,758723,10,2023,122.04375
35364,758739,10,2023,145.55
35365,758839,10,2023,570.466667
35366,759196,10,2023,12733.95


In [17]:
# Columns the model should treat as categories rather than as numbers.
categorical_columns = ['STOK_NO', 'AY']
df_month_kg_ad = df_month_kg_ad.astype(
    {name: 'category' for name in categorical_columns}
)

# OB1_AYLIK_ORTALAMA is the target; every other column is a feature.
y = df_month_kg_ad['OB1_AYLIK_ORTALAMA']
X = df_month_kg_ad.loc[:, df_month_kg_ad.columns != 'OB1_AYLIK_ORTALAMA']

In [18]:
# LightGBM training parameters: gradient-boosted decision trees with the
# 'regression' objective and a fixed random seed.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    random_state=42,
)

In [19]:
# 10-fold cross-validation of a LightGBM regressor on the monthly averages.
# Each fold trains on 9/10 of the rows and is scored with MSE on the rest.
# NOTE(review): shuffle=True mixes rows from all years and months across
# folds, so a model can be scored on months that precede part of its training
# data. For a forecasting use case a time-ordered split may be more
# representative — confirm before comparing scores.
mse_values = []  # MSE of each fold, in fold order
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Only a training Dataset is needed: lgb.train is called without any
    # validation sets, so the evaluation Dataset that used to be built here
    # (and was never passed anywhere) has been removed.
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_columns)

    # Fit the gradient-boosted regressor on this fold's training rows.
    lgb_trained = lgb.train(params, lgb_train)

    y_pred = lgb_trained.predict(X_test)

    # Performance metric for this fold: mean squared error on the held-out rows.
    mse = mean_squared_error(y_test, y_pred)
    print(mse)
    mse_values.append(mse)

# After cross-validation, average the per-fold MSE values.
average_mse = sum(mse_values) / len(mse_values)
print(f'Average Mean Squared Error: {average_mse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1324
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 416.062872


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


292827.4378232549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1337
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 416.353780


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


281883.37776754994
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1355
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 416.686999


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


238088.13797210116
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1336
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 414.211504


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


386416.93283590087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1332
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 417.789930


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


366544.21024019894
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1336
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 416.739857
311627.1530990619
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1341
[LightGBM] [Info] Number of data points in the train set: 31830, number of used features: 3
[LightGBM] [Info] Start training from score 418.894890


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


281687.35633957275
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1337
[LightGBM] [Info] Number of data points in the train set: 31831, number of used features: 3
[LightGBM] [Info] Start training from score 415.585547


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


297019.08376743103
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1340
[LightGBM] [Info] Number of data points in the train set: 31831, number of used features: 3
[LightGBM] [Info] Start training from score 416.899504


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


414014.61559581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1330
[LightGBM] [Info] Number of data points in the train set: 31831, number of used features: 3
[LightGBM] [Info] Start training from score 417.841296
233806.70366198465
Average Mean Squared Error: 310391.5009102867


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [21]:
# Compare actual and predicted values side by side.
# lgb_trained, X_test and y_test are the values left over from the final
# iteration of the cross-validation loop, so this shows the last fold only.
y_pred = lgb_trained.predict(X_test)
merged_array = np.column_stack((y_test, y_pred))
print("         y_test      ,      y_pred")
merged_array[:50]

         y_test      ,      y_pred


array([[ 628.6       ,  521.01194536],
       [2011.35      ,  429.23610681],
       [1318.75      ,  780.55163037],
       [ 743.2       ,  429.23610681],
       [ 335.15      ,  363.12651466],
       [ 303.83      ,  303.65469225],
       [ 259.5       ,  429.23610681],
       [ 507.7       ,  772.81715086],
       [ 351.05      ,  207.92916666],
       [ 513.2       ,  199.62081827],
       [  34.05      ,  105.11579892],
       [ 905.65      , 1285.14812911],
       [2107.3       ,  936.90358786],
       [  83.4       ,  519.68806448],
       [ 127.54545455,  211.28535945],
       [  15.6       ,  429.23610681],
       [ 125.3       ,  415.05591319],
       [  72.03333333,  101.18948099],
       [  33.4       ,   79.26901048],
       [  39.85      ,  134.52408606],
       [ 246.3375    ,  286.54690772],
       [ 110.25      ,  323.24518627],
       [ 211.35      ,   57.91424105],
       [ 224.4       ,  429.23610681],
       [ 205.75      ,  341.51039077],
       [  71.3       ,  1

In [None]:
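# Average MSE recorded per experiment (lower is better).
# basic_model_stok matches the 10-fold average printed by the cross-validation
# cell; the other entries are presumably from other runs — TODO confirm.
# basic_model_stok -> 310391
# basic_model      -> 151859
# stok_rolling_ob2 -> 280087
# stok_rolling     -> 280038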