In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from pandasql import sqldf
from sklearn.metrics import mean_squared_error

In [99]:
pd.set_option('display.max_columns', None)

In [100]:
df = pd.read_excel('C:\\Users\\osungar\\Desktop\projects\\sales_forecasting\\sales_project\\data\\satis_new.xlsx')


In [101]:
query = """
SELECT CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,OB1,TOPLAM_IKINCI_OB,OB2,SATIS_TARIHI                   
FROM df
"""

# Run the query
df_clean = sqldf(query, locals())
df_clean[:50]

Unnamed: 0,CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,OB1,TOPLAM_IKINCI_OB,OB2,SATIS_TARIHI
0,10636,1743,303.75,KG,7.0,AD,2015-04-01 00:00:00.000000
1,10657,743090,198.0,KG,1.564,B3,2015-04-01 00:00:00.000000
2,10636,1737,1558.5,KG,30.0,AD,2015-04-01 00:00:00.000000
3,10657,1979,268.0,KG,1.0,B3,2015-04-01 00:00:00.000000
4,10636,1736,762.0,KG,23.0,AD,2015-04-01 00:00:00.000000
5,10604,1988,132.0,KG,0.989,B3,2015-04-01 00:00:00.000000
6,10636,1725,1623.15,KG,32.0,AD,2015-04-01 00:00:00.000000
7,10636,1753,301.8,KG,9.0,AD,2015-04-01 00:00:00.000000
8,10604,1942,657.0,KG,4.954,B3,2015-04-01 00:00:00.000000
9,10636,1744,152.85,KG,3.0,AD,2015-04-01 00:00:00.000000


In [102]:
df_clean['SATIS_TARIHI'] = pd.to_datetime(df_clean['SATIS_TARIHI'])

In [103]:
df_date = df_clean.copy()

In [104]:
df_date['YIL'] = df_clean['SATIS_TARIHI'].dt.year
df_date['AY'] = df_clean['SATIS_TARIHI'].dt.month

In [105]:
df_date = df_date.drop(columns=['SATIS_TARIHI'])

In [106]:
df_date_kg = df_date[df_date['OB1']=='KG']

In [107]:
df_date_kg = df_date_kg.drop(columns=['OB1'])

In [108]:
df_date_kg.head()

Unnamed: 0,CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,TOPLAM_IKINCI_OB,OB2,YIL,AY
0,10636,1743,303.75,7.0,AD,2015,4
1,10657,743090,198.0,1.564,B3,2015,4
2,10636,1737,1558.5,30.0,AD,2015,4
3,10657,1979,268.0,1.0,B3,2015,4
4,10636,1736,762.0,23.0,AD,2015,4


In [109]:
df_date_kg_ad = df_date_kg[df_date_kg['OB2']=='AD']

In [110]:
df_date_kg_ad = df_date_kg_ad.drop(columns=['OB2'])

In [111]:
df_date_kg_ad

Unnamed: 0,CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,TOPLAM_IKINCI_OB,YIL,AY
0,10636,1743,303.75,7.0,2015,4
2,10636,1737,1558.50,30.0,2015,4
4,10636,1736,762.00,23.0,2015,4
6,10636,1725,1623.15,32.0,2015,4
7,10636,1753,301.80,9.0,2015,4
...,...,...,...,...,...,...
243843,10700,754392,2062.80,40.0,2023,10
243847,10847,2897,51.25,1.0,2023,10
243848,10847,2966,249.85,5.0,2023,10
243849,10847,2980,115.95,5.0,2023,10


In [112]:
query = """
SELECT CARI_NO,STOK_NO,AY,YIL,SUM(TOPLAM_BIRINCI_OB) as OB1_AYLIK_TOPLAM        
FROM df_date_kg_ad
group by CARI_NO,STOK_NO,YIL,AY
order by YIL,AY ASC
"""

# Run the query
df_month_kg_ad = sqldf(query, locals())
df_month_kg_ad.head(10)

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,OB1_AYLIK_TOPLAM
0,10443,4332,4,2015,870.85
1,10443,744535,4,2015,300.8
2,10563,3049,4,2015,61.2
3,10564,2594,4,2015,145.2
4,10564,2595,4,2015,103.9
5,10564,2602,4,2015,92.7
6,10564,2624,4,2015,98.8
7,10564,2626,4,2015,152.55
8,10564,2628,4,2015,218.85
9,10564,2663,4,2015,48.55


In [113]:
# Quarter sütunu oluştur
df_month_kg_ad['MEVSIM'] = pd.cut(df_month_kg_ad['AY'], bins=[0, 3, 6, 9, 12], labels=['q1', 'q2', 'q3', 'q4'])

# DataFrame'i görüntüle
df_month_kg_ad

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,OB1_AYLIK_TOPLAM,MEVSIM
0,10443,4332,4,2015,870.85,q2
1,10443,744535,4,2015,300.80,q2
2,10563,3049,4,2015,61.20,q2
3,10564,2594,4,2015,145.20,q2
4,10564,2595,4,2015,103.90,q2
...,...,...,...,...,...,...
70921,57210,2922,10,2023,229.45,q4
70922,57210,2924,10,2023,485.45,q4
70923,57210,756157,10,2023,39.85,q4
70924,57439,3089,10,2023,170.95,q4


In [114]:
categorical_columns=['CARI_NO', 'STOK_NO', 'AY', 'MEVSIM']
for col in categorical_columns:
    df_month_kg_ad[col] = df_month_kg_ad[col].astype('category')

In [115]:
df_month_kg_ad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70926 entries, 0 to 70925
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   CARI_NO           70926 non-null  category
 1   STOK_NO           70926 non-null  category
 2   AY                70926 non-null  category
 3   YIL               70926 non-null  int64   
 4   OB1_AYLIK_TOPLAM  70926 non-null  float64 
 5   MEVSIM            70926 non-null  category
dtypes: category(4), float64(1), int64(1)
memory usage: 1.6 MB


In [116]:
#OB1_AYLIK_ORTALAMA is a target
X = df_month_kg_ad.drop(columns=['OB1_AYLIK_TOPLAM'])
y = df_month_kg_ad['OB1_AYLIK_TOPLAM']

In [118]:
X

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,4,2015,q2
1,10443,744535,4,2015,q2
2,10563,3049,4,2015,q2
3,10564,2594,4,2015,q2
4,10564,2595,4,2015,q2
...,...,...,...,...,...
70921,57210,2922,10,2023,q4
70922,57210,2924,10,2023,q4
70923,57210,756157,10,2023,q4
70924,57439,3089,10,2023,q4


------

# Without split

In [134]:
lgb_train = lgb.Dataset(X, y,categorical_feature=categorical_columns)


In [135]:
# Light Gradient Boosting Regressor
lgb_trained = lgb.train(params,
                lgb_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1543
[LightGBM] [Info] Number of data points in the train set: 70926, number of used features: 5
[LightGBM] [Info] Start training from score 569.982438


-------


# With split

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [85]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56740 entries, 69753 to 68268
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CARI_NO  56740 non-null  category
 1   STOK_NO  56740 non-null  category
 2   AY       56740 non-null  category
 3   YIL      56740 non-null  int64   
 4   MEVSIM   56740 non-null  category
dtypes: category(4), int64(1)
memory usage: 1.3 MB


In [86]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train,categorical_feature=categorical_columns)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [87]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'random_state':42
}

In [88]:
# Light Gradient Boosting Regressor
lgb_trained = lgb.train(params,
                lgb_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1473
[LightGBM] [Info] Number of data points in the train set: 56740, number of used features: 5
[LightGBM] [Info] Start training from score 572.616448


In [89]:
y_pred =lgb_trained.predict(X_test)


In [90]:
print('MSE score on train data:')
print(mean_squared_error(y_test,y_pred))

MSE score on train data:
473863.1197568319


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [96]:
merged_array = np.stack((y_test, y_pred), axis=1)


In [97]:
print("         y_test      ,      y_pred")
merged_array[:70]

         y_test      ,      y_pred


array([[1.32410000e+03, 1.07518364e+03],
       [3.27150000e+02, 6.10509463e+02],
       [3.55050000e+02, 2.47200502e+02],
       [2.73100000e+02, 4.24939156e+02],
       [4.75000000e+00, 1.68535166e+02],
       [1.06115000e+03, 3.52383382e+03],
       [4.92500000e+01, 3.47137817e+02],
       [1.14000000e+02, 1.79574487e+02],
       [5.95000000e+01, 8.81844321e+02],
       [5.83850000e+02, 6.57994054e+02],
       [5.54500000e+01, 7.98182305e+01],
       [4.32765000e+03, 2.73964636e+03],
       [4.19050000e+02, 4.77556973e+02],
       [6.15000000e+02, 2.75095825e+02],
       [4.51950000e+02, 4.38651282e+02],
       [5.26000000e+01, 2.15049286e+02],
       [4.00500000e+01, 6.13914409e+01],
       [3.65150000e+02, 1.79441859e+02],
       [6.39750000e+02, 1.16537650e+03],
       [1.14435000e+03, 5.60053472e+02],
       [3.95000000e+02, 3.12448700e+02],
       [9.30000000e+01, 2.52802337e+02],
       [1.31325000e+03, 5.65023333e+02],
       [1.52500000e+02, 2.33494555e+02],
       [2.374000

----

# KFold 

In [47]:
from sklearn.model_selection import KFold

mse_values = []  # Her iterasyonda elde edilen MSE değerlerini saklamak için liste
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train,categorical_feature=categorical_columns)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    # LightGBM Regressor modelini oluştur
    # Light Gradient Boosting Regressor
    lgb_trained =lgb.train(params,
                lgb_train)

    y_pred =lgb_trained.predict(X_test)
    
    # Performans ölçümü (Örneğin, ortalama karesel hata)
    mse = mean_squared_error(y_test, y_pred)
    print(mse)
    mse_values.append(mse)

# K-Fold Cross Validation sonrasında MSE değerlerinin ortalamasını hesapla
average_mse = sum(mse_values) / len(mse_values)
print(f'Average Mean Squared Error: {average_mse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1512
[LightGBM] [Info] Number of data points in the train set: 63833, number of used features: 5
[LightGBM] [Info] Start training from score 569.877154


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


562940.5287056045
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 63833, number of used features: 5
[LightGBM] [Info] Start training from score 568.263710


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


603398.7749000008
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1509
[LightGBM] [Info] Number of data points in the train set: 63833, number of used features: 5
[LightGBM] [Info] Start training from score 567.407381


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


560837.8192229017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000357 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1506
[LightGBM] [Info] Number of data points in the train set: 63833, number of used features: 5
[LightGBM] [Info] Start training from score 570.936384


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


671050.7899889837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1528
[LightGBM] [Info] Number of data points in the train set: 63833, number of used features: 5
[LightGBM] [Info] Start training from score 569.713064


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


486106.01052017085
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 63833, number of used features: 5
[LightGBM] [Info] Start training from score 570.070994


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


717680.7193412329
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 63834, number of used features: 5
[LightGBM] [Info] Start training from score 569.144180


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


670731.0642917532
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1512
[LightGBM] [Info] Number of data points in the train set: 63834, number of used features: 5
[LightGBM] [Info] Start training from score 570.000731


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


526205.1243683873
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1515
[LightGBM] [Info] Number of data points in the train set: 63834, number of used features: 5
[LightGBM] [Info] Start training from score 572.050651


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


420374.4676675217
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1514
[LightGBM] [Info] Number of data points in the train set: 63834, number of used features: 5
[LightGBM] [Info] Start training from score 572.360071
459918.0492644436
Average Mean Squared Error: 567924.3348271002


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


# PREDICTING 11/2023


In [136]:
df_prediction = X.copy()

In [137]:
df_prediction


Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,4,2015,q2
1,10443,744535,4,2015,q2
2,10563,3049,4,2015,q2
3,10564,2594,4,2015,q2
4,10564,2595,4,2015,q2
...,...,...,...,...,...
70921,57210,2922,10,2023,q4
70922,57210,2924,10,2023,q4
70923,57210,756157,10,2023,q4
70924,57439,3089,10,2023,q4


In [138]:
df_prediction['AY'] = 11
df_prediction['YIL'] = 2023
df_prediction['MEVSIM'] = 'q4'


In [139]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70926 entries, 0 to 70925
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CARI_NO  70926 non-null  category
 1   STOK_NO  70926 non-null  category
 2   AY       70926 non-null  int64   
 3   YIL      70926 non-null  int64   
 4   MEVSIM   70926 non-null  object  
dtypes: category(2), int64(2), object(1)
memory usage: 2.0+ MB


In [140]:
categorical_columns=['CARI_NO', 'STOK_NO', 'AY','MEVSIM']
for col in categorical_columns:
    df_prediction[col] = df_prediction[col].astype('category')

In [141]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70926 entries, 0 to 70925
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CARI_NO  70926 non-null  category
 1   STOK_NO  70926 non-null  category
 2   AY       70926 non-null  category
 3   YIL      70926 non-null  int64   
 4   MEVSIM   70926 non-null  category
dtypes: category(4), int64(1)
memory usage: 1.0 MB


In [142]:
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,11,2023,q4
1,10443,744535,11,2023,q4
2,10563,3049,11,2023,q4
3,10564,2594,11,2023,q4
4,10564,2595,11,2023,q4
...,...,...,...,...,...
70921,57210,2922,11,2023,q4
70922,57210,2924,11,2023,q4
70923,57210,756157,11,2023,q4
70924,57439,3089,11,2023,q4


In [143]:
df_prediction = df_prediction.drop_duplicates(subset=['CARI_NO', 'STOK_NO'])
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,11,2023,q4
1,10443,744535,11,2023,q4
2,10563,3049,11,2023,q4
3,10564,2594,11,2023,q4
4,10564,2595,11,2023,q4
...,...,...,...,...,...
70906,56187,3057,11,2023,q4
70907,56187,3059,11,2023,q4
70908,56187,3061,11,2023,q4
70924,57439,3089,11,2023,q4


In [144]:
y_pred =lgb_trained.predict(df_prediction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].cat.set_categories(category)


In [145]:
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,11,2023,q4
1,10443,744535,11,2023,q4
2,10563,3049,11,2023,q4
3,10564,2594,11,2023,q4
4,10564,2595,11,2023,q4
...,...,...,...,...,...
70906,56187,3057,11,2023,q4
70907,56187,3059,11,2023,q4
70908,56187,3061,11,2023,q4
70924,57439,3089,11,2023,q4


In [146]:
df_prediction['TAHMIN'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction['TAHMIN'] = y_pred


In [147]:
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM,TAHMIN
0,10443,4332,11,2023,q4,737.455690
1,10443,744535,11,2023,q4,818.503973
2,10563,3049,11,2023,q4,69.410647
3,10564,2594,11,2023,q4,138.262650
4,10564,2595,11,2023,q4,160.465037
...,...,...,...,...,...,...
70906,56187,3057,11,2023,q4,188.812200
70907,56187,3059,11,2023,q4,226.361382
70908,56187,3061,11,2023,q4,217.559533
70924,57439,3089,11,2023,q4,209.297941


In [148]:
query = """
SELECT *
FROM df_prediction
order by TAHMIN DESC
"""

# Run the query
last = sqldf(query, locals())
last = last[last["TAHMIN"]>0]
last['TAHMIN'] = last['TAHMIN']*1.1
last

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM,TAHMIN
0,10660,744812,11,2023,q4,26311.949119
1,28195,744812,11,2023,q4,25318.438150
2,10660,745057,11,2023,q4,25065.467816
3,10599,757032,11,2023,q4,24633.230828
4,10651,1506,11,2023,q4,17550.817426
...,...,...,...,...,...,...
9496,11032,4501,11,2023,q4,2.878049
9497,10566,3037,11,2023,q4,2.830609
9498,10571,2829,11,2023,q4,0.936499
9499,10564,3049,11,2023,q4,0.513771


In [150]:
last.to_excel('C:\\Users\\osungar\\Desktop\projects\\sales_forecasting\\sales_project\\data\\predictions\\predictions.xlsx', index=False)


In [None]:
basic_model_stok -> 310391
basic_model      -> 151859
stok_rolling_ob2 -> 280087
stok_rolling     -> 280038
                    567924
                    344742.9263017369
