In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from pandasql import sqldf
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_excel('C:\\Users\\osungar\\Desktop\projects\\sales_forecasting\\sales_project\\data\\satis_new.xlsx')


In [4]:
query = """
SELECT CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,OB1,TOPLAM_IKINCI_OB,OB2,SATIS_TARIHI                   
FROM df
"""

# Run the query
df_clean = sqldf(query, locals())
df_clean[:50]

Unnamed: 0,CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,OB1,TOPLAM_IKINCI_OB,OB2,SATIS_TARIHI
0,10636,1743,303.75,KG,7.0,AD,2015-04-01 00:00:00.000000
1,10657,743090,198.0,KG,1.564,B3,2015-04-01 00:00:00.000000
2,10636,1737,1558.5,KG,30.0,AD,2015-04-01 00:00:00.000000
3,10657,1979,268.0,KG,1.0,B3,2015-04-01 00:00:00.000000
4,10636,1736,762.0,KG,23.0,AD,2015-04-01 00:00:00.000000
5,10604,1988,132.0,KG,0.989,B3,2015-04-01 00:00:00.000000
6,10636,1725,1623.15,KG,32.0,AD,2015-04-01 00:00:00.000000
7,10636,1753,301.8,KG,9.0,AD,2015-04-01 00:00:00.000000
8,10604,1942,657.0,KG,4.954,B3,2015-04-01 00:00:00.000000
9,10636,1744,152.85,KG,3.0,AD,2015-04-01 00:00:00.000000


In [5]:
df_clean['SATIS_TARIHI'] = pd.to_datetime(df_clean['SATIS_TARIHI'])

In [6]:
df_date = df_clean.copy()

In [7]:
df_date['YIL'] = df_clean['SATIS_TARIHI'].dt.year
df_date['AY'] = df_clean['SATIS_TARIHI'].dt.month

In [8]:
df_date = df_date.drop(columns=['SATIS_TARIHI'])

In [9]:
df_date_kg = df_date[df_date['OB1']=='KG']

In [10]:
df_date_kg = df_date_kg.drop(columns=['OB1'])

In [11]:
df_date_kg.head()

Unnamed: 0,CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,TOPLAM_IKINCI_OB,OB2,YIL,AY
0,10636,1743,303.75,7.0,AD,2015,4
1,10657,743090,198.0,1.564,B3,2015,4
2,10636,1737,1558.5,30.0,AD,2015,4
3,10657,1979,268.0,1.0,B3,2015,4
4,10636,1736,762.0,23.0,AD,2015,4


In [12]:
df_date_kg_ad = df_date_kg[df_date_kg['OB2']=='AD']

In [13]:
df_date_kg_ad = df_date_kg_ad.drop(columns=['OB2'])

In [14]:
df_date_kg_ad

Unnamed: 0,CARI_NO,STOK_NO,TOPLAM_BIRINCI_OB,TOPLAM_IKINCI_OB,YIL,AY
0,10636,1743,303.75,7.0,2015,4
2,10636,1737,1558.50,30.0,2015,4
4,10636,1736,762.00,23.0,2015,4
6,10636,1725,1623.15,32.0,2015,4
7,10636,1753,301.80,9.0,2015,4
...,...,...,...,...,...,...
243843,10700,754392,2062.80,40.0,2023,10
243847,10847,2897,51.25,1.0,2023,10
243848,10847,2966,249.85,5.0,2023,10
243849,10847,2980,115.95,5.0,2023,10


In [15]:
query = """
SELECT CARI_NO,STOK_NO,AY,YIL,SUM(TOPLAM_BIRINCI_OB) as OB1_AYLIK_TOPLAM        
FROM df_date_kg_ad
group by CARI_NO,STOK_NO,YIL,AY
order by YIL,AY ASC
"""

# Run the query
df_month_kg_ad = sqldf(query, locals())
df_month_kg_ad.head(10)

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,OB1_AYLIK_TOPLAM
0,10443,4332,4,2015,870.85
1,10443,744535,4,2015,300.8
2,10563,3049,4,2015,61.2
3,10564,2594,4,2015,145.2
4,10564,2595,4,2015,103.9
5,10564,2602,4,2015,92.7
6,10564,2624,4,2015,98.8
7,10564,2626,4,2015,152.55
8,10564,2628,4,2015,218.85
9,10564,2663,4,2015,48.55


In [16]:
# Quarter sütunu oluştur
df_month_kg_ad['MEVSIM'] = pd.cut(df_month_kg_ad['AY'], bins=[0, 3, 6, 9, 12], labels=['q1', 'q2', 'q3', 'q4'])

# DataFrame'i görüntüle
df_month_kg_ad

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,OB1_AYLIK_TOPLAM,MEVSIM
0,10443,4332,4,2015,870.85,q2
1,10443,744535,4,2015,300.80,q2
2,10563,3049,4,2015,61.20,q2
3,10564,2594,4,2015,145.20,q2
4,10564,2595,4,2015,103.90,q2
...,...,...,...,...,...,...
70921,57210,2922,10,2023,229.45,q4
70922,57210,2924,10,2023,485.45,q4
70923,57210,756157,10,2023,39.85,q4
70924,57439,3089,10,2023,170.95,q4


In [17]:
categorical_columns=['CARI_NO', 'STOK_NO', 'AY', 'MEVSIM']
for col in categorical_columns:
    df_month_kg_ad[col] = df_month_kg_ad[col].astype('category')

In [18]:
df_month_kg_ad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70926 entries, 0 to 70925
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   CARI_NO           70926 non-null  category
 1   STOK_NO           70926 non-null  category
 2   AY                70926 non-null  category
 3   YIL               70926 non-null  int64   
 4   OB1_AYLIK_TOPLAM  70926 non-null  float64 
 5   MEVSIM            70926 non-null  category
dtypes: category(4), float64(1), int64(1)
memory usage: 1.6 MB


In [19]:
#OB1_AYLIK_ORTALAMA is a target
X = df_month_kg_ad.drop(columns=['OB1_AYLIK_TOPLAM'])
y = df_month_kg_ad['OB1_AYLIK_TOPLAM']

In [20]:
lgb_train = lgb.Dataset(X, y,categorical_feature=categorical_columns)


In [21]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'random_state':42
}

In [22]:
# Light Gradient Boosting Regressor
lgb_trained = lgb.train(params,
                lgb_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1543
[LightGBM] [Info] Number of data points in the train set: 70926, number of used features: 5
[LightGBM] [Info] Start training from score 569.982438


# PREDICTING 01/2024

In [23]:
df_prediction = X.copy()

In [24]:
df_prediction


Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,4,2015,q2
1,10443,744535,4,2015,q2
2,10563,3049,4,2015,q2
3,10564,2594,4,2015,q2
4,10564,2595,4,2015,q2
...,...,...,...,...,...
70921,57210,2922,10,2023,q4
70922,57210,2924,10,2023,q4
70923,57210,756157,10,2023,q4
70924,57439,3089,10,2023,q4


In [36]:
ay = 1
yil = 2024
mevsim = 'q1'

In [25]:
df_prediction['AY'] = ay
df_prediction['YIL'] = yil
df_prediction['MEVSIM'] = mvesim


In [26]:
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,1,2024,q1
1,10443,744535,1,2024,q1
2,10563,3049,1,2024,q1
3,10564,2594,1,2024,q1
4,10564,2595,1,2024,q1
...,...,...,...,...,...
70921,57210,2922,1,2024,q1
70922,57210,2924,1,2024,q1
70923,57210,756157,1,2024,q1
70924,57439,3089,1,2024,q1


In [27]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70926 entries, 0 to 70925
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CARI_NO  70926 non-null  category
 1   STOK_NO  70926 non-null  category
 2   AY       70926 non-null  int64   
 3   YIL      70926 non-null  int64   
 4   MEVSIM   70926 non-null  object  
dtypes: category(2), int64(2), object(1)
memory usage: 2.0+ MB


In [28]:
categorical_columns=['CARI_NO', 'STOK_NO', 'AY','MEVSIM']
for col in categorical_columns:
    df_prediction[col] = df_prediction[col].astype('category')

In [29]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70926 entries, 0 to 70925
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CARI_NO  70926 non-null  category
 1   STOK_NO  70926 non-null  category
 2   AY       70926 non-null  category
 3   YIL      70926 non-null  int64   
 4   MEVSIM   70926 non-null  category
dtypes: category(4), int64(1)
memory usage: 1.0 MB


In [30]:
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,1,2024,q1
1,10443,744535,1,2024,q1
2,10563,3049,1,2024,q1
3,10564,2594,1,2024,q1
4,10564,2595,1,2024,q1
...,...,...,...,...,...
70921,57210,2922,1,2024,q1
70922,57210,2924,1,2024,q1
70923,57210,756157,1,2024,q1
70924,57439,3089,1,2024,q1


In [31]:
df_prediction = df_prediction.drop_duplicates(subset=['CARI_NO', 'STOK_NO'])
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM
0,10443,4332,1,2024,q1
1,10443,744535,1,2024,q1
2,10563,3049,1,2024,q1
3,10564,2594,1,2024,q1
4,10564,2595,1,2024,q1
...,...,...,...,...,...
70906,56187,3057,1,2024,q1
70907,56187,3059,1,2024,q1
70908,56187,3061,1,2024,q1
70924,57439,3089,1,2024,q1


In [32]:
y_pred =lgb_trained.predict(df_prediction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].cat.set_categories(category)


In [33]:
df_prediction['TAHMIN'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction['TAHMIN'] = y_pred


In [34]:
df_prediction

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM,TAHMIN
0,10443,4332,1,2024,q1,760.610947
1,10443,744535,1,2024,q1,817.639173
2,10563,3049,1,2024,q1,53.164098
3,10564,2594,1,2024,q1,135.164315
4,10564,2595,1,2024,q1,99.000172
...,...,...,...,...,...,...
70906,56187,3057,1,2024,q1,183.193092
70907,56187,3059,1,2024,q1,222.443152
70908,56187,3061,1,2024,q1,211.449156
70924,57439,3089,1,2024,q1,192.560122


In [35]:
query = """
SELECT *
FROM df_prediction
order by TAHMIN DESC
"""

# Run the query
last = sqldf(query, locals())
last = last[last["TAHMIN"]>0]
last['TAHMIN'] = last['TAHMIN']*1.1
last

Unnamed: 0,CARI_NO,STOK_NO,AY,YIL,MEVSIM,TAHMIN
0,10660,744812,1,2024,q1,24794.314806
1,28195,744812,1,2024,q1,23809.913295
2,10599,757032,1,2024,q1,23547.580981
3,10660,745057,1,2024,q1,23325.648659
4,45068,1506,1,2024,q1,15798.556088
...,...,...,...,...,...,...
9486,49982,3045,1,2024,q1,1.146806
9487,10569,3037,1,2024,q1,0.622469
9488,53563,742349,1,2024,q1,0.550545
9489,10568,2678,1,2024,q1,0.108803


In [37]:
last.to_excel(f'C:\\Users\\osungar\\Desktop\projects\\sales_forecasting\\sales_project\\data\\predictions\\{yil}_{ay}_predictions.xlsx', index=False)
