In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.Model Building and Data Analysis
# 1.Model Oluşturma ve Veri Analizi

#### Import Necessary Libraries
#### Gerekli Kütüphaneleri Yükle

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
train_df = pd.read_csv('../input/store-sales-time-series-forecasting/train.csv')
train_df

In [None]:
train_df.shape

In [None]:
train_df['date'] = pd.to_datetime(train_df['date'])

#### Veriyi daha iyi anlamak için çeşitli görselleştirmeler yap
#### Make various visualizations to better understand the data

In [None]:
sns.histplot(np.log1p(train_df['sales']), kde=True)

#### Monthly sales by store_nbr
#### store_nbr'ye göre aylık satışlar

In [None]:
# Sum sales per (date, store_nbr), move store_nbr into the columns,
# then aggregate the daily rows into month-end bins.
daily_sales_by_store = (
    train_df
    .groupby(['date', 'store_nbr'])
    .agg({'sales': 'sum'})
    .unstack()
)
store_nbr_sales_per_month = daily_sales_by_store.resample('1M').sum()

# Plot one line per store, with the legend anchored below the axes.
store_nbr_sales_per_month.plot(figsize=(21, 5))
plt.legend(bbox_to_anchor=(0.9, -0.2), ncol=9)
plt.title("Depo numarasına göre aylık satışlar")


#### Monthly sales by family
#### Aileye göre aylık satışlar

In [None]:
# Sum sales per (date, family), move family into the columns,
# then aggregate the daily rows into month-end bins.
daily_sales_by_family = (
    train_df
    .groupby(['date', 'family'])
    .agg({'sales': 'sum'})
    .unstack()
)
family_sales_per_month = daily_sales_by_family.resample('1M').sum()

# Plot one line per product family, with the legend anchored below the axes.
family_sales_per_month.plot(figsize=(21, 5))
plt.legend(bbox_to_anchor=(1, -.2), ncol=6)
plt.title("Ürün ailesine göre aylık satışlar")


In [None]:
sns.histplot(data=train_df, x='onpromotion')

In [None]:
# Read the store metadata table.
stores_df = pd.read_csv('../input/store-sales-time-series-forecasting/stores.csv')

# Overwrite the sales column of train_df with log(1 + sales).
# NOTE(review): the column is replaced in place, so re-running this cell
# applies the transform a second time.
train_df['sales'] = train_df['sales'].pipe(np.log1p)

# Show the first rows of the store metadata.
stores_df.head()


In [None]:
stores_df.shape

In [None]:
transactions_df = pd.read_csv('../input/store-sales-time-series-forecasting/transactions.csv')
transactions_df.head()

In [None]:
transactions_df['date'] = pd.to_datetime(transactions_df['date'])
transactions_df.shape


#### Monthly transactions by store_nbr
#### Store_nbrye göre aylık işlemler

In [None]:
# Sum transactions per (date, store_nbr), move store_nbr into the columns,
# then aggregate the daily rows into month-end bins.
daily_transactions_by_store = (
    transactions_df
    .groupby(['date', 'store_nbr'])
    .agg({'transactions': 'sum'})
    .unstack()
)
store_nbr_transactions_per_month = daily_transactions_by_store.resample('1M').sum()

# Plot one line per store, with the legend anchored below the axes.
store_nbr_transactions_per_month.plot(figsize=(21, 5))
plt.legend(bbox_to_anchor=(0.9, -0.2), ncol=7)
plt.title('Depo numarasına göre aylık işlemler')

In [None]:
test_df = pd.read_csv('../input/store-sales-time-series-forecasting/test.csv')
test_df.head()

In [None]:
test_id = test_df['id']
test_df['date'] = pd.to_datetime(test_df['date'])

In [None]:
test_df.shape

In [None]:
holidays_df = pd.read_csv('../input/store-sales-time-series-forecasting/holidays_events.csv')
holidays_df.head()

In [None]:
holidays_df['date'] = pd.to_datetime(holidays_df['date'])

In [None]:
holidays_df.shape

In [None]:
oil_df = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv')
oil_df.head()

In [None]:
# Fill missing oil prices with the next available observation (backward fill).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a selected column is a chained in-place update that does not modify
# oil_df under pandas Copy-on-Write, and fillna(method=...) is deprecated in
# favour of .bfill().
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].bfill()

# Convert the 'date' column to datetime.
oil_df['date'] = pd.to_datetime(oil_df['date'])

# Display the shape of oil_df.
oil_df.shape

In [None]:
sns.lineplot(data=oil_df, x='date', y='dcoilwtico')
plt.title('Petrol fiyatı')

#### Özellik mühendisliği için yardımcı fonksiyonlar
#### Helper functions for feature engineering

In [None]:
def lag_features(df, lags):
    """Add one lagged-sales column per entry of ``lags`` to ``df``.

    For every ``lag`` a column named ``sales_t-<lag>`` is created that holds
    the ``sales`` value found ``lag`` rows earlier within the same
    ("store_nbr", "family") group. Rows without such an earlier row get NaN.
    The shift is positional, so rows are presumably expected to be in
    chronological order within each group (TODO confirm against the caller).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "store_nbr", "family" and "sales" columns. It is modified
        in place.
    lags : iterable of int
        Row offsets to shift by.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    # Group once and use the vectorised GroupBy.shift; it yields the same
    # values as transform(lambda x: x.shift(lag)) without a Python call per group.
    grouped_sales = df.groupby(["store_nbr", "family"])["sales"]
    for lag in lags:
        df[f"sales_t-{lag}"] = grouped_sales.shift(lag)
    return df


In [None]:
def fill_na(df):
    """Fill missing values in the holiday and oil-price columns of ``df``.

    * ``holiday_type`` and ``locale``: NaN -> ``'Common'``
    * ``description``: NaN -> ``'Unknown'``
    * ``transferred``: NaN -> ``False``
    * ``dcoilwtico``: NaN -> the next non-missing value further down the
      column (backward fill). Trailing NaNs with no later value stay NaN.

    The columns are overwritten on ``df`` itself, which is also returned.
    """
    placeholders = {
        'holiday_type': 'Common',
        'locale': 'Common',
        'description': 'Unknown',
        'transferred': False,
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)
    # .bfill() replaces the deprecated fillna(method='backfill') spelling.
    df['dcoilwtico'] = df['dcoilwtico'].bfill()
    return df


In [None]:
def merge_data(df):
    """Left-join the store, transaction, holiday and oil tables onto ``df``.

    Reads the module-level frames ``stores_df``, ``transactions_df``,
    ``holidays_df`` and ``oil_df``. Steps, in order:

    1. merge ``stores_df`` on "store_nbr", then rename any "type" column to
       "store_type" (presumably the stores table's type column);
    2. merge ``transactions_df`` on ("date", "store_nbr");
    3. merge ``holidays_df`` on "date", keep only the first row for each
       "id", then rename any "type" column to "holiday_type";
    4. merge ``oil_df`` on "date".

    Returns the merged frame; ``df`` itself is not modified.
    """
    with_stores = df.merge(stores_df, on="store_nbr", how="left")
    with_stores = with_stores.rename(columns={"type": "store_type"})

    with_transactions = with_stores.merge(
        transactions_df, on=["date", "store_nbr"], how="left"
    )

    with_holidays = with_transactions.merge(holidays_df, on="date", how="left")
    # The holiday join can yield several rows for one id; keep the first.
    with_holidays = with_holidays.drop_duplicates(subset="id")
    with_holidays = with_holidays.rename(columns={"type": "holiday_type"})

    return with_holidays.merge(oil_df, on="date", how="left")


In [None]:
def create_date_features(df):
    """Add calendar columns derived from the datetime column ``df['date']``.

    Columns added, in this order: ``month``, ``day_of_month``,
    ``day_of_year``, ``week_of_year`` (ISO week number), ``day_of_week``
    and ``year``. ``df`` is modified in place and returned.
    """
    when = df['date'].dt
    derived = {
        'month': when.month,
        'day_of_month': when.day,
        'day_of_year': when.dayofyear,
        'week_of_year': when.isocalendar().week,
        'day_of_week': when.dayofweek,
        'year': when.year,
    }
    for name, values in derived.items():
        df[name] = values
    return df


In [None]:
def random_noise(dataframe):
    """Return a 1-D array with one normal draw (mean 0, std 2.0) per row of ``dataframe``.

    The draws come from NumPy's global random generator.
    """
    n_rows = len(dataframe)
    return np.random.normal(loc=0.0, scale=2.0, size=n_rows)

In [None]:
def roll_mean_features(dataframe, windows, seed=None):
    """Add noisy triangular-window rolling means of ``sales`` to ``dataframe``.

    For every ``window`` a column ``sales_roll_mean_<window>`` is created:
    within each ("store_nbr", "family") group, ``sales`` is shifted by 16
    rows, averaged over a triangular-weighted rolling window of ``window``
    rows (at least 7 observations required), and standard-normal noise is
    added to the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with "store_nbr", "family" and "sales" columns. It is modified
        in place.
    windows : iterable of int
        Rolling window lengths.
    seed : int, optional
        When given, the noise is drawn from a private ``RandomState(seed)``
        so repeated calls give identical features. When ``None`` (the
        default) NumPy's global generator is used, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the new columns added.
    """
    # The np.random module and a RandomState both expose .normal(size=...).
    rng = np.random if seed is None else np.random.RandomState(seed)

    def add_noise(x):
        return x + rng.normal(size=len(x))

    group_keys = ["store_nbr", "family"]
    for window in windows:
        column = 'sales_roll_mean_' + str(window)
        dataframe[column] = dataframe.groupby(group_keys)['sales'].transform(
            lambda x: x.shift(16).rolling(window=window, min_periods=7, win_type="triang").mean())
        # Noise is drawn group by group, in the same order as before.
        dataframe[column] = dataframe.groupby(group_keys)[column].transform(add_noise)
    return dataframe


In [None]:
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted means of lagged ``sales`` to ``dataframe``.

    For every (alpha, lag) pair a column named
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is created: within
    each ("store_nbr", "family") group, ``sales`` is shifted by ``lag`` rows
    and smoothed with ``ewm(alpha=alpha, min_periods=1).mean()``.

    ``dataframe`` is modified in place and returned.
    """
    group_keys = ["store_nbr", "family"]
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            # Bind the current lag/alpha as defaults so the helper is self-contained.
            def _shifted_ewm(series, _lag=lag, _alpha=alpha):
                return series.shift(_lag).ewm(alpha=_alpha, min_periods=1).mean()

            column = 'sales_ewm_alpha_' + alpha_tag + '_lag_' + str(lag)
            dataframe[column] = dataframe.groupby(group_keys)['sales'].transform(_shifted_ewm)
    return dataframe


In [None]:
col = [
    'date',
    'store_nbr', 
    'family', 
    'sales', 
    'onpromotion', 
    'cluster', 
    'holiday_type', 
    'locale', 
    'description', 
    'transferred',
    'dcoilwtico'
]

In [None]:
# Stack the train and test rows into one frame and join the auxiliary tables.
all_df = pd.concat([train_df, test_df], axis=0)
all_df = merge_data(all_df)
all_df = fill_na(all_df)

# Narrow to the modelling columns BEFORE engineering features. Selecting
# `all_df[col]` as the last step discards every date, lag and rolling-mean
# column computed by the calls below, because `col` lists only base columns.
# The helpers only need 'date', 'store_nbr', 'family' and 'sales', all of
# which are in `col`. The copy makes the later in-place column additions
# operate on an independent frame.
all_df = all_df[col].copy()

all_df = create_date_features(all_df)
all_df = lag_features(all_df, lags=[*range(1, 16), 16, 17, 18, 19, 20, 21, 22, 30, 31, 90, 180, 364])
all_df = roll_mean_features(all_df, [16, 17, 18, 30])


In [None]:
# Smoothing factors and row shifts for the exponentially weighted sales features.
alphas = [0.95, 0.9, 0.8, 0.5]
lags = [1, 7, 30]
all_df = ewm_features(dataframe=all_df, alphas=alphas, lags=lags)

In [None]:
# Store the identifier and holiday-text columns as pandas categoricals.
# Each column is converted exactly once; repeating the cast for 'store_nbr'
# and 'family' has no effect.
categorical_columns = ['store_nbr', 'family', 'cluster', 'holiday_type', 'locale', 'description']
for column in categorical_columns:
    all_df[column] = all_df[column].astype('category')

In [None]:
all_df

In [None]:
# Keep only rows dated after 2013-12-31.
all_df = all_df[all_df['date'] > '2013-12-31']

# Split at 2017-08-15: rows up to and including that date go to X, later
# rows to X_test. The date column is dropped from both.
up_to_cutoff = all_df['date'] <= '2017-08-15'
after_cutoff = all_df['date'] > '2017-08-15'
X = all_df.loc[up_to_cutoff].drop(columns='date')
X_test = all_df.loc[after_cutoff].drop(columns='date')


In [None]:
# Correlate only the numeric/boolean columns. X also holds categorical and
# string columns, and since pandas 2.0 DataFrame.corr() no longer drops
# those silently; it raises when it cannot convert them to float.
corr = X.select_dtypes(include=['number', 'bool']).corr()

# Correlation of every remaining column with the target, strongest first.
corr['sales'].sort_values(ascending=False)

In [None]:
# Separate the target column from the feature matrices.
# NOTE(review): X is overwritten, so re-running this cell raises KeyError.
X, Y = X.drop(columns=['sales']), X['sales']
X_test = X_test.drop(columns=['sales'])


In [None]:
X.shape, Y.shape, X_test.shape

In [None]:
#!pip install lightgbm
#!pip install catboost

# Model Building And Data Analysis 
# Model Oluşturma ve Veri Analizi

## Import Necessary Libraries
## Gerekli Kütüphaneleri Yükle

In [None]:
import numpy as np
import pandas as pd
from random import random
import os

from sklearn.metrics import mean_squared_log_error as msle
from dateutil.relativedelta import relativedelta
import statsmodels.api as sm

from matplotlib import pyplot as plt, style
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names; use whichever name is available.
style.use('seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in style.available else 'seaborn-darkgrid')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
from tqdm import tqdm

from pandas_profiling import ProfileReport

import gc
gc.enable()
from warnings import filterwarnings, simplefilter
filterwarnings('ignore')
simplefilter('ignore')

In [None]:
# Read the test set with 'id' as the index and 'date' parsed as datetime.
# infer_datetime_format is not passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
test = pd.read_csv('../input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'], index_col=['id'])

In [None]:
test['date'] = test.date.dt.to_period('D')

In [None]:
df_pred = test.copy()

#### Loop over the submission files and load each one
#### Fonksiyon ile dosyaları hızlı şekilde dön

In [None]:
# Load every submission CSV as a 'sub_<tag>' column of df_pred. <tag> is the
# text between the first underscore and the following dot of the file name.
# Column assignment aligns each file's 'sales' values on the 'id' index.
for dirname, _, filenames in os.walk('/kaggle/input/store-sales-submissions'):
    # Sorted so the column order does not depend on directory listing order.
    for filename in sorted(filenames):
        path = os.path.join(dirname, filename)
        print(path)
        name_parts = filename.split('_')
        if not filename.endswith('.csv') or len(name_parts) < 2:
            # Not a '<prefix>_<tag>.csv' file; indexing name_parts[1] would
            # raise IndexError, or a non-CSV file would be parsed as CSV.
            continue
        df_pred['sub_' + name_parts[1].split('.')[0]] = pd.read_csv(path, index_col=['id']).sales


In [None]:
df_pred.head()

#### Pandas Profil ile kolayca verileri incele 
#### Examine the data easily with pandas-profiling

In [None]:
test_profile = ProfileReport(test, title="test_profile")
test_profile.to_notebook_iframe()

In [None]:
# Re-read the test set with 'id' as the index and 'date' parsed as datetime.
# infer_datetime_format is not passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
test = pd.read_csv('../input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'], index_col=['id'])
# Represent each date as a daily Period.
test['date'] = test.date.dt.to_period('D')

# Working copy that the submission columns are attached to.
df_pred = test.copy()

In [None]:
# Load every submission CSV as a 'sub_<tag>' column of df_pred. <tag> is the
# text between the first underscore and the following dot of the file name.
# Column assignment aligns each file's 'sales' values on the 'id' index.
for dirname, _, filenames in os.walk('/kaggle/input/store-sales-submissions'):
    # Sorted so the column order does not depend on directory listing order.
    for filename in sorted(filenames):
        path = os.path.join(dirname, filename)
        print(path)
        name_parts = filename.split('_')
        if not filename.endswith('.csv') or len(name_parts) < 2:
            # Not a '<prefix>_<tag>.csv' file; indexing name_parts[1] would
            # raise IndexError, or a non-CSV file would be parsed as CSV.
            continue
        df_pred['sub_' + name_parts[1].split('.')[0]] = pd.read_csv(path, index_col=['id']).sales

df_pred.head()

#### all submissions are highly correlated
#### tüm gönderimler yüksek oranda ilişkilidir

In [None]:
# Mean prediction per product family for every submission column.
# numeric_only=True leaves out the Period-typed 'date' column, which has no
# mean; pandas >= 2.0 raises on it instead of silently dropping it.
res = df_pred.drop(['store_nbr','onpromotion'], axis=1).groupby('family').mean(numeric_only=True)

# Spearman rank correlation between submissions. The upper triangle is masked
# so each pair is shown once.
res_corr = res.corr(method="spearman")
sns.heatmap(res_corr, annot=True, fmt='.1f', cmap='coolwarm', square=True, mask=np.triu(res_corr), linewidths=1, cbar=False)

#### Upgini only notebooks
#### Sadece Upgini not defterleri

In [None]:
pred_cols = ['sub_39331','sub_39359','sub_39274','sub_39369','sub_39347','sub_39263','sub_39281','sub_39261','sub_39262','sub_39266']

In [None]:
def func(dt):
    """Score every column in ``pred_cols`` against ``'sub_38558'`` for the rows of ``dt``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the ``pred_cols`` columns and a ``'sub_38558'`` column.

    Returns
    -------
    pandas.Series
        Float series indexed by the names in ``pred_cols``. Each value is
        ``sqrt(sqrt(msle(dt[col], dt['sub_38558'])))``.
    """
    # NOTE(review): the square root is applied twice, giving the fourth root
    # of MSLE rather than RMSLE. The ranking of columns is unaffected; confirm
    # whether this is intended.
    scores = {
        col: np.sqrt(np.sqrt(msle(dt[col], dt['sub_38558'])))
        for col in pred_cols
    }
    # Build the Series in one step with an explicit float dtype rather than
    # growing an empty pd.Series(), whose default dtype changed from float64
    # to object in pandas 2.0.
    return pd.Series(scores, dtype=float)

#### "min" is the name of the column whose predicted values are closest to the best Darts submission, "38558"
####  "min" - en iyi Dart "38558" gönderimine en yakın tahmin edilen değerlere sahip sütun adıdır

In [None]:
# One row per family and one score column per entry of pred_cols, plus a
# 'min' column naming the submission with the smallest score in that row.
family_scores = df_pred.groupby('family').apply(func)
res = family_scores.assign(min=family_scores.idxmin(axis=1))
res

In [None]:
res['min'].value_counts()

In [None]:
res = df_pred.groupby('date')[['sub_38415','sub_39331','sub_39359','sub_38558']].mean()
res.plot(figsize=(20,10))

#### stack the closest 3 submissions ['sub_39331','sub_39359','sub_38558'] - has the best score!
#### en yakın 3 gönderimi bir araya getirin ['sub_39331','sub_39359','sub_38558'] - en iyi skora sahip!

In [None]:
# Row-wise mean of the three selected submissions, written with 'id' as a regular column.
blend = df_pred[['sub_39331','sub_39359','sub_38558']].mean(axis=1)
sub = blend.rename('sales').to_frame().reset_index()
sub.to_csv('submission_38558.csv', index=False)

#### en yakın 3 gönderimi yığın ['sub_38415','sub_39331','sub_39359'] - diğer 2 yakın çözümle önceki en iyi skor
#### stack the closest 3 submissions ['sub_38415','sub_39331','sub_39359'] - the previous best score with 2 other closest solutions

In [None]:
# Row-wise mean of the three selected submissions, written with 'id' as a regular column.
blend = df_pred[['sub_38415','sub_39331','sub_39359']].mean(axis=1)
sub = blend.rename('sales').to_frame().reset_index()
sub.to_csv('submission_38415.csv', index=False)

#### en iyi 2 gönderimi istifleyin 
#### stack the 2 best submissions


In [None]:
# Row-wise mean of the two selected submissions, written with 'id' as a regular column.
blend = df_pred[['sub_38415','sub_38558']].mean(axis=1)
sub = blend.rename('sales').to_frame().reset_index()
sub.to_csv('submission.csv', index=False)

In [None]:
# Every column in pred_cols plus the 'sub_38558' column.
sub_cols = pred_cols + ['sub_38558']

# Row-wise mean over those columns, written with 'id' as a regular column.
blend = df_pred[sub_cols].mean(axis=1)
sub = blend.rename('sales').to_frame().reset_index()
sub.to_csv('submission_mean.csv', index=False)

#### mevcut tüm gönderimleri istifleyin
#### stack all existing submissions 


In [None]:
# Row-wise mean over the pred_cols columns only, written with 'id' as a regular column.
blend = df_pred[pred_cols].mean(axis=1)
sub = blend.rename('sales').to_frame().reset_index()
sub.to_csv('submission2_pred.csv', index=False)