In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# First look at the raw training file using pandas' default dtypes.
train_csv_path = '/kaggle/input/store-sales-time-series-forecasting/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv',usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
                       dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float64',},parse_dates=['date'],infer_datetime_format=True)

In [None]:
train.head()

In [None]:
test=pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')

In [None]:
test.head()

In [None]:
# Load stores.csv; `stores` is not referenced by any later cell visible in this notebook.
stores=pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')

In [None]:
# Load oil.csv; `oil` is not referenced by any later cell visible in this notebook.
oil=pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')

In [None]:
# Load holidays_events.csv; its `date` column is used further down to build `is_holiday`.
holidays=pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')

In [None]:
# Load transactions.csv; `transactions` is not referenced by any later cell visible here.
transactions=pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
df=pd.concat([train,test])
df.drop(columns='id',inplace=True)


In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df['date']=pd.to_datetime(df['date'])

In [None]:
df.info()

In [None]:
categorical=['store_nbr','family']

In [None]:
for col in categorical:
    print(f"unique: {col}: {df[col].nunique()}")

In [None]:
#since date is in datetime format
df['year']=df['date'].dt.year
df['month']=df['date'].dt.month
df['day']=df['date'].dt.day
df['weekday']=df['date'].dt.dayofweek
df['weekend']=df['weekday'].isin([5,6]).astype(int)




In [None]:
df.head()

In [None]:
df.info()

In [None]:
plt.plot(df['date'],df['sales'],label='Sales')

In [None]:
plt.scatter(df['onpromotion'],df['sales'],alpha=0.5)

In [None]:
p=df.groupby('month')['sales'].mean()
p.plot(kind='bar')
plt.title('sales by month')

In [None]:
# Flag rows whose date appears in the holidays/events table.
# `holidays` was loaded without date parsing, so its `date` column holds strings. Matching a
# datetime column against strings with `isin` depends on implicit casting that newer pandas
# versions deprecate, so the dates are converted explicitly first.
holiday_dates = pd.to_datetime(holidays['date'])
df['is_holiday'] = df['date'].isin(holiday_dates).astype(int)

# Mean sales for non-holiday (0) versus holiday (1) rows; the spaced title doubles as bar labels.
q = df.groupby('is_holiday')['sales'].mean()
q.plot(kind='bar')
plt.title('Non holiday                Holiday')

In [None]:
for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(df[name].median())
        
for name in df.select_dtypes("object"):
        df[name] = df[name].fillna("None")
        
# Drop the original datetime column
df = df.drop('date', axis=1)

In [None]:
df=pd.get_dummies(df,columns=categorical, drop_first=True)

In [None]:
df

In [None]:
df.shape

In [None]:
# Split the combined frame back into its train and test parts by position.
# `pd.concat([train, test])` stacked train first and kept each frame's own row labels, so the
# labels of the test rows also occur among the train rows. A label-based `df.loc[...]` would
# therefore return rows from both frames; positional slicing cannot.
n_train = len(train)
train = df.iloc[:n_train]
# `drop` returns a new frame, so later column assignments on `test` do not touch `df`.
test = df.iloc[n_train:].drop(columns='sales')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_log_error

In [None]:
clean=train.dropna(subset=['sales']) #dropping missing 

In [None]:
features=clean.columns.difference(['date'])
X=clean[features].values
y=clean['sales']

In [None]:
scaler=StandardScaler()
X=scaler.fit_transform(X)

X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
X_train.shape

In [None]:
# An LSTM layer expects 3-D input, so every row becomes a sequence of length one:
# (rows, features) -> (rows, 1, features).
train_rows, train_width = X_train.shape
X_train = X_train.reshape(train_rows, 1, train_width)
valid_rows, valid_width = X_valid.shape
X_valid = X_valid.reshape(valid_rows, 1, valid_width)


In [None]:
# X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
# X_valid = X_valid.reshape((X_valid.shape[0], 1, X_valid.shape[1]))

In [None]:
from sklearn.preprocessing import MinMaxScaler


In [None]:

# Scale the target with a min-max scaler fitted on the training target only.
y_scaler = MinMaxScaler()

# y_train and y_valid are pandas Series here; the scaler needs a 2-D column vector.
train_target_column = y_train.to_numpy().reshape(-1, 1)
valid_target_column = y_valid.to_numpy().reshape(-1, 1)
y_train = y_scaler.fit_transform(train_target_column)
y_valid = y_scaler.transform(valid_target_column)


In [None]:
model=Sequential()
model.add(LSTM(50,input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dense(1))
model.compile(loss='mean_squared_error',optimizer=Adam(lr=0.001))


In [None]:
model.fit(X_train,y_train,epochs=15,batch_size=512,validation_data=(X_valid,y_valid),verbose=1)

In [None]:
preds=model.predict(X_valid)

In [None]:
min_max_scaler = MinMaxScaler(feature_range=(0, 196)) # Using sales 25% to 75% for results 
preds = min_max_scaler.fit_transform(preds)
preds

In [None]:
preds_flat = preds.flatten()
y_valid_flat = y_valid.flatten()

# Evaluate the LSTM model using RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_valid_flat,preds_flat))
print("LSTM RMSLE:", rmsle)

In [None]:
# Placeholder target column so that `test[features]` stays valid if `features` lists 'sales'.
test = test.assign(sales=0)

# Select the model features from the test frame and normalise them with the scaler
# that was fitted for the training data.
X_test = scaler.transform(test[features].to_numpy())



In [None]:
# Reshape data for LSTM
# Assuming 'sequence_length' is the length of your sequences
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)



In [None]:
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))


In [None]:
# Make predictions on the test set
predictions = model.predict(X_test)


In [None]:
predictions

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Convert the network output back to sales units with `y_scaler`, the scaler that was used
# on the training target. Re-fitting a new MinMaxScaler on the predictions would stretch
# them to an arbitrary range, and `np.exp(np.log1p(x))` simply adds 1 to every value.
predictions_scaled = y_scaler.inverse_transform(predictions)
# Clip negative outputs to zero.
predictions_scaled = np.clip(predictions_scaled, 0, None)
predictions_scaled

In [None]:
# Build the submission: one row per test row label, keeping the last prediction for any
# repeated label, shifting labels by 3000888 to form ids (presumably the number of training
# rows — TODO confirm), and averaging per id.
output = (
    pd.DataFrame({'id': test.index, 'sales': predictions_scaled.flatten()})
    .drop_duplicates(subset=['id'], keep='last')
    .assign(id=lambda frame: frame['id'] + 3000888)
    .groupby('id')['sales']
    .mean()
    .reset_index()
)
output.to_csv('submission.csv', index=False)
output