In [39]:
import itertools

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss

In [40]:
# Read test.csv (path is relative to the working directory) into a DataFrame.
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [41]:
# Show the first five rows of the frame as loaded.
test_df.head(n=5)

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [42]:
# Parse 'date' into datetimes so it can be binned by calendar frequency.
test_df['date'] = pd.to_datetime(test_df['date'])

# Grouping keys: one group per store and product family, split into weekly
# ('W') bins along the 'date' index level.
weekly_keys = [
    pd.Grouper(level='store_nbr'),
    pd.Grouper(level='family'),
    pd.Grouper(level='date', freq='W'),
]

# Move the three key columns into a MultiIndex, then average 'onpromotion'
# inside every (store, family, week) group. Only 'onpromotion' is selected,
# so the other columns are not carried into the result; reset_index turns
# the group keys back into ordinary columns.
indexed = test_df.set_index(['store_nbr', 'family', 'date'])
test_df = indexed['onpromotion'].groupby(weekly_keys).mean().reset_index()


In [44]:
# Report the smallest and largest weekly-mean 'onpromotion' value.
promo = test_df['onpromotion']
onpromotion_min, onpromotion_max = promo.min(), promo.max()

print(f"Range of onpromotion values: [{onpromotion_min}, {onpromotion_max}]")

Range of onpromotion values: [0.0, 206.25]


In [45]:
# Show the first five rows after the weekly aggregation.
test_df.head(n=5)

Unnamed: 0,store_nbr,family,date,onpromotion
0,1,AUTOMOTIVE,2017-08-20,0.0
1,1,AUTOMOTIVE,2017-08-27,0.0
2,1,AUTOMOTIVE,2017-09-03,0.0
3,1,BABY CARE,2017-08-20,0.0
4,1,BABY CARE,2017-08-27,0.0


In [46]:
# Rename 'family' to 'product_category' before encoding.
test_df = test_df.rename(columns={'family': 'product_category'})

# Replace the two categorical columns with integer codes. The codes are
# derived from the values present in this frame, because the encoder is
# fitted here. The same encoder object is re-fitted for each column, so once
# this cell has run it only holds the 'product_category' mapping.
encoder = LabelEncoder()
test_df['store_nbr'] = encoder.fit_transform(test_df['store_nbr'])
test_df['product_category'] = encoder.fit_transform(test_df['product_category'])




In [47]:
# Show the first five rows after the rename and label encoding.
test_df.head(n=5)

Unnamed: 0,store_nbr,product_category,date,onpromotion
0,0,0,2017-08-20,0.0
1,0,0,2017-08-27,0.0
2,0,0,2017-09-03,0.0
3,0,1,2017-08-20,0.0
4,0,1,2017-08-27,0.0


In [49]:
# Split the weekly 'date' into separate year / month / day columns, then
# drop 'date' itself. Only 'date' is dropped by this cell.
week_dates = pd.to_datetime(test_df['date'])
test_df = test_df.assign(
    year=week_dates.dt.year,
    month=week_dates.dt.month,
    day=week_dates.dt.day,
).drop(columns=['date'])

In [50]:
# Standardise the numeric feature columns with a StandardScaler that is
# fitted on this same frame.
# NOTE(review): the scaling statistics come from test_df itself. If these
# features feed a model trained on data scaled with different statistics, the
# values will presumably not be comparable - confirm against the training
# pipeline.
# NOTE(review): the preview after this cell shows 'year' as 0.0 in every
# displayed row, which looks like a single-valued column - confirm it is
# still a useful feature.
numerical_cols = ['onpromotion', 'year', 'month', 'day']

scaler = StandardScaler()
scaler.fit(test_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

In [51]:
# Show the first five rows after scaling.
test_df.head(n=5)

Unnamed: 0,store_nbr,product_category,onpromotion,year,month,day
0,0,0,-0.505838,0.0,-0.707107,0.330771
1,0,0,-0.505838,0.0,-0.707107,1.025389
2,0,0,-0.505838,0.0,1.414214,-1.356159
3,0,1,-0.505838,0.0,-0.707107,0.330771
4,0,1,-0.505838,0.0,-0.707107,1.025389


In [52]:
# Write the processed frame to test_new.csv, leaving out the row index.
test_df.to_csv(path_or_buf='test_new.csv', index=False)
