In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os


In [6]:
# Load the data
data = pd.read_csv('data_no_outliers.csv')

In [7]:
data.head()

Unnamed: 0,Invoice_id,Cust_id,Txn_date,Product_Sub_category,Product_Category,Qty,Rate,Tax,Total_Amt,Store_type
0,25890929042,266783,2016-09-23,Women,Footwear,4,1321,554.82,5838.82,e-Shop
1,98477711300,266783,2017-10-21,Mens,Clothing,3,93,29.295,308.295,TeleShop
2,8410316370,266783,2018-02-20,Mens,Clothing,1,869,91.245,960.245,e-Shop
3,16999552161,266783,2018-09-02,Non-fiction,Books,2,835,175.35,1845.35,e-Shop
4,36310127403,266784,2017-04-12,Mobiles,Electronics,2,200,42.0,442.0,Brick and Mortar


In [8]:
# Convert 'Txn_date' to datetime
data['Txn_date'] = pd.to_datetime(data['Txn_date'])

In [28]:
# Creating dummy features for categorical types
data_with_dummies = pd.get_dummies(data, columns=['Product_Category'], drop_first=True)


In [10]:
# Standardize of numeric features with a scaler
numeric_features = data_with_dummies.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
data_with_dummies[numeric_features] = scaler.fit_transform(data_with_dummies[numeric_features])


In [11]:
data_with_dummies.head()

Unnamed: 0,Invoice_id,Cust_id,Txn_date,Product_Sub_category,Qty,Rate,Tax,Total_Amt,Store_type,Product_Category_Books,Product_Category_Clothing,Product_Category_Electronics,Product_Category_Footwear,Product_Category_Home and kitchen
0,-0.834206,-1.743252,2016-09-23,Women,0.714462,1.307519,1.713115,1.713115,e-Shop,0,0,0,1,0
1,1.670769,-1.743252,2017-10-21,Mens,0.008108,-1.665227,-1.18146,-1.18146,TeleShop,0,1,0,0,0
2,-1.437464,-1.743252,2018-02-20,Mens,-1.404599,0.213316,-0.840241,-0.840241,e-Shop,0,1,0,0,0
3,-1.141048,-1.743252,2018-09-02,Non-fiction,-0.698246,0.131009,-0.376994,-0.376994,e-Shop,1,0,0,0,0
4,-0.474639,-1.742841,2017-04-12,Mobiles,-0.698246,-1.406201,-1.111481,-1.111481,Brick and Mortar,0,0,1,0,0


In [12]:
# Spliting the data into training and testing subsets
# Training: 2016-2017, Testing: 2018
train_data = data_with_dummies[data_with_dummies['Txn_date'].dt.year < 2018]
test_data = data_with_dummies[data_with_dummies['Txn_date'].dt.year == 2018]

In [13]:
# Preparing data for forecasting with Facebook Prophet (separate models for each category)
# Keeping only necessary columns for forecasting ('Txn_date' and 'Total_Amt')
data_forecast = data[['Txn_date', 'Total_Amt', 'Product_Category']]

In [16]:
# Splitting the data for each category
unique_product_categories = data_forecast['Product_Category'].unique()
prophet_ready_datasets = {}

for category in unique_product_categories:
    # Filtering and format training data
    train_data_forecast = data_forecast[data_forecast['Txn_date'].dt.year < 2018]
    train_data_forecast = train_data_forecast[train_data_forecast['Product_Category'] == category][['Txn_date', 'Total_Amt']].rename(columns={'Txn_date': 'ds', 'Total_Amt': 'y'})
    
    # Filtering and format testing data
    test_data_forecast = data_forecast[data_forecast['Txn_date'].dt.year == 2018]
    test_data_forecast = test_data_forecast[test_data_forecast['Product_Category'] == category][['Txn_date', 'Total_Amt']].rename(columns={'Txn_date': 'ds', 'Total_Amt': 'y'})
    
    # Storing formatted data
    prophet_ready_datasets[category] = {'train': train_data_forecast, 'test': test_data_forecast}


In [17]:
# First few entries of the formatted train and test sets for the first category
prophet_ready_datasets[unique_product_categories[0]]['train'].head(), prophet_ready_datasets[unique_product_categories[0]]['test'].head()


(           ds         y
 0  2016-09-23  5838.820
 7  2016-03-15  6911.775
 8  2016-10-24  3135.990
 10 2017-01-07  3540.420
 14 2016-09-13  1927.120,
             ds         y
 13  2018-02-13  2276.300
 17  2018-12-02  1312.740
 46  2018-02-23  1201.135
 67  2018-09-08    81.770
 182 2018-05-19  5502.900)

In [25]:
# Directory path
directory_path = '~/Documents/GitHub/Capstone-II/store_project/pre_processing_data'

full_directory_path = os.path.expanduser(directory_path)

In [27]:
# File paths for saving
preprocessed_data_path = os.path.join(full_directory_path, 'preprocessed_data.csv')
train_data_path = os.path.join(full_directory_path, 'train_data.csv')
test_data_path = os.path.join(full_directory_path, 'test_data.csv')

# Saving DataFrames
data.to_csv(preprocessed_data_path, index=False)
train_data.to_csv(train_data_path, index=False)
test_data.to_csv(test_data_path, index=False)

print("Files saved at:", full_directory_path)

Files saved at: /Users/alexib/Documents/GitHub/Capstone-II/store_project/pre_processing_data
