# Get the data

In [1]:
import numpy as np
import pandas as pd
import psycopg2
from psycopg2 import Error
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
try:
    connection = psycopg2.connect(user="lvf7916",
                                  password="xxx",
                                  host="pg.analytics.northwestern.edu",
                                  port="5432",
                                  database="everything2023")

    cursor = connection.cursor()
    cursor.execute("SELECT version();")
    record = cursor.fetchone()

except (Exception, Error) as error:
    print("Error while connecting to PostgreSQL", error)

In [2]:
tx = pd.read_sql("SELECT * FROM group_5.trnsact;", connection);
tx

Unnamed: 0,SKU,STORE,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT
0,3,202,2005-01-18,P,1,0.0,30.00,30.00
1,3,202,2005-01-29,R,1,0.0,30.00,30.00
2,3,303,2004-08-18,P,1,0.0,12.00,12.00
3,3,709,2005-08-14,P,1,0.0,30.00,30.00
4,3,802,2005-08-09,P,1,440.0,30.00,30.00
...,...,...,...,...,...,...,...,...
120916891,9999997,7507,2005-04-23,P,1,39.0,39.00,39.00
120916892,9999997,7907,2005-05-03,P,1,39.0,39.00,39.00
120916893,9999997,7907,2005-05-04,R,1,39.0,39.00,39.00
120916894,9999997,7907,2005-05-04,P,1,39.0,39.00,39.00


In [5]:
skst = pd.read_sql("SELECT * FROM group_5.skstinfo;", connection);
skst

Unnamed: 0,SKU,STORE,COST,RETAIL
0,3,102,123.36,440.00
1,3,103,123.36,440.00
2,3,104,123.36,440.00
3,3,202,123.36,440.00
4,3,203,123.36,440.00
...,...,...,...,...
39230141,9999997,2007,15.00,19.50
39230142,9999997,2707,15.00,9.75
39230143,9999997,3307,15.00,19.50
39230144,9999997,7507,15.00,19.50


In [6]:
strinfo = pd.read_sql("SELECT * FROM group_5.strinfo;", connection);
strinfo

Unnamed: 0,STORE,CITY,STATE,ZIP
0,2,ST. PETERSBURG,FL,33710
1,3,ST. LOUIS,MO,63126
2,4,LITTLE ROCK,AR,72201
3,7,FORT WORTH,TX,76137
4,9,TEMPE,AZ,85281
...,...,...,...,...
448,9808,GILBERT,AZ,85233
449,9812,METAIRIE,LA,70006
450,9900,LITTLE ROCK,AR,72201
451,9906,LITTLE ROCK,AR,72201


In [4]:
if (connection):
    cursor.close()
    connection.close()
    print("PostgreSQL connection is closed") 

In [7]:
skst['SKU'] = skst['SKU'].astype(int)
skst['STORE'] = skst['STORE'].astype(int)
skst.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39230146 entries, 0 to 39230145
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   SKU     int64  
 1   STORE   int64  
 2   COST    float64
 3   RETAIL  float64
dtypes: float64(2), int64(2)
memory usage: 1.5 GB


In [8]:
result_df = pd.merge(skst, tx, on=["SKU", "STORE"], how="right")
result = pd.merge(result_df, strinfo, on=["STORE"], how="left")
result = result.fillna(0)
result

Unnamed: 0,SKU,STORE,COST,RETAIL,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT,CITY,STATE,ZIP
0,3,202,123.36,440.0,2005-01-18,P,1,0.0,30.00,30.00,TAMPA,FL,33612
1,3,202,123.36,440.0,2005-01-29,R,1,0.0,30.00,30.00,TAMPA,FL,33612
2,3,303,0.00,0.0,2004-08-18,P,1,0.0,12.00,12.00,ST. ANN,MO,63074
3,3,709,0.00,0.0,2005-08-14,P,1,0.0,30.00,30.00,GLENDALE,AZ,85308
4,3,802,123.36,440.0,2005-08-09,P,1,440.0,30.00,30.00,CLEARWATER,FL,33761
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120916891,9999997,7507,15.00,19.5,2005-04-23,P,1,39.0,39.00,39.00,HOUSTON,TX,77056
120916892,9999997,7907,15.00,19.5,2005-05-03,P,1,39.0,39.00,39.00,FRIENDSWOOD,TX,77546
120916893,9999997,7907,15.00,19.5,2005-05-04,R,1,39.0,39.00,39.00,FRIENDSWOOD,TX,77546
120916894,9999997,7907,15.00,19.5,2005-05-04,P,1,39.0,39.00,39.00,FRIENDSWOOD,TX,77546


In [2]:
df = result
df

Unnamed: 0,SKU,STORE,COST,RETAIL,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT,CITY,STATE,ZIP
0,3,202,123.36,440.0,2005-01-18,P,1,440.0,30.00,30.00,TAMPA,FL,33612
1,3,202,123.36,440.0,2005-01-29,R,1,440.0,30.00,30.00,TAMPA,FL,33612
2,3,303,0.00,0.0,2004-08-18,P,1,0.0,12.00,12.00,ST. ANN,MO,63074
3,3,709,0.00,0.0,2005-08-14,P,1,0.0,30.00,30.00,GLENDALE,AZ,85308
4,3,802,123.36,440.0,2005-08-09,P,1,440.0,30.00,30.00,CLEARWATER,FL,33761
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120916891,9999997,7507,15.00,19.5,2005-04-23,P,1,39.0,39.00,39.00,HOUSTON,TX,77056
120916892,9999997,7907,15.00,19.5,2005-05-03,P,1,39.0,39.00,39.00,FRIENDSWOOD,TX,77546
120916893,9999997,7907,15.00,19.5,2005-05-04,R,1,39.0,39.00,39.00,FRIENDSWOOD,TX,77546
120916894,9999997,7907,15.00,19.5,2005-05-04,P,1,39.0,39.00,39.00,FRIENDSWOOD,TX,77546


# Set features

In [3]:
df['SALEDATE'] = pd.to_datetime(df['SALEDATE'])
df['Year'] = df['SALEDATE'].dt.year
df['YoYTotalSales'] = df.groupby(['Year', 'STORE'])['RETAIL'].transform('sum')
df['YoYPercentageIncrease'] = df.groupby('STORE')['YoYTotalSales'].pct_change() * 100

df['InventoryTurnover'] = df['COST'] / df['QUANTITY']
df['GrossProfit'] = df['RETAIL'] - (df['COST'] * df['QUANTITY'])
df['YoYGrowth'] = df.groupby('SKU')['AMT'].pct_change(12) * 100
df['ReturnPercentage'] = (df['ORGPRICE'] - df['SPRICE']) / df['ORGPRICE'] * 100

In [5]:
df = df[['SKU', 'STORE', 'STYPE', 'AMT', 'ReturnPercentage', 'InventoryTurnover', 'GrossProfit']]

In [6]:
df.to_csv('final_data_update_by_feature.csv')

In [7]:
df

Unnamed: 0,SKU,STORE,STYPE,AMT,ReturnPercentage,InventoryTurnover,GrossProfit
0,3,202,P,30.00,93.181818,123.36,316.64
1,3,202,R,30.00,93.181818,123.36,316.64
2,3,303,P,12.00,-inf,0.00,0.00
3,3,709,P,30.00,-inf,0.00,0.00
4,3,802,P,30.00,93.181818,123.36,316.64
...,...,...,...,...,...,...,...
120916891,9999997,7507,P,39.00,0.000000,15.00,4.50
120916892,9999997,7907,P,39.00,0.000000,15.00,4.50
120916893,9999997,7907,R,39.00,0.000000,15.00,4.50
120916894,9999997,7907,P,39.00,0.000000,15.00,4.50
