In [45]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import pickle as pkl
import warnings
warnings.filterwarnings("ignore")

In [46]:
data = pd.read_csv(os.path.join('Data','StockX Dataset.csv'))
data

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,09/01/2017,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,"$1,097",$220,9/24/2016,11.0,California
1,09/01/2017,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,$685,$220,11/23/2016,11.0,California
2,09/01/2017,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,$690,$220,11/23/2016,11.0,California
3,09/01/2017,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,"$1,075",$220,11/23/2016,11.5,Kentucky
4,09/01/2017,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,$828,$220,02/11/2017,11.0,Rhode Island
...,...,...,...,...,...,...,...,...
99951,2/13/2019,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$565,$220,12/26/2018,8.0,Oregon
99952,2/13/2019,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$598,$220,12/26/2018,8.5,California
99953,2/13/2019,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$605,$220,12/26/2018,5.5,New York
99954,2/13/2019,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$650,$220,12/26/2018,11.0,California


In [47]:
data[['Order Day', 'Order Month', 'Order Year']] = data['Order Date'].str.split('/', expand=True)
data[['Release Day', 'Release Month', 'Release Year']] = data['Release Date'].str.split('/', expand=True)

In [48]:
data = data[['Order Day', 'Order Month', 'Order Year', 
             'Release Day', 'Release Month', 'Release Year',
             'Brand', 'Sneaker Name', 'Shoe Size', 
             'Buyer Region', 'Retail Price', 'Sale Price']]

In [49]:
data['Order Day'] = pd.to_numeric(data['Order Day'])
data['Order Month'] = pd.to_numeric(data['Order Month'])
data['Order Year'] = pd.to_numeric(data['Order Year'])

data['Release Day'] = pd.to_numeric(data['Release Day'])
data['Release Month'] = pd.to_numeric(data['Release Month'])
data['Release Year'] = pd.to_numeric(data['Release Year'])

In [50]:
data['Sale Price'] = data['Sale Price'].str[1:].replace(',','',regex=True).apply(pd.to_numeric, errors='coerce')

In [51]:
data['Retail Price'] = data['Retail Price'].str[1:].replace(',','',regex=True).apply(pd.to_numeric, errors='coerce')

In [52]:
len(data['Brand'].unique())

2

In [53]:
len(data['Sneaker Name'].unique())

50

In [54]:
len(data['Buyer Region'].unique())

51

In [55]:
len(data['Shoe Size'].unique())

26

In [56]:
# data['Order Date'] = pd.to_datetime(data['Order Date'])
# data['Release Date'] = pd.to_datetime(data['Release Date'])

In [57]:
data

Unnamed: 0,Order Day,Order Month,Order Year,Release Day,Release Month,Release Year,Brand,Sneaker Name,Shoe Size,Buyer Region,Retail Price,Sale Price
0,9,1,2017,9,24,2016,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,11.0,California,220,1097
1,9,1,2017,11,23,2016,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,11.0,California,220,685
2,9,1,2017,11,23,2016,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,11.0,California,220,690
3,9,1,2017,11,23,2016,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,11.5,Kentucky,220,1075
4,9,1,2017,2,11,2017,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,11.0,Rhode Island,220,828
...,...,...,...,...,...,...,...,...,...,...,...,...
99951,2,13,2019,12,26,2018,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,8.0,Oregon,220,565
99952,2,13,2019,12,26,2018,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,8.5,California,220,598
99953,2,13,2019,12,26,2018,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,5.5,New York,220,605
99954,2,13,2019,12,26,2018,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,11.0,California,220,650


In [58]:
os.makedirs('Model', exist_ok=True)

In [59]:
if not os.path.exists(os.path.join('Model', 'brand_le.pkl')):
    brand_le = LabelEncoder()
    brand_le.fit(data['Brand'])
    pkl.dump(brand_le, open(os.path.join('Model','brand_le.pkl'), 'wb'))
else:
    brand_le = pkl.load(open(os.path.join('Model', 'brand_le.pkl'), 'rb'))
        
if not os.path.exists(os.path.join('Model', 'sneakerName_le.pkl')):
    sneakerName_le = LabelEncoder()
    sneakerName_le.fit(data['Sneaker Name'])
    pkl.dump(sneakerName_le, open(os.path.join('Model','sneakerName_le.pkl'), 'wb'))
else:
    sneakerName_le = pkl.load(open(os.path.join('Model', 'sneakerName_le.pkl'), 'rb'))

if not os.path.exists(os.path.join('Model', 'shoeSize_le.pkl')):
    shoeSize_le = LabelEncoder()
    shoeSize_le.fit(data['Shoe Size'])
    pkl.dump(shoeSize_le, open(os.path.join('Model','shoeSize_le.pkl'), 'wb'))
else:
    shoeSize_le = pkl.load(open(os.path.join('Model', 'shoeSize_le.pkl'), 'rb'))

if not os.path.exists(os.path.join('Model', 'buyerRegion_le.pkl')):
    buyerRegion_le = LabelEncoder()
    buyerRegion_le.fit(data['Buyer Region'])
    pkl.dump(buyerRegion_le, open(os.path.join('Model','buyerRegion_le.pkl'), 'wb'))
else:
    buyerRegion_le = pkl.load(open(os.path.join('Model', 'buyerRegion_le.pkl'), 'rb'))

In [60]:
data['Brand'] = brand_le.transform(data['Brand'])
data['Sneaker Name'] = sneakerName_le.transform(data['Sneaker Name'])
data['Shoe Size'] = shoeSize_le.transform(data['Shoe Size'])
data['Buyer Region'] = buyerRegion_le.transform(data['Buyer Region'])

In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99956 entries, 0 to 99955
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Order Day      99956 non-null  int64
 1   Order Month    99956 non-null  int64
 2   Order Year     99956 non-null  int64
 3   Release Day    99956 non-null  int64
 4   Release Month  99956 non-null  int64
 5   Release Year   99956 non-null  int64
 6   Brand          99956 non-null  int32
 7   Sneaker Name   99956 non-null  int32
 8   Shoe Size      99956 non-null  int64
 9   Buyer Region   99956 non-null  int32
 10  Retail Price   99956 non-null  int64
 11  Sale Price     99956 non-null  int64
dtypes: int32(3), int64(9)
memory usage: 8.0 MB


In [62]:
boundary = int(round(len(data)*0.8,0))
training, testing = data.iloc[:boundary, :], data.iloc[boundary:, :]
training.shape, testing.shape

((79965, 12), (19991, 12))

In [63]:
X_train, X_test, y_train, y_test = training.iloc[:,:-1], testing.iloc[:,:-1], training.iloc[:,-1], testing.iloc[:,-1]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((79965, 11), (19991, 11), (79965,), (19991,))

In [76]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
abs(lr.score(X_test, y_test))

0.2757120217938107

In [65]:
if not os.path.exists(os.path.join('Model', 'regressor.pkl')):
    pkl.dump(lr, open(os.path.join('Model', 'regressor.pkl'), 'wb'))
else:
    lr = pkl.load(open(os.path.join('Model', 'regressor.pkl'), 'rb'))