In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
### Importing data
cases = pd.read_csv("./cases_sold.csv")
inv = pd.read_csv("./inventory.csv")
spoilage = pd.read_csv("./spoilage.csv")
slot_util = pd.read_csv("./Slot Utilization.csv")

In [3]:
### Fixing data types
inv['BRNCH_CD'] = inv['BRNCH_CD'].astype('string')
inv['TEMP_ZONE'] = inv['TEMP_ZONE'].astype('string')
slot_util['DATE_EXTRACT'] = pd.to_datetime(slot_util['DATE_EXTRACT'])

### Replacing values so they're the same when merging
inv['TEMP_ZONE'] = inv['TEMP_ZONE'].replace(['CLR', 'DRY', 'FRZ'], ['Refrigerated', 'Dry', 'Freezer'])
spoilage['TEMP_ZONE'] = spoilage['TEMP_ZONE'].replace(['CLR', 'DRY', 'FRZ'], ['Refrigerated', 'Dry', 'Freezer'])

In [4]:
### Date ranges for each table
print(min(inv['FISC_YR_WK']), max(inv['FISC_YR_WK']))
print(min(cases['FISC_YR_WK']), max(cases['FISC_YR_WK']))
print(min(spoilage['FISC_YR_WK']), max(spoilage['FISC_YR_WK']))
print(min(slot_util['DATE_EXTRACT']), max(slot_util['DATE_EXTRACT']))

202104 202303
201704 202303
202201 202404
2021-02-14 00:00:00 2023-01-29 00:00:00


In [5]:
### Merging data
# Add fiscal week and quarter columns to slot utilization to join on
def year_week_conv(date):
    if date.week < 10:
        return int(str(date.year) + "0" + str(date.week))
    else:
        return int(str(date.year) + str(date.week))
fw = slot_util['DATE_EXTRACT'].apply(lambda a : year_week_conv(a))
slot_util['FISC_YR_WK'] = fw

# Merge slot_util with cases data
merged = slot_util.merge(cases, how='left', on=['BRNCH_CD', 'FISC_YR_WK'], validate="m:1")
merged['DIV_NBR'] = merged['DIV_NBR'].fillna(0)

# Merge with spoilage data
merged = merged.merge(spoilage, how='left', left_on=['BRNCH_CD', 'FISC_YR_WK', 'AREA'], 
                      right_on=['BRNCH_CD', 'FISC_YR_WK', 'TEMP_ZONE'], validate="m:1")

# Merge with inventory data
merged = merged.merge(inv, how='left', left_on=['BRNCH_CD', 'FISC_YR_WK', 'AREA'], 
                      right_on=['BRNCH_CD', 'FISC_YR_WK', 'TEMP_ZONE'], validate="m:1")
merged.head()

Unnamed: 0,WAREHOUSE_LOCN,AREA,BRNCH_CD,FULL_MARKET_NAME,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY,FISC_YR_WK,DIV_NBR,DIV_NM,CASES_SOLD,TEMP_ZONE_x,CASES_IMPACTED,SPOILAGE,TEMP_ZONE_y,MAX_WKLY_INVENTORY,LDR_TM
0,2125,Freezer,5O,"MANASSAS (5O, 2125)",Main Warehouse,Reserve,2022-05-16,9358.0,13044,0.717417,202220,2125.0,MANASSAS,329538.154,Freezer,5.0,5.0,Freezer,437449.375,2023-01-27T14:23:26.861-08:00
1,3148,Freezer,4H,"SALT LAKE CITY (4H, 4118)",Off-Site Storage,Reserve,2022-05-03,,0,,202218,4118.0,SALT LAKE CITY,234564.9915,Freezer,18.0,18.0,Freezer,265219.716667,2023-01-27T14:23:26.861-08:00
2,2345,Refrigerated,3J,"BISMARCK (3J, 2345)",Main Warehouse,Reserve,2021-10-19,477.0,905,0.527071,202142,2345.0,BISMARCK,44478.6486,,,,Refrigerated,25167.027778,2023-01-27T14:23:26.861-08:00
3,4135,Freezer,8T,"PHOENIX SYSTEMS (8T, 4135)",Main Warehouse,Reserve,2021-10-08,1330.0,1694,0.785123,202140,4135.0,PHOENIX SYSTEMS,93214.2753,,,,Freezer,100198.833333,2023-01-27T14:23:26.861-08:00
4,1106,Dry,8L,"DETROIT (8L, 1106)",Main Warehouse,Pick,2021-05-12,5585.0,6243,0.894601,202119,1106.0,DETROIT,187122.1838,,,,Dry,352921.831941,2023-01-27T14:23:26.861-08:00


In [6]:
merged.head()

Unnamed: 0,WAREHOUSE_LOCN,AREA,BRNCH_CD,FULL_MARKET_NAME,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY,FISC_YR_WK,DIV_NBR,DIV_NM,CASES_SOLD,TEMP_ZONE_x,CASES_IMPACTED,SPOILAGE,TEMP_ZONE_y,MAX_WKLY_INVENTORY,LDR_TM
0,2125,Freezer,5O,"MANASSAS (5O, 2125)",Main Warehouse,Reserve,2022-05-16,9358.0,13044,0.717417,202220,2125.0,MANASSAS,329538.154,Freezer,5.0,5.0,Freezer,437449.375,2023-01-27T14:23:26.861-08:00
1,3148,Freezer,4H,"SALT LAKE CITY (4H, 4118)",Off-Site Storage,Reserve,2022-05-03,,0,,202218,4118.0,SALT LAKE CITY,234564.9915,Freezer,18.0,18.0,Freezer,265219.716667,2023-01-27T14:23:26.861-08:00
2,2345,Refrigerated,3J,"BISMARCK (3J, 2345)",Main Warehouse,Reserve,2021-10-19,477.0,905,0.527071,202142,2345.0,BISMARCK,44478.6486,,,,Refrigerated,25167.027778,2023-01-27T14:23:26.861-08:00
3,4135,Freezer,8T,"PHOENIX SYSTEMS (8T, 4135)",Main Warehouse,Reserve,2021-10-08,1330.0,1694,0.785123,202140,4135.0,PHOENIX SYSTEMS,93214.2753,,,,Freezer,100198.833333,2023-01-27T14:23:26.861-08:00
4,1106,Dry,8L,"DETROIT (8L, 1106)",Main Warehouse,Pick,2021-05-12,5585.0,6243,0.894601,202119,1106.0,DETROIT,187122.1838,,,,Dry,352921.831941,2023-01-27T14:23:26.861-08:00


In [7]:
### Removing unnecessary columns
merged = merged.drop(['LDR_TM', 'FULL_MARKET_NAME', 'DIV_NM', 'WAREHOUSE_LOCN',
                      'SUM(PALLET_USED)', 'SUM(PALLET_POSITIONS)', 'TEMP_ZONE_x', "TEMP_ZONE_y",
                      'DIV_NBR', 'FISC_YR_WK'], axis=1)

In [8]:
### Aggregating data
merged2 = merged.groupby(['BRNCH_CD', 'DATE_EXTRACT', 'AREA', 'PICK_TYPE']).agg(np.mean)

In [9]:
merged2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CAPACITY,CASES_SOLD,CASES_IMPACTED,SPOILAGE,MAX_WKLY_INVENTORY
BRNCH_CD,DATE_EXTRACT,AREA,PICK_TYPE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2G,2021-02-14,Dry,Pick,0.862603,117791.0489,,,252529.141665
2G,2021-02-14,Dry,Reserve,0.747419,117791.0489,,,252529.141665
2G,2021-02-14,Freezer,Pick,0.457666,117791.0489,,,166128.133334
2G,2021-02-14,Freezer,Reserve,0.424028,117791.0489,,,166128.133334
2G,2021-02-14,Refrigerated,Pick,0.432943,117791.0489,,,61247.975002


In [10]:
merged2.apply(lambda a : np.sum(a.isna()))

CAPACITY               19410
CASES_SOLD             40396
CASES_IMPACTED        189042
SPOILAGE              189042
MAX_WKLY_INVENTORY     14704
dtype: int64

In [11]:
merged.apply(lambda a : np.sum(a.isna()))

AREA                       0
BRNCH_CD                   0
STORAGE_TYPE               0
PICK_TYPE                  0
DATE_EXTRACT               0
CAPACITY              583630
CASES_SOLD             94226
CASES_IMPACTED        634851
SPOILAGE              634851
MAX_WKLY_INVENTORY     31436
dtype: int64

In [12]:
merged[merged['CAPACITY'].isna()]

Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,CAPACITY,CASES_SOLD,CASES_IMPACTED,SPOILAGE,MAX_WKLY_INVENTORY
1,Freezer,4H,Off-Site Storage,Reserve,2022-05-03,,234564.9915,18.0,18.00,265219.716667
5,Dry,6W,Inactive,Pick,2022-12-22,,308364.5405,71.0,71.00,442039.263331
6,Refrigerated,4P,Inactive,Pick,2021-12-11,,,,,
16,Dry,3J,Inactive,Pick,2022-10-18,,41188.2643,21.0,21.75,65629.464444
22,Refrigerated,6Z,Off-Site Storage,Pick,2022-02-01,,170585.5797,66.0,66.50,88568.499998
...,...,...,...,...,...,...,...,...,...,...
1138484,Refrigerated,6A,Inactive,Reserve,2021-10-02,,33838.0470,,,21837.719047
1138487,Freezer,4H,Virtual Slot,Pick,2022-01-28,,205407.6029,11.0,11.00,264675.441665
1138488,Freezer,2R,Inactive,Reserve,2022-03-24,,101251.2227,2.0,2.00,139718.466667
1138489,Refrigerated,2R,Inactive,Reserve,2022-10-24,,113990.0824,304.0,304.00,51126.291667


In [46]:
# CAPACITY is only missing when pallet_used is NaN
print(slot_util[slot_util['CAPACITY'].isna()]['SUM(PALLET_USED)'].shape)
print(sum(slot_util[slot_util['CAPACITY'].isna()]['SUM(PALLET_USED)'].isna()))
slot_util[slot_util['CAPACITY'].isna()]['SUM(PALLET_POSITIONS)']

(583630,)
583630


1          0
5          0
6          0
16         0
22         0
          ..
1138484    0
1138487    0
1138488    0
1138489    0
1138490    0
Name: SUM(PALLET_POSITIONS), Length: 583630, dtype: int64

In [14]:
# Cases sold has no missing values prior to merging
# Missing values are the result of cases table not having data for the time period and branch in slot_util
# Or cases just using a completely different set of BRNCH_CD values
print(np.unique(cases['BRNCH_CD']))
print(np.unique(slot_util['BRNCH_CD']))

['2G' '2I' '2J' '2L' '2N' '2O' '2R' '2Z' '3D' '3F' '3J' '3K' '3L' '3M'
 '3V' '3W' '3Y' '3Z' '4C' '4H' '4I' '4J' '4O' '4P' '4Q' '4R' '4U' '4V'
 '5D' '5E' '5G' '5I' '5O' '5T' '5Y' '5Z' '6A' '6B' '6D' '6F' '6G' '6H'
 '6I' '6J' '6N' '6U' '6V' '6W' '6Y' '6Z' '8101' '8102' '8103' '8104'
 '8105' '8106' '8107' '8108' '8A' '8B' '8E' '8L' '8N' '8O' '8S' '8T' '8U'
 '8V' '9A' '9B' '9D' '9I' '9J' '9L' '9O' '9P' '9Q' '9U']
['2G' '2I' '2J' '2L' '2N' '2O' '2R' '2Z' '3D' '3F' '3J' '3K' '3L' '3M'
 '3V' '3W' '3Y' '3Z' '4C' '4H' '4I' '4J' '4O' '4P' '4Q' '4R' '4S' '4U'
 '4V' '5D' '5E' '5G' '5I' '5O' '5T' '5Y' '5Z' '6A' '6B' '6D' '6F' '6G'
 '6H' '6I' '6J' '6N' '6U' '6V' '6W' '6Y' '6Z' '7Q' '8A' '8B' '8D' '8E'
 '8G' '8L' '8N' '8O' '8S' '8T' '8U' '8V' '9A' '9B' '9D' '9F' '9H' '9I'
 '9J' '9L' '9O' '9P' '9Q' '9U' 'E5' 'X1' 'X6' 'X7']


In [51]:
slot_util[slot_util['BRNCH_CD'].isin(['X1', 'X6', 'X7', 'E5'])]

Unnamed: 0,WAREHOUSE_LOCN,AREA,BRNCH_CD,FULL_MARKET_NAME,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY,FISC_YR_WK
13,4811,Refrigerated,E5,"STOCK YARDS-CHARLOTTE (E5, 4811)",Main Warehouse,Pick,2021-11-16,93.0,2507,0.037096,202146
359,4920,Refrigerated,X1,"USF PRC-DALLAS (X1, 4920)",Inactive,Pick,2022-02-06,,0,,202205
472,4811,Refrigerated,E5,"STOCK YARDS-CHARLOTTE (E5, 4811)",Main Warehouse,Reserve,2022-11-09,237.0,355,0.667605,202245
739,4927,Refrigerated,X7,"USF PRC-IOWA (X7, 4927)",Inactive,Pick,2022-12-13,,0,,202250
894,4811,Freezer,E5,"STOCK YARDS-CHARLOTTE (E5, 4811)",Main Warehouse,Reserve,2021-04-09,286.0,1639,0.174496,202114
...,...,...,...,...,...,...,...,...,...,...,...
1138375,4923,Refrigerated,X6,"USF PRC-NORTHEAST (X6, 4923)",9999,Reserve,2021-10-07,,0,,202140
1138391,4923,Refrigerated,X6,"USF PRC-NORTHEAST (X6, 4923)",Main Warehouse,Reserve,2022-10-10,,1,,202241
1138395,4923,Refrigerated,X6,"USF PRC-NORTHEAST (X6, 4923)",Main Warehouse,Reserve,2022-09-13,,1,,202237
1138413,4923,Refrigerated,X6,"USF PRC-NORTHEAST (X6, 4923)",Main Warehouse,Reserve,2022-03-15,,1,,202211


In [50]:
cases[cases['BRNCH_CD'].isin(['8101', '8102'])]

Unnamed: 0,FISC_YR_WK,BRNCH_CD,DIV_NBR,DIV_NM,CASES_SOLD
594,201733,8101,8101,CHEF'STORE-OKLAHOMA CITY,9171.7507
595,201734,8102,8102,CHEF'STORE-CHARLOTTE,14039.8334
601,202218,8102,8102,CHEF'STORE-CHARLOTTE,15553.3380
607,202251,8102,8102,CHEF'STORE-CHARLOTTE,11452.2907
611,201829,8102,8102,CHEF'STORE-CHARLOTTE,15985.0355
...,...,...,...,...,...
21008,201911,8102,8102,CHEF'STORE-CHARLOTTE,15877.6253
21009,201908,8101,8101,CHEF'STORE-OKLAHOMA CITY,9976.9373
21012,202116,8101,8101,CHEF'STORE-OKLAHOMA CITY,15596.6385
21013,201939,8101,8101,CHEF'STORE-OKLAHOMA CITY,13615.9599


In [15]:
merged[merged['CASES_SOLD'].isna()]

Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,CAPACITY,CASES_SOLD,CASES_IMPACTED,SPOILAGE,MAX_WKLY_INVENTORY
6,Refrigerated,4P,Inactive,Pick,2021-12-11,,,,,
13,Refrigerated,E5,Main Warehouse,Pick,2021-11-16,0.037096,,,,3658.000000
42,Refrigerated,5T,Main Warehouse,Reserve,2021-07-16,0.032044,,,,1514.000000
115,Refrigerated,9P,Main Warehouse,Reserve,2022-01-14,0.290486,,,,108558.659601
118,Refrigerated,9P,Main Warehouse,Reserve,2021-09-22,,,,,
...,...,...,...,...,...,...,...,...,...,...
1138418,Freezer,8D,Virtual Slot,Pick,2022-02-19,,,,,21375.000000
1138442,Dry,9F,Inactive,Reserve,2021-03-13,,,,,0.000000
1138453,Freezer,8D,Virtual Slot,Pick,2021-03-15,,,,,13923.000000
1138479,Refrigerated,X6,Main Warehouse,Reserve,2022-06-24,,,9.0,9.0,10476.000000


In [16]:
cases[(cases['BRNCH_CD'] == '5T')]['FISC_YR_WK'].min()

202131

In [17]:
# Inventory data has no missing values prior to merging
# Missing values are the result of branch not existing at the time period that there's a record in slot_util?
# Most cases of missing inv data are also ones with missing capacity data - 
# the ones with capacity data are all from january of 2023

In [18]:
subset = merged[(merged['MAX_WKLY_INVENTORY'].isna()) & (merged['CAPACITY'].notna())]
subset

Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,CAPACITY,CASES_SOLD,CASES_IMPACTED,SPOILAGE,MAX_WKLY_INVENTORY
803,Refrigerated,4H,Main Warehouse,Pick,2023-01-01,0.826835,,313.0,316.766667,
1416,Freezer,5T,Main Warehouse,Pick,2023-01-01,0.913084,,53.0,53.000000,
3476,Dry,3J,Main Warehouse,Pick,2023-01-01,0.867721,,38.0,38.000000,
3852,Refrigerated,2O,Main Warehouse,Reserve,2023-01-01,0.450722,,376.0,376.999999,
5527,Freezer,3K,Main Warehouse,Pick,2023-01-01,0.876936,,101.0,101.000000,
...,...,...,...,...,...,...,...,...,...,...
1137162,Freezer,6Z,9999,Reserve,2023-01-29,0.000000,,48.0,48.000000,
1137689,Freezer,4I,9999,Reserve,2023-01-26,0.000000,,34.0,34.000000,
1137695,Freezer,2G,9999,Reserve,2023-01-26,0.000000,,5.0,5.000000,
1137710,Freezer,9U,9999,Reserve,2023-01-27,0.000000,,4.0,4.000000,


In [19]:
# Spoilage data has no missing values prior to merging
# Missing values are the result of slot_util data covering one year prior to the start of spoilage data and
# sometimes there just aren't entries in spoilage for that particular area/branch/week combination

# Probably default to zero if date range is covered but data not present and then ? if outside date range

In [20]:
subset2 = merged[merged['SPOILAGE'].isna()]
subset2

Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,CAPACITY,CASES_SOLD,CASES_IMPACTED,SPOILAGE,MAX_WKLY_INVENTORY
2,Refrigerated,3J,Main Warehouse,Reserve,2021-10-19,0.527071,44478.6486,,,25167.027778
3,Freezer,8T,Main Warehouse,Reserve,2021-10-08,0.785123,93214.2753,,,100198.833333
4,Dry,8L,Main Warehouse,Pick,2021-05-12,0.894601,187122.1838,,,352921.831941
6,Refrigerated,4P,Inactive,Pick,2021-12-11,,,,,
7,Freezer,3Y,Main Warehouse,Pick,2022-01-25,0.801814,184941.9570,,,250058.958334
...,...,...,...,...,...,...,...,...,...,...
1138478,Freezer,9L,Inactive,Reserve,2021-04-14,,92454.0994,,,177393.091665
1138481,Refrigerated,9A,Off-Site Storage,Pick,2022-03-30,,49622.7694,,,34238.655951
1138483,Freezer,8D,Virtual Slot,Pick,2022-07-13,,,,,39367.000000
1138484,Refrigerated,6A,Inactive,Reserve,2021-10-02,,33838.0470,,,21837.719047


In [21]:
spoilage[(spoilage['FISC_YR_WK'] == 202228) & (spoilage['BRNCH_CD'] == "8D")]

Unnamed: 0,FISC_YR_WK,BRNCH_CD,TEMP_ZONE,CASES_IMPACTED,SPOILAGE


In [24]:
merged.head()

Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,CAPACITY,CASES_SOLD,CASES_IMPACTED,SPOILAGE,MAX_WKLY_INVENTORY
0,Freezer,5O,Main Warehouse,Reserve,2022-05-16,0.717417,329538.154,5.0,5.0,437449.375
1,Freezer,4H,Off-Site Storage,Reserve,2022-05-03,,234564.9915,18.0,18.0,265219.716667
2,Refrigerated,3J,Main Warehouse,Reserve,2021-10-19,0.527071,44478.6486,,,25167.027778
3,Freezer,8T,Main Warehouse,Reserve,2021-10-08,0.785123,93214.2753,,,100198.833333
4,Dry,8L,Main Warehouse,Pick,2021-05-12,0.894601,187122.1838,,,352921.831941


In [45]:
wide = merged.pivot_table(index=['BRNCH_CD', 'DATE_EXTRACT'], columns=['AREA', 'STORAGE_TYPE', 'PICK_TYPE'],
                   values=['CAPACITY', 'CASES_SOLD','CASES_IMPACTED', 'SPOILAGE', 'MAX_WKLY_INVENTORY']).reset_index()
wide.head()

Unnamed: 0_level_0,BRNCH_CD,DATE_EXTRACT,CAPACITY,CAPACITY,CAPACITY,CAPACITY,CAPACITY,CAPACITY,CAPACITY,CAPACITY,...,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE,SPOILAGE
AREA,Unnamed: 1_level_1,Unnamed: 2_level_1,Dry,Dry,Dry,Dry,Dry,Dry,Dry,Dry,...,Refrigerated,Refrigerated,Refrigerated,Refrigerated,Refrigerated,Refrigerated,Refrigerated,Refrigerated,Refrigerated,Refrigerated
STORAGE_TYPE,Unnamed: 1_level_2,Unnamed: 2_level_2,9999,9999,Inactive,Inactive,Main Warehouse,Main Warehouse,Off-Site Storage,Trailer,...,Inactive,Inactive,Main Warehouse,Main Warehouse,Off-Site Storage,Off-Site Storage,Trailer,Trailer,Virtual Slot,Virtual Slot
PICK_TYPE,Unnamed: 1_level_3,Unnamed: 2_level_3,Pick,Reserve,Pick,Reserve,Pick,Reserve,Reserve,Pick,...,Pick,Reserve,Pick,Reserve,Pick,Reserve,Pick,Reserve,Pick,Reserve
0,2G,2021-02-14,,,,,0.862603,0.747419,,,...,,,,,,,,,,
1,2G,2021-02-15,,,,,0.857569,0.747458,,,...,,,,,,,,,,
2,2G,2021-02-16,,,,,0.860051,0.760835,,,...,,,,,,,,,,
3,2G,2021-02-17,,,,,0.8588,0.743637,,,...,,,,,,,,,,
4,2G,2021-02-18,,,,,0.861294,0.73731,,,...,,,,,,,,,,


In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

merged3 = merged.groupby(['DATE_EXTRACT', 'BRNCH_CD']).agg(np.mean).reset_index().interpolate().dropna()
merged3['BRNCH_CD'] = merged3['BRNCH_CD'].astype(str)
# merged3['CAPACITY'] = np.arcsin(merged3['CAPACITY'])
merged3 = pd.get_dummies(merged3)

In [41]:
x_train, x_test, y_train, y_test = train_test_split(merged3.drop(['CAPACITY', 'DATE_EXTRACT'], axis=1), merged3['CAPACITY'], 
                                                    random_state=234)

pipe = Pipeline([#('normalize data', Normalizer()),
                 ('lm', LinearRegression())])
pipe.fit(x_train, y_train)
mean_squared_error(y_test, pipe.predict(x_test))

0.0019001150587873322

In [42]:
pipe.score(x_test, y_test)

0.8673589282438664