In [23]:
import pandas as pd
import numpy as np
import datetime
import random
from app.models import clean

In [24]:
%ls app/data/

[0m[01;34mdb[0m/  items.csv  [01;34mtmp[0m/


In [3]:
with open("notes_about_csv_file.txt") as notes:
    print(notes.read())

The items.csv file is structured as such:

item_id	  item_group  cubic_volume_ft
10413	  A	          0.1
10341	  A	          0.5
10004	  B	          1.0
80014	  C	          0.3
20242	  B	          0.4
…	      …	          …

Each record in this csv file is representative of a single item.
The item_id field is a unique identifier for the item, while the item_type and cubic_volume_ft fields are attributes of the item.



Clean_csv will need to have it's path changed once data/tmp/ is being implemented and build.items() used

In [39]:
%%bash 
head app/data/items.csv
tail app/data/items.csv

item_id,item_group,cubic_volume_ft,
7958,A,0.22,
5269,A,0.19,
5134,A,0.47,
8294,A,0.97,
7889,A,0.85,
4836,A,1.09,
4235,B,1.28,
9322,B,0.37,
7094,B,0.82,
4010,A,0.43,
6583,A,0.96,
9371,A,1.19,
9372,A,0.44,
,,,
,,,
,,,
,,,
,,,
,,,

In [79]:
def split_csv():
    """
    This function reads in data from items.csv
    
    Returns a tuple containing Dataframes: (No NaN, NaN)
    Thus 'splitting' the csv
    This allows the NaN values to be 'preserved'
    and dealt with however the user would like
    
    Future implementation will allow multiple csv files
    In this implementation this module will be deprecated
    A new pipeline package will be implemented containing this file
    """
    
    import pandas as pd

    # Skip the header column,
    # Ignore the trailing comma w/each row
    # Enfore column name scheme
    
    stock = pd.read_csv("app/data/items.csv", 
                        skiprows=0, 
                        index_col=False,
                        usecols= ['item_id', 'item_group', 'cubic_volume_ft'])

    return  (stock.dropna(), stock.isna())

In [47]:
%timeit clean_csv()

9.77 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [80]:
%timeit clean_csv()

8.9 ms ± 78.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [77]:
test,_ = clean_csv()

In [78]:
test.head()

Unnamed: 0,item_id,item_group,cubic_volume_ft
0,7958.0,A,0.22
1,5269.0,A,0.19
2,5134.0,A,0.47
3,8294.0,A,0.97
4,7889.0,A,0.85


Clean should be renamed to extract_csv and then a pipeline module build. pipline.extract_csv, pipeline.transform() which will include any DataFrame transformations that need to happen prior to data processing (namely sorting)

In [5]:
items,_ = split_csv().sort_values("cubic_volume_ft",
                                ascending=True)

In [6]:
items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 161 to 81
Data columns (total 3 columns):
item_id            200 non-null float64
item_group         200 non-null object
cubic_volume_ft    200 non-null float64
dtypes: float64(2), object(1)
memory usage: 6.2+ KB


In [7]:
items.head()

Unnamed: 0,item_id,item_group,cubic_volume_ft
161,8178.0,C,0.11
179,9667.0,B,0.11
163,4093.0,C,0.12
135,1367.0,D,0.12
56,2194.0,C,0.13


In [8]:
def dataframe_generator():
    import random
    import pandas as pd
    return (pd.DataFrame(data = {'key': [random.randint(0,199) for x in range(random.randint(175,225))]})
              .merge(clean_csv(), 
                     left_on='key', 
                     right_index=True)
              .drop('key',
                    axis = 1)
              .reset_index(drop=True)
           )

In [9]:
def csvTestData():    
    # Builds anywhere from 150-250 rows of data that matches what comes from items.csv
    # Creates random 'keys' that it merges with the stock data based on index
    # returns the data as .csv's
    
    import random
    import glob
    import re
    
    # Checks to make sure there are no files
    # If there are no files, file name should start with 1
    if not sorted([int(re.sub("[^0-9]", "", files)) for files in glob.glob("app/data/tmp/*.csv")]):
        count = '1'
    else : # Start naming at whatever we're at +1
        count = sorted([int(re.sub("[^0-9]", "", files)) for files in glob.glob("app/data/tmp/*.csv")])[-1] + 1

    return (pd.DataFrame(data = {'key': [random.randint(0,199) for x in range(random.randint(150,250))]})
              .merge(clean_csv(), 
                     left_on='key', 
                     right_index=True)
              .drop('key',
                    axis = 1)
              .reset_index(drop=True)
           ).to_csv("app/data/tmp/items"+str(count)+".csv", 
                    index=False)

In [10]:
csvTestData() # Data Generator

In [11]:
def stockFromDataTMP():
    
    """
    Check app/data/tmp/ for any .csv data
    Append all the data and return the result
    result will be a single DataFrame
    """
    
    import glob
    
    # It's nice to assume clean data, and to be right for once
    
    stock = pd.DataFrame()
    for csv in glob.glob("app/data/tmp/*.csv"):
        stock = stock.append(pd.read_csv(csv))
        
    if stock.empty:
        return stock
    
    else :
        return (stock.sort_values('cubic_volume_ft')
                     .reset_index(drop=True)
               )

In [12]:
def generate_shipment_id():
    import re
    return int(re.sub("[^0-9]", "", str(datetime.datetime.today()))[:17])

Future improvements will use arrays of idx and vol^3 zipped together for speed improvements.

The result will be a dictionary of shipment_id and idx this will be able to be merged to result in final shipment.

This implementation will have to be tested to prove speed improvements exist from this.

In [13]:
def shipments(items) :
    # Create a blank shipment sheet
    shipment = {}
    
    while items.empty == False :

        # Get the largest item by cubic volume and remove from items
        bundle, items = items.tail(1), items.drop(items.tail(1).index, axis=0)
        
        # Filter the remaining items by what CAN still fit in the box
        # Grab the index of the item and the item
        for index, item in (items[items.cubic_volume_ft.values < (1.58 - bundle.cubic_volume_ft.values)]
                            .sort_values("cubic_volume_ft",
                                         ascending=False)
                           ).iterrows():
            
            # If there is no item in items that could fit into the bundle break out of the matrix
            if (bundle.cubic_volume_ft.sum() + items.cubic_volume_ft.values.min()) > 1.58 :
                break
                
            # If it fits it sits
            # Add the item to the bundle
            # Drop item from the items
            elif (bundle.cubic_volume_ft.sum() + item.cubic_volume_ft) <= 1.58 :
                item, items = (item, items.drop(index))
                bundle = bundle.append(item)
                
        #Issue a shipment id to the bundle
        shipment[generate_shipment_id()] = bundle

    return shipment

In [14]:
def test(items) :
    # Create a blank shipment sheet
    shipment = {'item_id':{},
               'item_group':{},
               'cubic_volume_ft':{}
               }
    
    while items.empty == False :

        # Get the largest item by cubic volume and remove from items
        bundle, items = items.tail(1), items.drop(items.tail(1).index, axis=0)
        
        # Generate shipment_id
        shipment_id = generate_shipment_id()

        shipment['item_id'].update({(shipment_id, 
                                bundle.index[0]) : bundle.item_id.values[0]})

        shipment['item_group'].update({(shipment_id, 
                                   bundle.index[0]) : bundle.item_group.values[0]})

        shipment['cubic_volume_ft'].update({(shipment_id, 
                                        bundle.index[0]) : bundle.cubic_volume_ft.values[0]})

        bundle_volume = shipment['cubic_volume_ft'][(shipment_id, 
                                                     bundle.index[0])]
        
        # Filter the remaining items by what CAN still fit in the box
        # Grab the index of the item and the item
        for index, item in (items[items.cubic_volume_ft.values < (1.58 - bundle_volume)]
                            .sort_values("cubic_volume_ft",
                                         ascending=False)
                           ).iterrows():
            
            # If there is no item in items that could fit into the bundle break out of the matrix
            if (bundle_volume + items.cubic_volume_ft.values.min()) > 1.58 :
                break
                
            # If it fits it sits
            # Add the item to the bundle
            # Drop item from the items
            elif (bundle_volume + item.cubic_volume_ft) <= 1.58 :
                item, items = (item, items.drop(index))
                shipment['item_id'].update({(shipment_id, 
                                        index) : item.item_id
                                      })
                shipment['item_group'].update({(shipment_id, 
                                           index) : item.item_group
                                        })
                shipment['cubic_volume_ft'].update({(shipment_id, 
                                                index) : item.cubic_volume_ft
                                              })
                
                bundle_volume += shipment['cubic_volume_ft'][(shipment_id, index)]
    return shipment

In [15]:
items = stockFromDataTMP()

In [16]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 3 columns):
item_id            220 non-null float64
item_group         220 non-null object
cubic_volume_ft    220 non-null float64
dtypes: float64(2), object(1)
memory usage: 5.2+ KB


In [17]:
shipment = pd.DataFrame(test(items))

In [18]:
def summary(shipment):
    
    # Build initial summaries based on items and cubic volume in feet
    data = {'Total Items' : len(shipment.item_id.values),
            'Total Cubic Volume in Feet' : shipment.cubic_volume_ft.values.sum(),
            'Total Item Groups' : len(shipment.item_group.unique())}
    
    # Check for shipment id and build additional shipment summaries
    if shipment.index.get_level_values(0).any() :
        shipment_id = shipment.index.get_level_values(0).unique()
        data['Total shipments'] = len(shipment_id)
        data['Shipment Item Ratio'] = round(len(shipment.item_id.values) / len(shipment_id),2)
        data['Cubic Volume not Utilized'] = (1.58*len(shipment_id) - 
                                             shipment.cubic_volume_ft.values.sum())
        data['Percent Cubic Volume not Utilized'] = round(((1.58 * len(shipment_id) - 
                                                            shipment.cubic_volume_ft.values.sum()) / 
                                                     shipment.cubic_volume_ft.values.sum()) * 100, 2)
    # return resulting summary as a DataFrame
    return (pd.DataFrame(data, 
                         index=['Details'])
           )

In [19]:
summaries = summary(shipment)

In [20]:
summaries.head()

Unnamed: 0,Total Items,Total Cubic Volume in Feet,Total Item Groups,Total shipments,Shipment Item Ratio,Cubic Volume not Utilized,Percent Cubic Volume not Utilized
Details,220,152.39,4,98,2.24,2.45,1.61


## Grouping Prototypes

In [21]:
def get_groups(items):
    if 'item_group' in items.keys() :
        return items.item_group.unique()
    else :
        return None

In [22]:
shipments_filtered = {}
for group in get_groups(items):
    stock_filtered = items[items.item_group.values == group]
    shipments_filtered[group] = shipments(stock_filtered)