In [1]:
import numpy as np
from numpy import cov, std

Opening the dataset with UTF-8 encoding and reading the header line separately from the data rows (the file pointer advances as lines are read, so `readline()` consumes the titles and `readlines()` returns only the remaining rows).

In [2]:
# Read the header line and the remaining rows, then close the file
# deterministically: the context manager releases the handle even if a read fails.
# 'utf-8-sig' transparently drops a leading byte-order mark if one is present.
with open('geekmate-data/dataset', 'rt', encoding='utf-8-sig') as f:
    titles = f.readline()      # first line: comma-separated column titles
    raw_data = f.readlines()   # every remaining line: one record per line

Creating dictionaries that map titles to indices and vice versa for later use. Since the `titles` string ends with `\n`, we strip it first and then split the string into individual titles.

In [3]:
# Keep only the text before the first newline, then break it into column names.
titles = titles.partition('\n')[0].split(',')
# Forward lookup: column name -> column position.
title_to_ind = dict(zip(titles, range(len(titles))))
# Reverse lookup, derived from the forward map: column position -> column name.
ind_to_title = dict(zip(title_to_ind.values(), title_to_ind.keys()))

Here, we first create a nested list of all data elements by going through every row, removing the trailing `\n` and splitting the row into column values. Then, we feed the nested list to `numpy.asarray()` for faster computations from here on.

In [4]:
# Convert every non-empty line into a list of ints and stack them into one array.
# Blank lines (e.g. a trailing empty line at the end of the file) are skipped;
# without the filter they would reach int('') and raise ValueError.
data = np.asarray([
    [int(col) for col in row.strip().split(',')]
    for row in raw_data
    if row.strip()
])
print('Number of examples in dataset:', data.shape[0])
print('NUmber of columns in dataset: ', data.shape[1])
print('Title Names:', titles)

Number of examples in dataset: 10999
NUmber of columns in dataset:  12
Title Names: ['TransactionID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance', 'Gender', 'Discount_offered_in_pct', 'Weight_in_gms', 'Reached_on_Time']


The results for all menu options can be fully or partially pre-computed, so that they can be displayed quickly to the user.

In [5]:
num_features = len(titles)

# Calculating the correlation matrix (For Option 1).
# np.corrcoef treats each column as one variable (rowvar=False) and returns the
# full Pearson matrix in a single call. It normalises the covariance by its own
# diagonal, so numerator and denominator always share the same degrees of
# freedom. Dividing numpy's default sample covariance (ddof=1) by the default
# population standard deviations (ddof=0) would instead inflate every
# coefficient by n/(n-1).
corr_matrix = np.round(np.corrcoef(data, rowvar=False), 3)
        

# Calculating the dataset row indices for each warehouse block (For option 2).
# Row positions come straight from the argsort permutation, so they are valid
# indices into `data` regardless of what the TransactionID column contains.
block_order = data[:, title_to_ind['Warehouse_block']].argsort()
sorted_data = data[block_order]
# np.unique on the sorted block column yields each distinct block label together
# with the position where its run starts; those positions are the split points.
block_labels, block_starts = np.unique(
    sorted_data[:, title_to_ind['Warehouse_block']], return_index=True
)
# Key the mapping by the real block labels (as plain ints) instead of assuming
# that the labels are exactly 1..k with no gaps.
splits = {
    int(label): rows
    for label, rows in zip(block_labels, np.split(block_order, block_starts[1:]))
}


# Calculate the orders with least cost/weight ratio (For option 3)
def my_func(x):
    """Return the per-row cost/weight ratio of a 2-D dataset array, rounded to 6 decimals."""
    return np.round(x[:, title_to_ind['Cost_of_the_Product']] / x[:, title_to_ind['Weight_in_gms']], 6)


ratio = my_func(data)
ratio_col = np.reshape(ratio, (ratio.shape[0], 1))

# Appending the new feature to the dataset and title dicts.
# The new column is placed right after the existing ones, so its index is the
# current column count rather than a hard-coded number.
feature_name = 'Cost/Weight Ratio - in US$/g'
ratio_index = data.shape[1]
title_to_ind[feature_name] = ratio_index
ind_to_title[ratio_index] = feature_name

# Cast to object so the integer columns and the float ratio column can coexist.
new_data = np.append(data.astype('object'), ratio_col, axis=1)

# Find the 30 orders with least ratio (slice end is exclusive, hence [:30])
least_ratio_data = new_data[new_data[:, ratio_index].argsort()][:30]

`corr_matrix` now contains the Pearson correlation coefficient for every pair of features

In [6]:
# Print the rounded correlation matrix; row i / column j refer to the columns
# of `data` with those indices.
print(corr_matrix)

[[ 1.     0.    -0.002  0.189 -0.006  0.197  0.145 -0.056  0.002 -0.598
   0.278 -0.412]
 [ 0.     1.     0.001  0.014  0.01  -0.007 -0.005 -0.002  0.004  0.01
   0.004  0.005]
 [-0.002  0.001  1.    -0.02   0.002  0.007 -0.002 -0.01   0.011  0.009
  -0.001 -0.001]
 [ 0.189  0.014 -0.02   1.     0.012  0.323  0.181 -0.058 -0.003 -0.131
  -0.277 -0.067]
 [-0.006  0.01   0.002  0.012  1.     0.009  0.013  0.004 -0.003 -0.003
  -0.002  0.013]
 [ 0.197 -0.007  0.007  0.323  0.009  1.     0.124 -0.047 -0.02  -0.138
  -0.133 -0.074]
 [ 0.145 -0.005 -0.002  0.181  0.013  0.124  1.     0.027  0.009 -0.083
  -0.168 -0.056]
 [-0.056 -0.002 -0.01  -0.058  0.004 -0.047  0.027  1.     0.003  0.026
   0.093  0.021]
 [ 0.002  0.004  0.011 -0.003 -0.003 -0.02   0.009  0.003  1.     0.012
  -0.004 -0.005]
 [-0.598  0.01   0.009 -0.131 -0.003 -0.138 -0.083  0.026  0.012  1.
  -0.376  0.397]
 [ 0.278  0.004 -0.001 -0.277 -0.002 -0.133 -0.168  0.093 -0.004 -0.376
   1.    -0.269]
 [-0.412  0.005 -0.001 -0

`splits` dict now maps each block number to the indices of the dataset examples that belong to that block

In [7]:
# Inspect the block-number -> row-index-array mapping built above
splits

{1: array([ 6284,  7838,  4214, ...,   140,  5180, 10880]),
 2: array([2859, 3495, 6051, ..., 8961, 9267, 2769]),
 3: array([5056, 4954, 2572, ..., 3316, 4234, 4228]),
 4: array([10938, 10986, 10992, ...,  5856,  6060,  5706]),
 5: array([9769, 1699, 1639, ..., 3373, 8455, 8641])}

`least_ratio_data` contains the 30 orders with the least cost by weight ratio

In [8]:
# Inspect the pre-computed rows with the lowest cost/weight ratio
# (the last column of each row is the ratio itself)
least_ratio_data

array([[4032, 5, 3, 6, 5, 98, 3, 1, 2, 8, 5946, 0, 0.016482],
       [7930, 2, 2, 4, 4, 97, 6, 1, 1, 4, 5801, 0, 0.016721],
       [9784, 2, 3, 4, 3, 99, 6, 2, 1, 9, 5903, 0, 0.016771],
       [7893, 1, 3, 6, 2, 99, 3, 1, 1, 8, 5872, 0, 0.01686],
       [6769, 4, 3, 4, 1, 100, 6, 2, 1, 6, 5841, 0, 0.01712],
       [8126, 5, 3, 3, 1, 102, 4, 2, 2, 4, 5936, 1, 0.017183],
       [3612, 5, 3, 2, 1, 100, 3, 1, 1, 1, 5776, 1, 0.017313],
       [4131, 1, 3, 6, 3, 100, 4, 1, 1, 3, 5756, 0, 0.017373],
       [5146, 2, 3, 5, 4, 97, 6, 3, 1, 7, 5509, 1, 0.017608],
       [3476, 5, 3, 6, 4, 100, 5, 1, 2, 7, 5668, 1, 0.017643],
       [9486, 5, 3, 3, 5, 98, 3, 2, 2, 3, 5553, 1, 0.017648],
       [9513, 1, 3, 4, 5, 102, 5, 1, 2, 3, 5767, 0, 0.017687],
       [4831, 4, 3, 4, 4, 106, 2, 1, 1, 6, 5984, 0, 0.017714],
       [6687, 1, 2, 4, 5, 105, 5, 3, 1, 4, 5824, 1, 0.018029],
       [8144, 5, 3, 4, 3, 106, 2, 2, 2, 6, 5867, 0, 0.018067],
       [10035, 1, 3, 4, 4, 102, 3, 3, 2, 9, 5644, 1, 0.018072],

Pack the pre-computed data into a dict for easy access

In [9]:
# Bundle every pre-computed result under the key the menu handlers look it up by.
computed_data = dict(
    corr_matrix=corr_matrix,
    splits=splits,
    least_ratio_data=least_ratio_data,
)

In [10]:
#Utility Functions
def display_main_menu():
    """Print the main menu and read the user's choice.

    Returns:
        int: the number typed by the user, or -1 when the input is not a valid
        integer. -1 matches no menu entry, so the caller's "invalid option"
        branch handles it instead of the program crashing with ValueError.
    """
    print('Main Menu')
    print('1.\tCompute Correlation')
    print('2.\tRanked List of 30 Largest Shipments (by warehouse block)')
    print('3.\tList of 30 Orders with Lowest Cost/Weight Ratios')
    print('0.\tExit\n')

    try:
        option = int(input('Enter your option: '))
    except ValueError:
        # Empty or non-numeric input: report it as an unknown option.
        option = -1
    return option

def display_dataset(data, titles):
    """Print a header line followed by every row of ``data``.

    Args:
        data: iterable of rows, each row an iterable of cell values.
        titles: iterable of column titles printed once before the rows.

    Each title is followed by two spaces and each cell by a tab; the header
    and every row are followed by an empty line.
    """
    header = ''.join(str(name) + '  ' for name in titles)
    # Two leading newlines, the header, then a blank line after it.
    print('\n\n' + header + '\n')
    for record in data:
        line = ''.join(str(cell) + '\t' for cell in record)
        # The extra '\n' leaves a blank line between consecutive rows.
        print(line + '\n')

#Functions for handling each individual option
def find_correlation(corr_matrix):
    """Ask for two header numbers and print their pre-computed correlation.

    Args:
        corr_matrix: square 2-D array whose [x, y] entry is the correlation
            between columns x and y of the dataset.

    Invalid input (non-numeric text, or a number outside the matrix) prints an
    error message and returns to the caller instead of raising.
    """
    print('\nList of header names for correlation\n\n')
    for ind, title in enumerate(titles):
        print(ind, title)
    try:
        x = int(input('\nEnter the number for the first quantity:'))
        y = int(input('Enter the number for the second quantity:'))
    except ValueError:
        print('\nInvalid header number entered. Please try again\n')
        return
    # Validate against the matrix itself: ind_to_title may contain entries
    # (such as derived columns) for which no correlation was computed, and
    # negative numbers would otherwise silently index from the end.
    size = len(corr_matrix)
    if not (0 <= x < size and 0 <= y < size):
        print('\nInvalid header number entered. Please try again\n')
        return
    print('\nThe correlation between', ind_to_title[x], 'and', ind_to_title[y], 'is', corr_matrix[x,y], '\n')
    

def display_largest_shipments(splits):
    """Ask for a warehouse block and print its 30 heaviest shipments.

    Args:
        splits: mapping from block number to the array of dataset row indices
            belonging to that block.

    Non-numeric input or a block number that is not a key of ``splits`` prints
    an error message and returns to the caller instead of raising.
    """
    try:
        block_number = int(input('\nEnter the warehouse block number(1 to 5): '))
    except ValueError:
        print('Invalid warehouse block number entered. Please try again')
        return
    if block_number not in splits:
        print('Invalid warehouse block number entered. Please try again')
        return
    print('\nList of 30 Largest Shipments by Weight for Warehouse Block', block_number)
    # Retrieving all the data for all the examples with given block number
    block_data = data[splits[block_number]]
    # Sorting the data in descending order according to weight and keeping the
    # first 30 rows (the slice end is exclusive, so [:30] yields exactly 30).
    heaviest_first = block_data[:, title_to_ind['Weight_in_gms']].argsort()[::-1]
    sorted_data = block_data[heaviest_first][:30]
    display_dataset(sorted_data, titles)

    
def display_least_ratio(least_ratio_data):
    """Print the pre-computed lowest cost/weight ratio orders as a table.

    Args:
        least_ratio_data: rows to display; each row carries the dataset
            columns followed by the ratio value.
    """
    print('\nList of 30 Orders with Lowest Cost/Weight Ratio')
    # The table has one more column than the raw dataset: the ratio itself.
    ratio_heading = ['Cost/Weight Ratio - in US$/g']
    column_headings = titles + ratio_heading
    display_dataset(least_ratio_data, column_headings)


#Main Function for running the program
def start_program(computed_data):
    """Run the interactive menu loop until the user chooses to exit.

    Args:
        computed_data: dict holding the pre-computed results under the keys
            'corr_matrix', 'splits' and 'least_ratio_data'.
    """
    # Menu option -> (handler function, key of the pre-computed input it needs).
    actions = {
        1: (find_correlation, 'corr_matrix'),
        2: (display_largest_shipments, 'splits'),
        3: (display_least_ratio, 'least_ratio_data'),
    }
    while True:
        option = display_main_menu()
        if option == 0:
            print('Exiting now...')
            return
        if option not in actions:
            print('Invalid option entered. Please try again')
            continue
        handler, data_key = actions[option]
        handler(computed_data[data_key])

In [11]:
# Launch the interactive menu; the loop keeps prompting until option 0 is entered
start_program(computed_data)

Main Menu
1.	Compute Correlation
2.	Ranked List of 30 Largest Shipments (by warehouse block)
3.	List of 30 Orders with Lowest Cost/Weight Ratios
0.	Exit

Enter your option: 1

List of header names for correlation


0 TransactionID
1 Warehouse_block
2 Mode_of_Shipment
3 Customer_care_calls
4 Customer_rating
5 Cost_of_the_Product
6 Prior_purchases
7 Product_importance
8 Gender
9 Discount_offered_in_pct
10 Weight_in_gms
11 Reached_on_Time

Enter the number for the first quantity:3
Enter the number for the second quantity:6

The correlation between Customer_care_calls and Prior_purchases is 0.181 

Main Menu
1.	Compute Correlation
2.	Ranked List of 30 Largest Shipments (by warehouse block)
3.	List of 30 Orders with Lowest Cost/Weight Ratios
0.	Exit

Enter your option: 2

Enter the warehouse block number(1 to 5): 3

List of 30 Largest Shipments by Weight for Warehouse Block 3


TransactionID  Warehouse_block  Mode_of_Shipment  Customer_care_calls  Customer_rating  Cost_of_the_Product  Prior

In [None]:
#End of code


Written by Nikhil Bartwal

Github: https://github.com/NikhilBartwal

Thanks for reading!
