**Objective of this notebook is to do similarity-based Collaborative Filtering**

In [None]:
# Mount Google Drive at /content/drive (relies on the google.colab package).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 1000)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import itertools
import os
import pickle
import time
from datetime import datetime

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import OneHotEncoder
from sklearn.preprocessing   import OrdinalEncoder

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, f1_score
from scipy.stats import mode

In [None]:
def process_vendors(location,filename):
  """Load the vendor CSV and drop the columns that are not used downstream.

  Args:
    location: Directory that contains the file. When empty or None the
      current working directory is used instead.
    filename: Name of the vendor CSV file.

  Returns:
    DataFrame in which `id` has been cast to str and renamed to `vendor_id`,
    with the unwanted feature columns removed.
  """
  # Honour the `location` argument (it used to be ignored in favour of the cwd).
  vendors = pd.read_csv(os.path.join(location or os.getcwd(), filename))
  vendors["id"] = vendors["id"].astype('str')
  vendors = vendors.rename(columns={'id':'vendor_id'})

  #Remove the unwanted features
  weekdays = ["sunday","monday","tuesday","wednesday","thursday","friday","saturday"]
  # Four opening-hour columns per weekday: <day>_from_time1, <day>_to_time1, <day>_from_time2, <day>_to_time2
  opening_hour_cols = [
      "{}_{}_time{}".format(day, edge, slot)
      for day in weekdays
      for slot in (1, 2)
      for edge in ("from", "to")
  ]
  vendor_dropcols = [
      "vendor_tag_name","vendor_tag","vendor_category_en","authentication_id",
      "created_at","updated_at","OpeningTime","OpeningTime2","primary_tags",
      "is_akeed_delivering","open_close_flags","one_click_vendor","country_id",
      "city_id","display_orders",
  ] + opening_hour_cols
  return vendors.drop(columns=vendor_dropcols)


Customer Demographics

In [None]:
def process_customer_demo(location,filename):
  """Load customer demographics, keep the latest record per customer and clean gender.

  Args:
    location: Directory that contains the file. When empty or None the
      current working directory is used instead.
    filename: Name of the customer demographics CSV file.

  Returns:
    DataFrame with the columns `akeed_customer_id`, `gender`, `verified`
    and `language`. `gender` is one of "male", "female" or "unknown".
  """
  # Honour the `location` argument (it used to be ignored in favour of the cwd).
  customers = pd.read_csv(os.path.join(location or os.getcwd(), filename))
  customers["updated_at"] = pd.to_datetime(customers["updated_at"])
  customers["created_at"] = pd.to_datetime(customers["created_at"])

  #Remove duplicate records by extracting the last updated records
  latest_update = customers.groupby(["akeed_customer_id"])['updated_at'].transform('max')
  # .copy() so the clean-up below writes to an independent frame, not a view of `customers`
  customers_dedup = customers[customers["updated_at"] == latest_update].copy()

  #remove trailing spaces in gender and convert to lower case
  customers_dedup["gender"] = customers_dedup["gender"].astype("str").str.rstrip().str.lower()

  #fix missing and incorrect values as 'unknown'
  customers_dedup.loc[~customers_dedup["gender"].isin(["male","female"]),"gender"] = "unknown"

  return customers_dedup[['akeed_customer_id', 'gender', 'verified', 'language']]

Customer Locations

https://datascientyst.com/reverse-geocoding-latitude-longitude-city-country-python-pandas/

In [None]:
def process_customer_location(location,filename):
  """Load customer locations, encode the location type and impute missing coordinates.

  Args:
    location: Directory that contains the file. When empty or None the
      current working directory is used instead.
    filename: Name of the customer locations CSV file.

  Returns:
    DataFrame where `location_type` is mapped to 0 (missing), 1 (Home),
    2 (Work) or 3 (Other), and missing `latitude`/`longitude` values are
    filled with the customer's mean, then with the overall mean.
  """
  # Honour the `location` argument (it used to be ignored in favour of the cwd).
  locations = pd.read_csv(os.path.join(location or os.getcwd(), filename))

  # Assign results back instead of calling fillna(inplace=True) on a column
  # selection, which is not guaranteed to modify the parent frame.
  location_type_codes = {'Null':0,'Home':1,'Work':2,'Other':3}
  locations["location_type"] = locations["location_type"].fillna('Null').map(location_type_codes)

  for coord in ("latitude", "longitude"):
    # first fall back to the mean coordinate of the same customer ...
    customer_mean = locations.groupby(["customer_id"])[coord].transform("mean")
    locations[coord] = locations[coord].fillna(customer_mean)
    # ... then to the mean over all rows for customers with no known coordinate
    locations[coord] = locations[coord].fillna(locations[coord].mean())

  return locations

In [None]:
def merge_demo_loc(demographics_df,location_df):
  """Attach demographic attributes to every customer location row.

  Left-joins `location_df.customer_id` to `demographics_df.akeed_customer_id`,
  so every location row is kept, and drops the redundant join key.
  """
  merged = pd.merge(
      location_df,
      demographics_df,
      how="left",
      left_on="customer_id",
      right_on="akeed_customer_id",
  )
  return merged.drop(columns=["akeed_customer_id"])

In [None]:
def calculate_haversine(lon1, lat1, lon2, lat2, earth_radius_km=6371.0):
    """
    Great-circle distance between two sets of points given in degrees.

    All coordinate args must be of equal length or broadcastable against
    each other. Note the argument order: longitude first, then latitude.

    Args:
        lon1, lat1: Longitude/latitude of the start points, in degrees.
        lon2, lat2: Longitude/latitude of the end points, in degrees.
        earth_radius_km: Sphere radius used to scale the result
            (defaults to 6371, giving kilometres).

    Returns:
        Distance(s) in the unit of `earth_radius_km`.
    """
    lon1, lat1, lon2, lat2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))
    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1

    haversine_angle = np.sin(delta_lat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2.0)**2
    # Rounding can push the term marginally outside [0, 1] (near-antipodal
    # points), which would make arcsin/sqrt return NaN.
    haversine_angle = np.clip(haversine_angle, 0.0, 1.0)
    haversine_distance = 2 * earth_radius_km * np.arcsin(np.sqrt(haversine_angle))
    return haversine_distance

In [None]:
def calculate_vendor_distances(customer_locations,vendor_locations):
  """Compute the haversine distance from every customer location to every vendor.

  Args:
    customer_locations: DataFrame with `customer_id`, `location_number`,
      `latitude` and `longitude` columns.
    vendor_locations: DataFrame with `vendor_id`, `latitude` and
      `longitude` columns.

  Returns:
    DataFrame indexed like `customer_locations` with `customer_id`,
    `location_number` and one distance column per vendor, named after the
    vendor id and ordered as in `vendor_locations`.
  """
  vendor_ids = list(vendor_locations["vendor_id"])

  # Shape the coordinates as (n_customers, 1) and (1, n_vendors) so that a
  # single broadcasted call yields the full distance matrix. Vendors are
  # matched to columns by position, independent of the vendor frame's index.
  customer_lat = customer_locations["latitude"].to_numpy()[:, np.newaxis]
  customer_lon = customer_locations["longitude"].to_numpy()[:, np.newaxis]
  vendor_lat = vendor_locations["latitude"].to_numpy()[np.newaxis, :]
  vendor_lon = vendor_locations["longitude"].to_numpy()[np.newaxis, :]

  distances = calculate_haversine(vendor_lon, vendor_lat, customer_lon, customer_lat)

  customer_vendor_distances = pd.DataFrame(distances, index=customer_locations.index, columns=vendor_ids)
  # Key columns go first; downstream code slices the distances with iloc[:, 2:]
  customer_vendor_distances.insert(0, "customer_id", customer_locations["customer_id"].to_numpy())
  customer_vendor_distances.insert(1, "location_number", customer_locations["location_number"].to_numpy())
  return customer_vendor_distances

In [None]:
def unpivot_data(df):
  """Reshape a wide customer-location x vendor frame into long format.

  The first two columns are expected to be `customer_id` and
  `location_number`; every remaining column is treated as a vendor column.
  Returns one row per (customer_id, location_number, vendor) with the cell
  value in `target` and the `vendor_id_` prefix stripped from the vendor name.
  """
  key_cols = ['customer_id','location_number']
  vendor_cols = list(df.columns[2:])  # everything after the two key columns

  long_df = df.melt(
      id_vars=key_cols,
      value_vars=vendor_cols,
      var_name='vendor_id',
      value_name='target',
  )

  # keep only the part after the "vendor_id_" prefix
  vendor_name_parts = long_df["vendor_id"].str.split("vendor_id_")
  long_df["vendor_id"] = vendor_name_parts.str.get(1)
  return long_df

In [None]:
def predict_cosine_neighbours(nearest_neighbours,train_custvend_distances, test_custvend_distances, train_custlocn_vendor_order, out_cols):
  """Predict vendor orders for test customer-locations by nearest-neighbour voting.

  Each test row is compared with every train row by cosine similarity of the
  vendor-distance vectors (all columns after the first two). For every vendor
  the predicted label is the most frequent label among the most similar
  train rows.

  Args:
    nearest_neighbours: Number of most similar train rows that vote.
    train_custvend_distances: Train distances; first two columns are keys.
    test_custvend_distances: Test distances with `customer_id` and
      `location_number` as the first two columns.
    train_custlocn_vendor_order: Train labels, row-aligned (by position)
      with `train_custvend_distances`.
    out_cols: Output column names: `customer_id`, `location_number` and the
      vendor label columns.

  Returns:
    Long-format predictions as produced by `unpivot_data`.
  """
  key_cols = ["customer_id","location_number"]
  # Take the vendor columns from the arguments rather than from a notebook global.
  vendor_cols = [col for col in out_cols if col not in key_cols]

  similarity_scores = cosine_similarity(test_custvend_distances.iloc[:,2:], Y=train_custvend_distances.iloc[:,2:], dense_output=True) #output will be (test.shape,train.shape)

  #determine the indices of top 'k' nearest train customers (never more than there are train rows)
  k = min(nearest_neighbours, similarity_scores.shape[1])
  similar_customerlocn_indices = np.argpartition(similarity_scores, kth = -k, axis=-1)[:,-k:]

  # Positional lookup: the neighbour indices are row positions, so index the
  # underlying array instead of relying on the frame having a 0..n-1 index.
  train_labels = train_custlocn_vendor_order[vendor_cols].to_numpy()

  predictions = {}
  #for every vendor
  for position, vend in enumerate(vendor_cols):
    #obtain the label corresponding to the nearest neighbours that has the maximum votes
    neighbour_votes = train_labels[:, position][similar_customerlocn_indices]
    # ravel(): depending on the SciPy version mode() returns shape (n,) or (n, 1)
    predictions[vend] = np.asarray(mode(neighbour_votes, axis=1)[0]).ravel()

  test_out_vendor = pd.DataFrame(predictions, index=test_custvend_distances.index)
  for position, key in enumerate(key_cols):
    test_out_vendor.insert(position, key, test_custvend_distances[key].to_numpy())
  test_out_vendor = test_out_vendor[list(out_cols)]

  test_out_vendor_melt = unpivot_data(test_out_vendor)
  return test_out_vendor_melt

In [None]:
# Input CSV files are read from the current working directory.
input_location = os.getcwd()
output_location = "/content/drive/MyDrive/collab_similarity/"
train_customer_demographics = "train_customers.csv"
test_customer_demographics = "test_customers.csv"
train_customer_locations = "train_locations.csv"
test_customer_locations  = "test_locations.csv"
vendor_file = "vendors.csv"
orders_file = "orders.csv"

customer_similarity_features = ["customer_id","location_number","location_type",'latitude', 'longitude','gender', 'verified', 'language','year_customer_created', 'month_customer_created']
num_cols = ['customer_id','location_number','latitude','longitude']  #'customer_id','location' added as key

In [None]:
#Process and Extract Vendor features
vendors = process_vendors(input_location,vendor_file)
# Only the id and coordinates are needed for the distance computation
vendor_locations = vendors[['vendor_id','latitude', 'longitude']]
# NOTE: a bare expression in the middle of a cell is not displayed
vendors.shape

#Process and Merge customer demographics and customer location data
train_customers = process_customer_demo(input_location,train_customer_demographics)
train_locations = process_customer_location(input_location,train_customer_locations)
train_location_demograph  = merge_demo_loc(train_customers,train_locations)       

# Sanity check of the frame sizes
print(train_customers.shape)
print(train_locations.shape)
print(train_location_demograph.shape)

(100, 16)

For every customer-location, indicate whether an order is present for each of the 100 vendors

Matrix of customer and vendor outcomes

In [None]:
#Create matrix of customer-vendor targets (to be used after determining the nearest neighbours)
custlocn_vendor_order = pd.read_csv(os.path.join(os.getcwd(), orders_file))
custlocn_vendor_order = custlocn_vendor_order.drop_duplicates(subset=['CID X LOC_NUM X VENDOR'])
custlocn_vendor_order = custlocn_vendor_order[["customer_id","LOCATION_NUMBER","vendor_id"]]
# One indicator column per vendor; the explicit dtype keeps the targets as 0/1 integers
custlocn_vendor_order = pd.get_dummies(custlocn_vendor_order, columns=['vendor_id'], dtype="uint8")
# Collapse to one row per customer-location by taking the max of each indicator,
# so that every vendor the pair ordered from stays flagged. Dropping duplicate
# (customer, location) rows here would keep only the first vendor and lose the rest.
custlocn_vendor_order = (
    custlocn_vendor_order
    .groupby(["customer_id","LOCATION_NUMBER"], as_index=False, sort=False, dropna=False)
    .max()
)
custlocn_vendor_order = custlocn_vendor_order.rename(columns={"LOCATION_NUMBER":"location_number"})
print(custlocn_vendor_order.shape)
custlocn_vendor_order.head()

(43641, 102)


Unnamed: 0,customer_id,location_number,vendor_id_4,vendor_id_13,vendor_id_20,vendor_id_23,vendor_id_28,vendor_id_33,vendor_id_43,vendor_id_44,vendor_id_55,vendor_id_66,vendor_id_67,vendor_id_75,vendor_id_76,vendor_id_78,vendor_id_79,vendor_id_81,vendor_id_82,vendor_id_83,vendor_id_84,vendor_id_85,vendor_id_86,vendor_id_90,vendor_id_92,vendor_id_104,vendor_id_105,vendor_id_106,vendor_id_110,vendor_id_113,vendor_id_115,vendor_id_134,vendor_id_145,vendor_id_148,vendor_id_149,vendor_id_154,vendor_id_157,vendor_id_159,vendor_id_160,vendor_id_161,vendor_id_176,vendor_id_180,vendor_id_188,vendor_id_189,vendor_id_191,vendor_id_192,vendor_id_193,vendor_id_195,vendor_id_196,vendor_id_197,vendor_id_199,vendor_id_201,vendor_id_203,vendor_id_207,vendor_id_216,vendor_id_221,vendor_id_225,vendor_id_231,vendor_id_237,vendor_id_243,vendor_id_250,vendor_id_259,vendor_id_265,vendor_id_271,vendor_id_274,vendor_id_288,vendor_id_289,vendor_id_294,vendor_id_295,vendor_id_298,vendor_id_299,vendor_id_300,vendor_id_303,vendor_id_304,vendor_id_310,vendor_id_356,vendor_id_386,vendor_id_391,vendor_id_398,vendor_id_401,vendor_id_419,vendor_id_459,vendor_id_537,vendor_id_547,vendor_id_573,vendor_id_575,vendor_id_577,vendor_id_578,vendor_id_582,vendor_id_583,vendor_id_676,vendor_id_679,vendor_id_681,vendor_id_841,vendor_id_843,vendor_id_845,vendor_id_846,vendor_id_849,vendor_id_855,vendor_id_856,vendor_id_858,vendor_id_907
0,92PEE24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,QS68UD8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,MB7VY5F,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,KDJ951Y,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,BAL0RVT,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


For every customer-location find the haversine distance to all the 100 vendors

In [None]:
# Distance from every train customer-location to every vendor (one column per vendor id)
train_custvend_distances = calculate_vendor_distances(train_locations[['customer_id', 'location_number', 'latitude', 'longitude']],vendor_locations)
print(train_custvend_distances.shape)
train_custvend_distances.head()

(59503, 102)
(16720, 102)


Unnamed: 0,customer_id,location_number,4,13,20,23,28,33,43,44,55,66,67,75,76,78,79,81,82,83,84,85,86,90,92,104,105,106,110,113,115,134,145,148,149,154,157,159,160,161,176,180,188,189,191,192,193,195,196,197,199,201,203,207,216,221,225,231,237,243,250,259,265,271,274,288,289,294,295,298,299,300,303,304,310,356,386,391,398,401,419,459,537,547,573,575,577,578,582,583,676,679,681,841,843,845,846,849,855,856,858,907
0,02SFNJH,0,8847.430953,8845.913268,8834.488717,8847.350413,8821.455686,8845.865539,8822.630135,8773.95283,8777.217425,8822.789963,8816.689597,8847.636648,8824.817927,8785.290479,8818.130448,8771.072032,8834.763348,8781.029056,8773.850365,8847.63046,8769.716532,8774.301566,8845.950526,8834.542115,8773.946395,8818.173231,8822.408618,8818.121934,8818.41631,8832.026657,8771.757628,8771.660123,8820.26732,8775.660652,8820.729297,8847.671931,8773.88021,8750.486091,8836.700997,8771.032559,8834.562341,8774.305525,8834.300089,8840.894719,8757.06123,8770.33236,8768.894515,8783.807455,8832.540668,8820.59941,8822.89387,8773.831311,8847.400708,8822.784705,8774.571212,6818.544486,8773.952488,8834.450085,8847.590684,8822.784705,8757.332062,8843.421823,8822.804606,8845.804461,8773.913288,8769.310125,8846.009683,8834.500788,8779.432523,8774.527481,8815.147457,8769.240638,8818.812427,8771.959588,8773.825912,8844.300909,8774.769562,8822.370624,8767.608912,8758.147977,8769.210668,8828.684776,8838.872727,8771.6789,8788.600672,8818.83861,8815.809204,8774.222306,8825.383064,8818.213856,8774.241171,8845.525755,8775.245211,8782.13683,8774.127012,8759.951562,8838.073276,8814.492856,8826.749558,8594.804073
1,02SFNJH,1,252.164133,239.169892,232.429981,251.80742,135.377332,241.72859,201.005308,300.654712,325.297604,134.39054,209.110798,253.47837,159.61456,256.437622,119.155744,277.143001,233.488848,298.001022,308.102997,253.450597,254.604863,264.292561,241.453473,190.719924,304.044087,120.896195,186.439244,119.650611,125.185945,193.547575,265.537271,252.37565,191.644923,293.219745,182.34407,253.416182,300.393751,301.881184,235.934317,314.675252,232.218085,291.922508,193.804767,2.773773,293.550074,276.54631,394.612354,265.048627,73.615197,186.813361,185.334418,300.729554,252.053301,185.391057,287.354571,14676.030506,301.399552,59.950178,243.598875,185.391057,297.219071,223.933674,154.482537,248.81788,308.354794,338.251717,241.354899,233.281712,286.501092,286.471095,188.742377,337.777361,190.656572,291.26661,307.806374,256.217107,266.750443,188.092063,252.949941,311.424474,257.494798,182.14742,37.465851,266.075846,277.404091,127.865111,209.770588,263.257431,200.816555,121.38078,263.688269,241.897637,336.569681,210.04293,247.235985,374.92172,51.881119,161.841306,185.586829,1318.835322
2,02SFNJH,2,14.401647,9.728939,10.318676,14.105371,109.552654,9.136977,44.542556,80.745198,97.061827,110.173832,40.095201,15.455384,84.475337,52.076705,126.895086,70.581598,9.217768,74.577715,85.812345,15.431422,67.072953,63.873378,9.247769,51.843498,82.978891,125.125013,58.682675,126.398713,120.722909,49.295128,66.489751,65.098416,54.447309,75.004007,63.263761,15.43552,80.62338,97.619581,6.510901,92.068036,10.507971,75.183878,48.779568,245.071282,88.71416,70.980622,160.708218,55.443948,315.422328,58.972566,59.59854,80.870602,14.305468,59.579173,72.477833,14530.963477,81.228017,301.877939,10.888311,59.579173,90.260914,19.651577,90.10371,11.058487,85.951609,111.077017,9.317917,9.486098,68.478888,72.047866,59.511819,110.722046,56.002851,76.481067,85.6201,15.713369,64.030149,57.086228,69.149572,97.624899,67.72648,61.213949,279.744427,66.681693,56.762918,117.889617,40.064646,63.731486,43.704492,124.622695,63.801417,8.785462,107.173045,68.164708,62.781773,145.667357,294.130695,85.715408,58.237419,1555.739902
3,RU43CXC,0,84.344361,72.198427,60.973597,83.990952,44.109685,74.404691,26.829867,121.965622,146.146724,45.247726,31.880913,85.608997,23.15356,77.769157,60.176857,99.526264,62.060661,118.923319,129.288599,85.581486,78.885419,86.832348,74.203723,28.362611,125.291839,58.417373,15.374232,59.677526,54.088552,27.770353,88.458697,76.38508,17.393111,114.547607,11.770679,85.567916,121.715431,126.312232,65.122303,135.939004,60.811403,113.396935,29.812698,181.720352,117.334455,99.073143,215.369807,86.303509,250.423703,14.064928,15.191291,122.048314,84.232343,15.122428,108.923094,14548.243535,122.695963,237.150371,76.84274,15.122428,120.717352,58.038347,26.464477,80.65685,129.532335,159.292175,74.145309,61.75927,107.67025,108.068859,11.657538,158.826088,15.602311,112.990973,128.998341,86.75844,89.079479,16.368555,77.992055,134.064932,81.613286,19.465002,215.701742,88.977016,98.206833,51.395679,32.255273,85.871524,28.276705,57.926536,86.273578,74.395187,157.408885,37.420445,71.130347,195.999571,229.866947,17.494183,18.852236,1486.208821
4,BDFBPRD,0,346.015485,333.006489,326.026457,345.657927,228.003748,335.563513,294.086647,391.457861,416.602749,227.125205,301.899617,347.331689,252.657446,347.485349,211.235545,367.357192,327.093427,389.260721,398.995878,347.303853,344.23749,354.551561,335.289695,284.297807,394.89158,213.009241,279.453472,211.737538,217.391507,287.036423,355.594255,342.148378,284.557602,384.046068,275.239174,347.269948,391.18818,390.78453,329.590907,405.457475,325.816563,382.630558,287.37624,91.228024,382.864314,366.686774,486.06708,356.100468,20.953366,279.722728,278.371823,391.524988,345.904328,278.422477,378.016295,14723.854671,392.212631,34.25318,337.455598,278.422477,386.636401,317.73667,247.361384,342.649222,399.255193,429.204933,335.191985,326.878854,377.512237,377.116352,281.308654,428.720847,283.476107,381.786586,398.69381,350.02364,357.094461,281.110935,342.331254,401.19016,347.143936,275.476273,56.459671,356.136286,368.947,220.153255,302.511125,353.490279,294.033721,213.506202,353.930794,335.727645,427.867514,299.899489,337.135837,465.711268,42.042356,254.11853,278.83029,1225.969583


In [None]:
#merge customer_vendor_distances and customer_vendor_order to synchronize their indices
train_custvend_distances = pd.merge(train_custvend_distances, custlocn_vendor_order, on=["customer_id","location_number"], how="left")

#fill nulls for those vendors whose entries for customer locations are not present in the order table
train_custvend_distances = train_custvend_distances.fillna(0)

#Split data to train and cv
# Fixed seed so the split, and the neighbour search evaluated on it, can be reproduced
train_custvend_distances, cv_custvend_distances = train_test_split(train_custvend_distances,test_size=0.15,random_state=42)
print(train_custvend_distances.shape,cv_custvend_distances.shape)

#vendor_id columns from order table
ordered_vendor_cols = list(custlocn_vendor_order.iloc[:,2:].columns)
out_cols = list(custlocn_vendor_order.columns)

(50577, 202) (8926, 202)


In [None]:
#Train split: pull the order targets (with keys) out first, then strip them from the distance frame
train_custlocn_vendor_order = train_custvend_distances[["customer_id","location_number"] + ordered_vendor_cols].reset_index(drop=True)
train_custvend_distances = train_custvend_distances.drop(columns=ordered_vendor_cols).reset_index(drop=True)

#CV split: same separation, so both frames of a split share a fresh 0..n-1 index
cv_custlocn_vendor_order = cv_custvend_distances[["customer_id","location_number"] + ordered_vendor_cols].reset_index(drop=True)
cv_custvend_distances = cv_custvend_distances.drop(columns=ordered_vendor_cols).reset_index(drop=True)

print(train_custlocn_vendor_order.shape)
print(train_custvend_distances.shape)
print(cv_custlocn_vendor_order.shape)
print(cv_custvend_distances.shape)
train_custlocn_vendor_order.head()

(50577, 102)
(50577, 102)
(8926, 102)
(8926, 102)


Unnamed: 0,customer_id,location_number,vendor_id_4,vendor_id_13,vendor_id_20,vendor_id_23,vendor_id_28,vendor_id_33,vendor_id_43,vendor_id_44,vendor_id_55,vendor_id_66,vendor_id_67,vendor_id_75,vendor_id_76,vendor_id_78,vendor_id_79,vendor_id_81,vendor_id_82,vendor_id_83,vendor_id_84,vendor_id_85,vendor_id_86,vendor_id_90,vendor_id_92,vendor_id_104,vendor_id_105,vendor_id_106,vendor_id_110,vendor_id_113,vendor_id_115,vendor_id_134,vendor_id_145,vendor_id_148,vendor_id_149,vendor_id_154,vendor_id_157,vendor_id_159,vendor_id_160,vendor_id_161,vendor_id_176,vendor_id_180,vendor_id_188,vendor_id_189,vendor_id_191,vendor_id_192,vendor_id_193,vendor_id_195,vendor_id_196,vendor_id_197,vendor_id_199,vendor_id_201,vendor_id_203,vendor_id_207,vendor_id_216,vendor_id_221,vendor_id_225,vendor_id_231,vendor_id_237,vendor_id_243,vendor_id_250,vendor_id_259,vendor_id_265,vendor_id_271,vendor_id_274,vendor_id_288,vendor_id_289,vendor_id_294,vendor_id_295,vendor_id_298,vendor_id_299,vendor_id_300,vendor_id_303,vendor_id_304,vendor_id_310,vendor_id_356,vendor_id_386,vendor_id_391,vendor_id_398,vendor_id_401,vendor_id_419,vendor_id_459,vendor_id_537,vendor_id_547,vendor_id_573,vendor_id_575,vendor_id_577,vendor_id_578,vendor_id_582,vendor_id_583,vendor_id_676,vendor_id_679,vendor_id_681,vendor_id_841,vendor_id_843,vendor_id_845,vendor_id_846,vendor_id_849,vendor_id_855,vendor_id_856,vendor_id_858,vendor_id_907
0,TJJ7XD9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,X73HY1W,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HVY0FVW,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,KFJ3KUM,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,T2IY8Y9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Take cosine similarity between train and test distance matrices**

*Reference for argpartition*

https://stackoverflow.com/questions/52465066/how-does-numpys-argpartition-work-on-the-documentations-example

In [None]:
#Hyperparameter Search for number of nearest neighbours
# Candidate neighbourhood sizes to evaluate
nn_list = [3,5,7,9,11,13,15,17,19,21,23]
# Ground-truth labels in long format (one row per customer-location x vendor)
train_out_true = unpivot_data(train_custlocn_vendor_order)
cv_out_true    = unpivot_data(cv_custlocn_vendor_order)
# Maps each candidate number of neighbours to its macro F1 on the cv split
f1_dict = dict()

for nn in nn_list:
  # Predict the cv rows from the train rows using `nn` neighbours
  cv_out_pred = predict_cosine_neighbours(nn, train_custvend_distances,cv_custvend_distances,train_custlocn_vendor_order,out_cols)
  f1_cv = f1_score(cv_out_true['target'].values, cv_out_pred['target'].values,average='macro')
  f1_dict[nn] = f1_cv
  print("F1 score for {} nearest neighbours is {}".format(nn,f1_cv))

F1 score for 3 nearest neighbours is 0.5261658721950357
F1 score for 5 nearest neighbours is 0.5178769180882968
F1 score for 7 nearest neighbours is 0.5117666207972918
F1 score for 9 nearest neighbours is 0.5091393295822535
F1 score for 11 nearest neighbours is 0.5069411817429518
F1 score for 13 nearest neighbours is 0.5055015013959512
F1 score for 15 nearest neighbours is 0.5046195876008102
F1 score for 17 nearest neighbours is 0.5043310724907547
F1 score for 19 nearest neighbours is 0.5037432627721101
F1 score for 21 nearest neighbours is 0.5032982666302791
F1 score for 23 nearest neighbours is 0.5032982666302791


In [None]:
# Pick the neighbour count with the highest cv macro F1 (first one wins on ties)
best_nn = max(f1_dict, key=f1_dict.get)
cv_out_pred = predict_cosine_neighbours(
    best_nn,
    train_custvend_distances,
    cv_custvend_distances,
    train_custlocn_vendor_order,
    out_cols,
)
f1_cv = f1_score(cv_out_true["target"], cv_out_pred["target"], average='macro')
print("F1 score for {} nearest neighbours is {}".format(best_nn,f1_cv))

F1 score for 3 nearest neighbours is 0.5261658721950357


In [None]:
#generating predictions for the test customers
test_customers = process_customer_demo(input_location,test_customer_demographics)
test_locations = process_customer_location(input_location,test_customer_locations)
test_location_demograph  = merge_demo_loc(test_customers,test_locations)
test_custvend_distances = calculate_vendor_distances(test_locations[['customer_id', 'location_number', 'latitude', 'longitude']],vendor_locations)
# Use the neighbour count selected on the cv split, not the leftover loop
# variable `nn`, which merely holds the last candidate that was tried.
test_out_pred = predict_cosine_neighbours(best_nn, train_custvend_distances,test_custvend_distances,train_custlocn_vendor_order,out_cols)

print(test_customers.shape)
print(test_locations.shape)
print(test_location_demograph.shape)
print(test_custvend_distances.shape)
print(test_out_pred.shape)

(9753, 4)
(16720, 5)
(16720, 8)
(16720, 102)
(1672000, 4)
