### Feature engineering using orders and vendors information ###

During our initial analysis we decided to remove many columns from train_full. Here we will be adding in columns based on vendor and order information that matches with the customer and vendor in question. 

Below is a list of features we want to add: 
- Has ordered here before: binary (orders.csv) 
- Grand total: Average across customer’s order from vendor. (orders.csv) 
- Is_favorite: Set missing equal to 0, 1 otherwise. (orders.csv) 
- Distance between customer location and vendor location (train_full) 
- Number of times customer has ordered from this vendor tag (orders.csv) 
- One-hot encoding of vendor tag (train_full/vendors.csv) 
- Is_rated and vendor_rating (orders.csv) 

In [2]:
import pandas as pd
import numpy as np

# Read in CSV
orders = pd.read_csv('../data/orders.csv')
vendors = pd.read_csv('../data/vendors.csv')
sm_train = pd.read_csv('../data/sm_train_full.csv')
lg_train = pd.read_csv('../data/lg_train_full.csv')

  orders = pd.read_csv('../data/orders.csv')


Removing columns dropped from training datasets.

In [3]:
# Gathering the list of columns we removed from train_full.csv
columns = ['is_open', 'status_y', 'device_type', 'verified_y', 'commission', 'is_akeed_delivering', 'language', 'open_close_flags', 'one_click_vendor', 'country_id', 'city_id', 'display_orders', 'gender', 'location_type', 'OpeningTime', 'OpeningTime2', 'sunday_from_time1', 'sunday_to_time1', 'sunday_from_time2', 'sunday_to_time2', 'monday_from_time1', 'monday_to_time1', 'monday_from_time2', 'monday_to_time2', 'tuesday_from_time1', 'tuesday_to_time1', 'tuesday_from_time2', 'tuesday_to_time2', 'wednesday_from_time1', 'wednesday_to_time1', 'wednesday_from_time2', 'wednesday_to_time2', 'thursday_from_time1', 'thursday_to_time1', 'thursday_from_time2', 'thursday_to_time2', 'friday_from_time1', 'friday_to_time1', 'friday_from_time2', 'friday_to_time2', 'saturday_from_time1', 'saturday_to_time1', 'saturday_from_time2', 'saturday_to_time2', 'primary_tags', 'status_x', 'verified_x', 'created_at_x', 'updated_at_x', 'authentication_id', 'vendor_category_en', 'rank', 'created_at_y', 'updated_at_y']

# Drop columns from the dataframes if they were removed from train_full.csv
for col in columns:
    if col in orders.columns:
        orders = orders.drop(col, axis=1)
    if col in vendors.columns:
        vendors = vendors.drop(col, axis=1)
    
        
# Drop rows in orders where the location_number != 0
orders = orders[orders['LOCATION_NUMBER'] == 0]

print("Cleaned up dataframes columns")
print(orders.columns)
print(vendors.columns)

temp_sm = sm_train.copy()

print(sm_train.columns)
print(lg_train.columns)

# print(orders['LOCATION_NUMBER'].value_counts())
# print(orders['is_rated'].value_counts())
# print(orders['vendor_rating'].value_counts())

Cleaned up dataframes columns
Index(['akeed_order_id', 'customer_id', 'item_count', 'grand_total',
       'payment_mode', 'promo_code', 'vendor_discount_amount',
       'promo_code_discount_percentage', 'is_favorite', 'is_rated',
       'vendor_rating', 'driver_rating', 'deliverydistance', 'preparationtime',
       'delivery_time', 'order_accepted_time', 'driver_accepted_time',
       'ready_for_pickup_time', 'picked_up_time', 'delivered_time',
       'delivery_date', 'vendor_id', 'created_at', 'LOCATION_NUMBER',
       'LOCATION_TYPE', 'CID X LOC_NUM X VENDOR'],
      dtype='object')
Index(['id', 'latitude', 'longitude', 'vendor_category_id', 'delivery_charge',
       'serving_distance', 'prepration_time', 'discount_percentage', 'status',
       'verified', 'vendor_rating', 'vendor_tag', 'vendor_tag_name',
       'created_at', 'updated_at'],
      dtype='object')
Index(['customer_id', 'location_number', 'latitude_x', 'longitude_x', 'id',
       'latitude_y', 'longitude_y', 'vendor_cat

Feature: Has Ordered Before
- Binary indicated customer has ordered from here before (1) or not (0).

In [4]:
column_name = 'CID X LOC_NUM X VENDOR'

# Ensure the columns are treated as strings
orders[column_name] = orders[column_name].astype(str)
temp_sm[column_name] = temp_sm[column_name].astype(str)

# Add the 'ordered_before' column to orders dataframe and initialize with 0
temp_sm['ordered_before'] = 0

# Iterate through each row in orders dataframe
for index, row in orders.iterrows():
    if row[column_name] in orders[column_name].values:
        temp_sm.at[index, 'ordered_before'] = 1


# Print the first few rows of the updated orders dataframe to verify
print(temp_sm[['customer_id', 'id', 'ordered_before']].head(5))

  customer_id  id  ordered_before
0     TCHWPBT   4               1
1     TCHWPBT  13               1
2     TCHWPBT  28               1
3     TCHWPBT  33               1
4     TCHWPBT  43               1


Feature: Average total by customer and vendor
- Numeric value

In [13]:
# Create new columns
temp_sm['vendor_average'] = 0
temp_sm['customer_average'] = 0

# Calculate the average total cost by vendor and customer
vendor_avg = orders.groupby('vendor_id')['grand_total'].mean().to_dict()
customer_avg = orders.groupby('customer_id')['grand_total'].mean().to_dict()

print(len(customer_avg))

# Add the 'vendor_average' and 'customer_average' columns to sm_train dataframe
temp_sm['vendor_average'] = temp_sm['id'].map(vendor_avg).fillna(0)
temp_sm['customer_average'] = temp_sm['customer_id'].map(customer_avg).fillna(0)

print(temp_sm[['customer_id', 'id', 'vendor_average', 'customer_average']].head(5))

25450
  customer_id  id  vendor_average  customer_average
0     TCHWPBT   4       16.856517               5.4
1     TCHWPBT  13       18.324214               5.4
2     TCHWPBT  28       12.308660               5.4
3     TCHWPBT  33       22.147441               5.4
4     TCHWPBT  43       12.038301               5.4


Feature: Favorite Vendor
- Binary. 1 if favorited, 0 if not or null.

In [16]:
# Add the column to dataframe and initialize with 0
temp_sm['is_favorite'] = 0

# Filter to only include orders that have been rated
fav_orders = orders[orders['is_rated'] == 'Yes']
print(fav_orders.shape)

# Iterate through each row in orders dataframe
for index, row in fav_orders.iterrows():
    if row[column_name] in temp_sm[column_name].values:
        temp_sm.at[index, 'is_favorite'] = 1
        
print(temp_sm[['customer_id', 'id', 'is_favorite']].head(5))

(8003, 26)
  customer_id  id  is_favorite
0     TCHWPBT   4            0
1     TCHWPBT  13            0
2     TCHWPBT  28            0
3     TCHWPBT  33            0
4     TCHWPBT  43            0


Feature: Number of times customer has ordered from vendor

In [17]:
# Add the column to dataframe and initialize with 0
temp_sm['times_ordered'] = 0

# Count number of times customer has ordered from a specific vendor
number_times = orders.groupby(column_name).size().to_dict()

# Map values to dataset
temp_sm['times_ordered'] = temp_sm[column_name].map(number_times).fillna(0).astype(int)

print(temp_sm[['customer_id', 'id', 'times_ordered']].head(5))

target_counts = temp_sm['times_ordered'].value_counts()
print("Counts for number of times ordered:")
print(target_counts.sort_index())

  customer_id  id  times_ordered
0     TCHWPBT   4              0
1     TCHWPBT  13              0
2     TCHWPBT  28              0
3     TCHWPBT  33              0
4     TCHWPBT  43              0
Counts for number of times ordered:
times_ordered
0     423007
1       3175
2        660
3        247
4        114
5         82
6         49
7         20
8         19
9          7
10         6
11         8
12         2
13         2
14         2
15         1
17         2
22         1
51         1
Name: count, dtype: int64


Feature: Vendor Rating
- Binary if a customer has (1) or has not (0) rated the vendor.
- Vendor rating from customer, 0 if missing
- Overall vendor rating from vendors.csv

In [18]:
# Add the columns to dataframe and initialize with 0
temp_sm['is_rated'] = 0
temp_sm['customer_rating'] = 0

# Filter to only include orders that have been rated by customer
rated_orders = orders[orders['is_rated'] == 'Yes']

# Iterate through each row in ranked_orders dataframe
for index, row in rated_orders.iterrows():
    if row[column_name] in temp_sm[column_name].values:
        temp_sm.at[temp_sm[temp_sm[column_name] == row[column_name]].index[0], 'customer_rating'] = row['vendor_rating']
        temp_sm.at[temp_sm[temp_sm[column_name] == row[column_name]].index[0], 'is_rated'] = 1
        
print(temp_sm[['customer_id', 'id', 'customer_rating', 'vendor_rating', 'is_rated']].head(5))

  customer_id  id  customer_rating  vendor_rating  is_rated
0     TCHWPBT   4                0            4.4         0
1     TCHWPBT  13                0            4.7         0
2     TCHWPBT  28                0            4.4         0
3     TCHWPBT  33                0            4.6         0
4     TCHWPBT  43                0            4.3         0


Feature: Distance
- Distance from customer's location to vendor's location
- Serving distance from vendors.csv

_Note: Since the locations have been masked this may need to be normalized to a range to show some relationship._

In [19]:
# Initialize the distance column
temp_sm['distance'] = 0

# Euclidean distance
temp_sm['distance'] = np.sqrt((temp_sm['latitude_x'] - temp_sm['latitude_y'])**2 + (temp_sm['longitude_x'] - temp_sm['longitude_y'])**2)

print("Distance before transofrmation:")
print(temp_sm['distance'].describe())

# Apply a log transformation
temp_sm['distance'] = np.log1p(temp_sm['distance'])

# Calculate the statistics for transformation
distance_min = temp_sm['distance'].min()
distance_max = temp_sm['distance'].max()
distance_mean = temp_sm['distance'].mean()
distance_std = temp_sm['distance'].std()

serving_distance_mean = vendors['serving_distance'].mean()
serving_distance_std = vendors['serving_distance'].std()

# Rescale the distance column to match serving_distance range [2, 15]
temp_sm['distance_scaled'] = 2 + (temp_sm['distance'] - distance_min) / (distance_max - distance_min) * (15 - 2)

# Match mean and std of serving_distance
temp_sm['serving_distance'] = serving_distance_mean + ((temp_sm['distance_scaled'] - temp_sm['distance_scaled'].mean()) / 
                                                       temp_sm['distance_scaled'].std()) * serving_distance_std

# Clip values to the maximum serving distance (optional)
temp_sm['serving_distance'] = temp_sm['serving_distance'].clip(upper=15)

print("Distance column information:")
print(temp_sm['distance_scaled'].describe())

# Final summary statistics of the generated serving_distance column
print("Generated serving_distance column information:")
print(temp_sm['serving_distance'].describe())

# Display the updated DataFrame
print(temp_sm[['customer_id', 'id', 'distance', 'distance_scaled', 'serving_distance']].head(5))


Distance before transofrmation:
count    427405.000000
mean         33.988876
std          41.823155
min           0.000540
25%           0.704063
50%           1.493026
75%          79.047104
max        1044.847837
Name: distance, dtype: float64
Distance column information:
count    427405.000000
mean          6.058568
std           3.519228
min           2.000000
25%           2.995704
50%           3.707188
75%          10.194279
max          15.000000
Name: distance_scaled, dtype: float64
Generated serving_distance column information:
count    427405.000000
mean         11.087608
std           3.321687
min           7.012344
25%           8.174652
50%           9.005183
75%          15.000000
max          15.000000
Name: serving_distance, dtype: float64
  customer_id  id  distance  distance_scaled  serving_distance
0     TCHWPBT   4  4.774880        10.927795              15.0
1     TCHWPBT  13  4.775635        10.929208              15.0
2     TCHWPBT  28  4.781257        10.93972

Feature: Vendor Tags
If a customer has ordered a matching vendor tag to the vendor given. 
The number of matching tags in customers history to current vendor.

In [20]:
# New features
temp_sm['ordered_vendor_tag'] = 0
temp_sm['matching_tags_count'] = 0

vendors['vendor_tag'] = vendors['vendor_tag'].fillna('')  # Replace NaN with empty string

# Create mapping of vendor id to tags
# vendors['vendor_tag'] = vendors['vendor_tag'].apply(lambda x: set(map(int, x.split(','))) if x else set())
vendor_tags = dict(zip(vendors['id'], vendors['vendor_tag']))

# Tags customers have ordered
customer_tags = {}

# For each order, map the vendor's tags to the customer
for idx, row in orders.iterrows():
    customer_id = row['customer_id']
    vendor_id = row['vendor_id']
    if vendor_id in vendor_tags:  # Ensure the vendor exists in vendors_df
        tags = vendor_tags[vendor_id]
        if customer_id not in customer_tags:
            customer_tags[customer_id] = set(tags)
        else:
            customer_tags[customer_id].update(tags)

def calculate_tag_columns(row):
    customer_id = row['customer_id']
    vendor_id = row['id']
    
    # Get the vendor's tags
    vendor_tags_set = vendor_tags.get(vendor_id, set())
    
    # Get the customer's ordered tags
    customer_tags_set = customer_tags.get(customer_id, set())
    
    # Calculate the matching tags
    matching_tags = customer_tags_set.intersection(vendor_tags_set)
    ordered_vendor_tag = 1 if matching_tags else 0
    matching_tags_count = len(matching_tags)
    
    return pd.Series({
        'ordered_vendor_tag': ordered_vendor_tag,
        'matching_tags_count': matching_tags_count
    })

# Apply the function to train_full_df
temp_sm[['ordered_vendor_tag', 'matching_tags_count']] = temp_sm.apply(calculate_tag_columns, axis=1)

print(temp_sm.head())

  customer_id  location_number  latitude_x  longitude_x  id  latitude_y  \
0     TCHWPBT                0      -96.44        -67.2   4     -0.5884   
1     TCHWPBT                0      -96.44        -67.2  13     -0.4717   
2     TCHWPBT                0      -96.44        -67.2  28      0.4807   
3     TCHWPBT                0      -96.44        -67.2  33     -0.4946   
4     TCHWPBT                0      -96.44        -67.2  43     -0.1150   

   longitude_y  vendor_category_id  delivery_charge  serving_distance  ...  \
0       0.7544                 2.0              0.0              15.0  ...   
1       0.7446                 2.0              0.7              15.0  ...   
2       0.5527                 2.0              0.7              15.0  ...   
3       0.7430                 2.0              0.7              15.0  ...   
4       0.5460                 2.0              0.7              15.0  ...   

   vendor_average  customer_average  is_favorite times_ordered is_rated  \
0    

The locations being transformed this way may become an issue, but it now more closely represents the findings in 'serving_distance'

Before saving the dataset we will remove: 'vendor_tag', 'vendor_tag_name', 'CID X LOC_NUM X VENDOR' and make sure out small and large dataset contain the same columns. 

In [21]:
# Drop columns that are no longer needed
final_drop_cols = ['location_number_obj', 'id_obj','vendor_tag', 'vendor_tag_name', 'CID X LOC_NUM X VENDOR', 'location_number']

# Rename id to vendor_id
temp_sm = temp_sm.rename(columns={'id': 'vendor_id'})

for col in final_drop_cols:
    if col in temp_sm.columns:
        temp_sm = temp_sm.drop(col, axis=1)
    
print("Final columns in sm_train dataframe:")
print(temp_sm.columns)
print(temp_sm.shape)

# Save the updated sm_train dataframe to a new CSV file
temp_sm.to_csv('../data/new_sm_train.csv', index=False)

Final columns in sm_train dataframe:
Index(['customer_id', 'latitude_x', 'longitude_x', 'vendor_id', 'latitude_y',
       'longitude_y', 'vendor_category_id', 'delivery_charge',
       'serving_distance', 'prepration_time', 'discount_percentage',
       'vendor_rating', 'target', 'ordered_before', 'vendor_average',
       'customer_average', 'is_favorite', 'times_ordered', 'is_rated',
       'customer_rating', 'distance', 'distance_scaled', 'ordered_vendor_tag',
       'matching_tags_count'],
      dtype='object')
(427405, 24)
