In [1]:
# --------------------------------
import warnings
warnings.filterwarnings('ignore')
# --------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns

%matplotlib inline
#loading in datasets
def load_data():
  members = pd.read_csv('membership_info.csv')
  zip_data = pd.read_csv('zip_info.csv')
  volume = pd.read_csv('volume_info.csv')
  products = pd.read_csv('product_usage.csv')
  volume.fillna(0, inplace=True)
  products.fillna(0, inplace=True)
  members['number_of_locations'] = members['number_of_locations'].fillna(1)
  return volume, products, members, zip_data

In [2]:
load_data()

(          nmg_id   volume        date
 0         8310.0   3296.0  2012-12-01
 1         7955.0   1569.0  2012-12-01
 2        20044.0  25974.0  2012-12-01
 3         5235.0   8035.0  2012-12-01
 4       113248.0   6009.0  2012-12-01
 ...          ...      ...         ...
 296855    1686.0    899.0  2019-02-01
 296856    1270.0    172.0  2019-02-01
 296857    1510.0   2009.0  2019-02-01
 296858    4555.0    210.0  2019-02-01
 296859    2232.0   1419.0  2019-02-01
 
 [296860 rows x 3 columns],
               date  nmg_id  credit_card_processing_total  \
 0       2016-03-01    5557                           0.0   
 1       2021-07-01    7499                           0.0   
 2       2021-10-01    2191                           0.0   
 3       2019-04-01  104900                           0.0   
 4       2018-12-01    1495                           0.0   
 ...            ...     ...                           ...   
 181715  2023-05-01  104976                           0.0   
 181716  2022-

In [3]:
members = pd.read_csv('membership_info.csv')
zip_data = pd.read_csv('zip_info.csv')
volume = pd.read_csv('volume_info.csv')
products = pd.read_csv('product_usage.csv')
volume.fillna(0, inplace=True)
products.fillna(0, inplace=True)
members['number_of_locations'] = members['number_of_locations'].fillna(1)

### Data Preparation Function

In [4]:
def prepare_function_data(volume, products, input_date_str):
    volume['date'] = pd.to_datetime(volume['date'])
    products['date'] = pd.to_datetime(products['date'])
    input_date = pd.to_datetime(input_date_str)

    # start dates for trailing 12 month and 12 months previous
    t12_start = input_date - pd.DateOffset(months=12)
    p12_start = t12_start - pd.DateOffset(months=12)

    volume_t12 = volume.query("date > @t12_start & date <= @input_date").groupby('nmg_id')['volume'].sum().reset_index(name='t12_volume')
    volume_p12 = volume.query("date > @p12_start & date <= @t12_start").groupby('nmg_id')['volume'].sum().reset_index(name='p12_volume')

    # filter for T12 in products and sum numeric columns
    products_t12 = products.query("date > @t12_start & date <= @input_date")
    products_t12_sum = products_t12.groupby('nmg_id').agg({col: 'sum' for col in products_t12 if col not in ['date', 'nmg_id']}).reset_index()

    final_df = volume_t12.set_index('nmg_id').join(volume_p12.set_index('nmg_id'), on='nmg_id', how='outer', rsuffix='_p12')
    final_df = final_df.join(products_t12_sum.set_index('nmg_id'), on='nmg_id', how='outer')

    #  volume growth calculation
    final_df['t12_volume_growth'] = (final_df['t12_volume'] - final_df.get('p12_volume', 0))

    # filter out useless data points that will hurt knn model
    final_df_filtered = final_df[
        (final_df['t12_volume'] > 0) &
        (final_df['p12_volume'] > 0) &
        (final_df['t12_volume_growth'] / final_df['p12_volume'] <= 0.5)
    ].reset_index()

    # columns to include 't12_' prefix as necessary
    for col in ['credit_card_processing_total', 'inventory_finance_total', 'lease_to_own_total', 'product_protection_total', 'retail_credit_total']:
        if col in final_df_filtered.columns:
            final_df_filtered.rename(columns={col: f't12_{col}'}, inplace=True)
    final_df_filtered['volume_growth']= (final_df_filtered['t12_volume']-final_df_filtered['p12_volume'])/final_df_filtered['p12_volume']
    final_df_filtered = final_df_filtered.fillna(0)
    final_df_filtered = final_df_filtered.drop('t12_volume_growth',axis=1)
    # joining everything
    final_with_members = final_df_filtered.reset_index().merge(members, on='nmg_id', how='outer')

    members['zip_code'] = members['zip_code'].astype(str)
    zip_data['geo_id'] = zip_data['geo_id'].astype(str)

    final_with_zip = final_with_members.merge(zip_data, left_on='zip_code', right_on='geo_id', how='outer')

    final_with_zip = final_with_zip.fillna(0)
    final_with_zip = final_with_zip.drop(['index', 'level_0', 'geo_id'], axis = 1)
    def categorize_locations(value):
      if value == 1:
          return "Single Retailer"
      elif value < 5:
          return "Medium-sized Retailer"
      else:
         return "Large Retailer"

    final_with_zip['number_of_locations'] = final_with_zip['number_of_locations'].apply(categorize_locations)

    return final_with_zip


## Data Prep Function Usage

In [5]:
function_data = prepare_function_data(volume, products, '2024-01-01')

In [6]:
# volume['nmg_id']

In [35]:
function_data.head(20)

Unnamed: 0,nmg_id,t12_volume,p12_volume,t12_credit_card_processing_total,t12_inventory_finance_total,t12_lease_to_own_total,t12_product_protection_total,t12_retail_credit_total,volume_growth,company_name,industry_name,number_of_locations,zip_code,median_income,households,poverty_rate,total_pop,vacant_housing_rate,income_per_capita
0,1000.0,171197.83,277735.6,0.0,1027568.42,0.0,195782.19,96131.18,-0.383594,A-1 HOME APPLIANCE CENTER,Appliances,Medium-sized Retailer,70058,49551.0,13844.0,0.51423,40191.0,0.081841,27419.0
1,1002.0,36097.56,89740.93,0.0,2050776.48,0.0,66616.09,178371.82,-0.597758,BOB WALLACE APPLIANCE,Appliances,Medium-sized Retailer,35805,26598.0,8954.0,0.744025,19761.0,0.120281,17848.0
2,300157.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Haley's Flooring & Interiors (Inc),Home Improvement,Single Retailer,35805,26598.0,8954.0,0.744025,19761.0,0.120281,17848.0
3,1015.0,83984.05,171957.92,7895392.69,5382410.69,0.0,24085.39,314974.76,-0.511601,"STANTONS APPLIANCE SALES AND SERVICE, INC",Appliances,Medium-sized Retailer,70815,50862.0,10629.0,0.370214,30124.0,0.154859,26062.0
4,2594.0,121405.18,423310.06,0.0,0.0,0.0,0.0,0.0,-0.7132,ShoppersChoice.com,Appliances,Single Retailer,70815,50862.0,10629.0,0.370214,30124.0,0.154859,26062.0
5,8289.0,8986.9,8540.8,407171.13,0.0,0.0,2598.93,20569.71,0.052232,SHERWOOD TV AND APPLIANCE,Appliances,Single Retailer,70815,50862.0,10629.0,0.370214,30124.0,0.154859,26062.0
6,8498.0,1195.0,21611.0,0.0,0.0,13029.0,329.75,128835.11,-0.944704,"Furniture Expo, LLC",Furniture,Medium-sized Retailer,70815,50862.0,10629.0,0.370214,30124.0,0.154859,26062.0
7,1017.0,65057.87,173423.22,0.0,1421828.7,0.0,102347.08,185408.73,-0.624861,OLIVER DYERS APPLIANCE,Appliances,Single Retailer,76116,49174.0,22025.0,0.453439,50371.0,0.158638,32198.0
8,1018.0,156362.65,336598.91,0.0,1570385.42,0.0,7840.3,0.0,-0.535463,ALABAMA POWER,Appliances,Large Retailer,35203,25030.0,1595.0,0.484639,3026.0,0.186834,33190.0
9,8647.0,1124.3,5507.83,0.0,0.0,51243.95,2729.98,0.0,-0.795872,LEWIES APPLIANCE REPAIR,Appliances,Medium-sized Retailer,35203,25030.0,1595.0,0.484639,3026.0,0.186834,33190.0


### Initial Basic KNN Model


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

def find_top_growing_neighbors(df, input_nmg_id, n_neighbors=50, top_n=10, volume_weight=10):
    if 'nmg_id' not in df.columns:
        raise KeyError("'nmg_id' column not found in DataFrame.")
    df['nmg_id'] = df['nmg_id'].astype(str)

    # Retrieve the industry of the input company
    input_industry = df.loc[df['nmg_id'] == input_nmg_id, 'industry_name'].values
    if input_industry.size == 0:
        raise ValueError(f"No company found with nmg_id: {input_nmg_id}")

    # Filter the DataFrame for companies in the same industry
    df = df[df['industry_name'] == input_industry[0]]

    numeric_features = ['p12_volume', 'median_income', 'households', 'poverty_rate', 'total_pop', 'vacant_housing_rate', 'income_per_capita']
    categorical_features = ['number_of_locations']

    # Adjusting preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Apply preprocessing
    X = preprocessor.fit_transform(df[numeric_features + categorical_features])

    # Adjust the weight of p12_volume
    p12_volume_index = numeric_features.index('p12_volume')
    X[:, p12_volume_index] *= volume_weight

    # Fit Nearest Neighbors
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(X)

    input_index = df.index[df['nmg_id'] == input_nmg_id].tolist()
    if not input_index:
        raise ValueError(f"No company found with nmg_id: {input_nmg_id}")

    # Find nearest neighbors
    distances, indices = nn.kneighbors([X[input_index[0]]])
    nearest_indices = indices[0][1:]  # Exclude the first index since it's the input itself

    neighbors_df = df.iloc[nearest_indices].copy()
    top_growing_neighbors = neighbors_df.sort_values(by='volume_growth', ascending=False).head(top_n)

    columns_to_display = ['nmg_id', 'company_name', 'p12_volume', 'industry_name', 'number_of_locations', 'median_income', 'total_pop', 'volume_growth']
    return top_growing_neighbors[columns_to_display].reset_index(drop=True)


In [32]:
function_data['industry_name'][function_data['industry_name']!='Appliances']

Index([    2,     6,    11,    14,    16,    20,    21,    26,    30,    31,
       ...
       35860, 35861, 35862, 35863, 35864, 35865, 35866, 35867, 35868, 35869],
      dtype='int64', length=33774)

In [34]:
find_top_growing_neighbors(function_data, '105252.0')

Unnamed: 0,nmg_id,poverty_rate,company_name,t12_volume,industry_name,number_of_locations,median_income,total_pop,volume_growth
0,50508.0,0.256958,"CMM, Inc.",2187.0,Appliances,Single Retailer,44525.0,8610.0,0.366875
1,101843.0,0.127698,"SCOTT'S SALES & SERVICE, LLC",1715.2,Appliances,Single Retailer,47845.0,2495.0,0.331677
2,101431.0,0.188272,JOKERS WILD INC,4147.6,Appliances,Single Retailer,36652.0,4383.0,0.299781
3,101793.0,0.133918,PARISI APPLIANCE HOUSE,3289.0,Appliances,Single Retailer,71033.0,4361.0,0.284766
4,106728.0,0.254797,SIGOURNEY TV & APPLIANCE,3440.0,Appliances,Single Retailer,52969.0,2966.0,0.111111
5,7829.0,0.314583,BARRY ELECTRIC,1748.65,Appliances,Single Retailer,38838.0,4597.0,0.088146
6,2781.0,0.294301,Frnka Corp. A/C Sales & Appliances,0.0,Appliances,Single Retailer,55775.0,6698.0,0.0
7,101720.0,0.280776,MEMPHIS MERCANTILE APPLIANCES,0.0,Appliances,Single Retailer,46983.0,3380.0,0.0
8,50410.0,0.332384,"R & M Service Center, LLC",0.0,Appliances,Single Retailer,37271.0,2114.0,0.0
9,7163.0,0.230122,WILSON APPL SLS & SVC,0.0,Appliances,Single Retailer,55196.0,6099.0,0.0


In [10]:
columns_to_keep = [
    'nmg_id',
    'company_name',
    't12_volume',
    'industry_name',
    'number_of_locations',
    'median_income',
    'total_pop',
    'volume_growth']


filtered_data = function_data[function_data['nmg_id'] == "202317.0"][columns_to_keep]

# the initial nmg_id
filtered_data


Unnamed: 0,nmg_id,company_name,t12_volume,industry_name,number_of_locations,median_income,total_pop,volume_growth
888,202317.0,"46 Solutions, Inc",0.0,Electronics,Single Retailer,79977.0,42432.0,0.0


### Understanding Product Usage of Comparable Retailers

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

def evaluate_product_usage(df):
    product_columns = [
        't12_credit_card_processing_total',
        't12_inventory_finance_total',
        't12_lease_to_own_total',
        't12_product_protection_total',
        't12_retail_credit_total'
    ]
    usage_columns = [
        'credit_card_processing',
        'inventory_finance',
        'lease_to_own',
        'product_protection',
        'retail_credit'
    ]
    for usage_col in usage_columns:
        df[usage_col] = 'No'
    for product_col, usage_col in zip(product_columns, usage_columns):
        df[usage_col] = df[product_col].apply(lambda x: 'Yes' if x > 0 else 'No')
    reordered_columns = ['nmg_id']
    for product_col, usage_col in zip(product_columns, usage_columns):
        reordered_columns.extend([usage_col, product_col])
    additional_columns = [col for col in df.columns if col not in reordered_columns and col not in product_columns]
    reordered_columns.extend(additional_columns)
    return df[reordered_columns]

def find_top_growing_neighbors(df, input_nmg_id, n_neighbors=50, top_n=10, volume_weight=10):
    if 'nmg_id' not in df.columns:
        raise KeyError("'nmg_id' column not found in DataFrame.")
    df['nmg_id'] = df['nmg_id'].astype(str)

    # Retrieve the industry of the input company
    input_industry = df.loc[df['nmg_id'] == input_nmg_id, 'industry_name'].values
    if input_industry.size == 0:
        raise ValueError(f"No company found with nmg_id: {input_nmg_id}")

    # Filter the DataFrame for companies in the same industry
    df = df[df['industry_name'] == input_industry[0]]

    numeric_features = ['p12_volume', 'median_income', 'households', 'poverty_rate', 'total_pop', 'vacant_housing_rate', 'income_per_capita']
    categorical_features = ['number_of_locations']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Apply preprocessing
    X = preprocessor.fit_transform(df[numeric_features + categorical_features])

    # Adjust the weight of p12_volume
    p12_volume_index = numeric_features.index('p12_volume')
    X[:, p12_volume_index] *= volume_weight

    # Fit Nearest Neighbors
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(X)

    input_index = df.index[df['nmg_id'] == input_nmg_id].tolist()
    if not input_index:
        raise ValueError(f"No company found with nmg_id: {input_nmg_id}")

    # Find nearest neighbors
    distances, indices = nn.kneighbors([X[input_index[0]]])
    nearest_indices = indices[0][1:]  # Exclude the first index since it's the input itself

    neighbors_df = df.iloc[nearest_indices].copy()
    top_growing_neighbors = neighbors_df.sort_values(by='volume_growth', ascending=False).head(top_n)

    # Apply product usage evaluation to the top growing comparable retailers
    top_growing_neighbors_with_product_usage = evaluate_product_usage(top_growing_neighbors)

    columns_to_display = [
        'nmg_id',
        'credit_card_processing',
        't12_credit_card_processing_total',
        'inventory_finance',
        't12_inventory_finance_total',
        'lease_to_own',
        't12_lease_to_own_total',
        'product_protection',
        't12_product_protection_total',
        'retail_credit',
        't12_retail_credit_total'
    ]

    return top_growing_neighbors_with_product_usage[columns_to_display].reset_index(drop=True)


In [12]:
top_neighbors_with_product_usage = find_top_growing_neighbors(function_data, "50100.0")

top_neighbors_with_product_usage.head(10)

Unnamed: 0,nmg_id,credit_card_processing,t12_credit_card_processing_total,inventory_finance,t12_inventory_finance_total,lease_to_own,t12_lease_to_own_total,product_protection,t12_product_protection_total,retail_credit,t12_retail_credit_total
0,7624.0,No,0.0,Yes,69444.5,No,0.0,Yes,820.93,Yes,16340.63
1,9466.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
2,60017.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
3,60018.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
4,9246.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
5,6133.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
6,114421.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
7,7075.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
8,60042.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0
9,60055.0,No,0.0,No,0.0,No,0.0,No,0.0,No,0.0


In [22]:
def find_top_growing_neighbors_same_industry(df, input_nmg_id, n_neighbors=50, top_n=5, volume_weight=10):
    if 'nmg_id' not in df.columns:
        raise KeyError("'nmg_id' column not found in DataFrame.")
    df['nmg_id'] = df['nmg_id'].astype(str)

    # Convert specified categorical columns to strings
    categorical_features = ['industry_name', 'number_of_locations']
    for feature in categorical_features:
        df[feature] = df[feature].astype(str)

    numeric_features = ['p12_volume', 'median_income', 'households', 'poverty_rate', 'total_pop', 'vacant_housing_rate', 'income_per_capita']
    features_to_use = numeric_features + categorical_features

    # Adjusting preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))])

    # Preprocessing for numeric and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Apply preprocessing
    X = preprocessor.fit_transform(df[features_to_use])

    # Adjust the weight of t12_volume
    t12_volume_index = features_to_use.index('p12_volume')  # Get the correct column index for t12_volume
    X[:, t12_volume_index] *= volume_weight

    # Fit Nearest Neighbors
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(X)

    input_index = df.index[df['nmg_id'] == input_nmg_id].tolist()
    if not input_index:
        raise ValueError(f"No company found with nmg_id: {input_nmg_id}")

    input_industry = df.loc[input_index[0], 'industry_name']

    # Find nearest neighbors
    distances, indices = nn.kneighbors([X[input_index[0]]])
    nearest_indices = indices[0][1:]  # Exclude the first index since it's the input itself

    # Filter neighbors by industry
    neighbors_df = df.iloc[nearest_indices].copy()
    same_industry_neighbors = neighbors_df[neighbors_df['industry_name'] == input_industry]

    if len(same_industry_neighbors) < top_n:
        return "Not enough neighbors in the same industry."

    top_growing_neighbors = same_industry_neighbors.sort_values(by='volume_growth', ascending=False).head(top_n)

    columns_to_display = ['nmg_id','poverty_rate', 'company_name', 't12_volume', 'industry_name', 'number_of_locations', 'median_income', 'total_pop', 'volume_growth']
    return top_growing_neighbors[columns_to_display].reset_index(drop=True)


In [25]:
output = find_top_growing_neighbors_same_industry(function_data, input_nmg_id="50100.0")

In [26]:
output

Unnamed: 0,nmg_id,poverty_rate,company_name,t12_volume,industry_name,number_of_locations,median_income,total_pop,volume_growth
0,2175.0,0.375499,STEELES WAREHOUSE,74607.25,Appliances,Medium-sized Retailer,48913.0,33685.0,0.269397
1,1270.0,0.294452,"DERANLEAUS OF LEWISTON, INC.",64195.04,Appliances,Medium-sized Retailer,57362.0,35183.0,0.091567
2,2100.0,0.392351,"DANS TV & APPLIANCE, INC.",67781.44,Appliances,Single Retailer,64157.0,14929.0,0.070719
3,50464.0,0.115588,Wilson and Sons Investments LLC.,57340.6,Appliances,Single Retailer,98542.0,19171.0,-0.013344
4,2190.0,0.181231,"HAMAI APPLIANCE, INC.",48215.7,Appliances,Single Retailer,81129.0,28240.0,-0.019756
