In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import numpy as np

In [3]:
# Load the data
file_path = '/content/BUS_PROFILE_DATA.csv'
data = pd.read_csv(file_path)

In [4]:
# Display the first few rows of the data
print("Data Preview:")
print(data.head())


Data Preview:
                            BUS_NAME                   BUS_TYPE        CITY  \
0                      Alto Palo Inc                  Taco Bell      Aurora   
1  Columbia Basin Health Association      Wahluke Family Clinic     Mattawa   
2      Computer Sciences Corporation                        NaN   Camarillo   
3      Northern States Power Company                XCEL Energy  Saint Paul   
4                 Ufp Riverside, LLC  Universal Forest Products   Riverside   

  STATE_CD COUNTRY_NAME  EMPLOYEES_WORKING_HERE  EMPLOYEES_TOTAL  \
0       CO          USA                       1                0   
1       WA          USA                      45                0   
2       CA          USA                       3                0   
3       MN          USA                       1                0   
4       CA          USA                      42               42   

   NO_OF_FMLY_MMBRS  YEAR_ESTABLISHED CURRENCY_CODE  ... DPV_CONFIRM  \
0                77           

In [5]:
# Preview the available business names
print("\nAvailable Business Names:")
print(data['BUS_NAME'].unique())


Available Business Names:
['Alto Palo Inc' 'Columbia Basin Health Association'
 'Computer Sciences Corporation' 'Northern States Power Company'
 'Ufp Riverside, LLC' 'Expeditors International of Washington, Inc.'
 'Keybank National Association' 'Trusteed Plans Service Corporation'
 'Kttc Television Inc' 'Travel Guide, LLC'
 'The Jones Financial Companies L L L P'
 'Culligan Soft Water Service Company' 'Firstbank' 'Big 5 Corp.' 'Subway'
 'The Preserve At Bal Harbor Condo Assoc Inc'
 'Diamond Shamrock Refining and Marketing Company'
 'AFLAC Florida East Market Office' 'Spooner Laveen, P.C.' 'Paul Sweezey'
 "Arizona's Children Association" 'Miracle-Ear'
 'Rock Creek Veterinary Hospital' 'Colorado Free University, Inc.'
 'University Vision Clinic, Inc.' 'Courtyard Denver Stapleton'
 'Markay Cabinets, Inc.' 'Century Builders, LLC' 'T J R Inc'
 'Melvin Concrete, Inc.' 'John C Kois DMD Msd Inc' 'Penney Opco LLC'
 "Griffin's Dry Cleaning & Laundry, Inc."
 'Discount Tire Company of Colorado, I

In [6]:
# Selecting the required columns
required_columns = ['BUS_NAME', 'BUS_EST_NETWORK_SPEND', 'BUS_EST_SECURITY_SPEND', 'BUS_EST_VOICE_SPEND', 'BUS_EST_TOT_SPEND']
data = data[required_columns]

In [7]:
# Handling missing values (if any)
data.fillna(0, inplace=True)


In [8]:
# Select relevant columns for segmentation
spend_columns = [
    'BUS_EST_NETWORK_SPEND',
    'BUS_EST_SECURITY_SPEND',
    'BUS_EST_VOICE_SPEND',
    'BUS_EST_TOT_SPEND'
]

In [9]:
# Filter out rows with missing values in the selected columns
data = data.dropna(subset=spend_columns)

In [10]:
# Normalize the data for clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[spend_columns])

In [11]:
# Apply K-Means clustering
optimal_clusters = 2
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
data['Cluster'] = kmeans.fit_predict(scaled_data)

  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
# Apply K-Means clustering
optimal_clusters = 2
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
data['Cluster'] = kmeans.fit_predict(scaled_data)

  super()._check_params_vs_input(X, default_n_init=10)


In [13]:
# Define the threshold for high-value customers (75th percentile)
network_threshold = data['BUS_EST_NETWORK_SPEND'].quantile(0.75)
security_threshold = data['BUS_EST_SECURITY_SPEND'].quantile(0.75)
voice_threshold = data['BUS_EST_VOICE_SPEND'].quantile(0.75)
tot_spend_threshold = data['BUS_EST_TOT_SPEND'].quantile(0.75)

In [14]:
# Function to classify customers as High-Value or Low-Value
def classify_customer(row):
    if (row['BUS_EST_NETWORK_SPEND'] >= network_threshold or
        row['BUS_EST_SECURITY_SPEND'] >= security_threshold or
        row['BUS_EST_VOICE_SPEND'] >= voice_threshold or
        row['BUS_EST_TOT_SPEND'] >= tot_spend_threshold):
        return 'High-Value'
    else:
        return 'Low-Value'

In [15]:
# Apply the classification
data['Customer_Segment'] = data.apply(classify_customer, axis=1)

# Filter only high-value customers
high_value_customers = data[data['Customer_Segment'] == 'High-Value']

# Normalize data for high-value customers
scaled_high_value_data = scaler.transform(high_value_customers[spend_columns])

# Calculate cosine similarity
cosine_sim = cosine_similarity(scaled_high_value_data)

# Recommend based on cosine similarity - Get top 3 similar customers for each high-value customer
def get_top_similar_customers(customer_index, sim_matrix, top_n=3):
    # Use iloc to get the row based on the customer index in the high_value_customers dataframe
    sim_scores = list(enumerate(sim_matrix[high_value_customers.index.get_loc(customer_index)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Exclude itself (index 0) and get the top N similar customers
    top_customers = sim_scores[1:top_n+1]
    return top_customers

# Generate recommendations for each high-value customer
# Use the reset_index to get a new index for high_value_customers
high_value_customers = high_value_customers.reset_index()
high_value_customers['Recommendations'] = high_value_customers.index.map(
    lambda idx: [x[0] for x in get_top_similar_customers(idx, cosine_sim)]
)

# Map recommended customer indices to clusters
high_value_customers['Recommended_Clusters'] = high_value_customers['Recommendations'].apply(
    lambda recs: [high_value_customers.iloc[rec]['Cluster'] for rec in recs]
)

# Display the high-value customers with recommendations and their corresponding clusters
print(high_value_customers[['BUS_EST_TOT_SPEND', 'Cluster', 'Recommendations', 'Recommended_Clusters']])

     BUS_EST_TOT_SPEND  Cluster Recommendations Recommended_Clusters
0             221415.0        1    [107, 7, 52]            [1, 1, 1]
1             720723.0        1   [38, 122, 43]            [1, 1, 1]
2             430264.0        1  [58, 135, 123]            [1, 1, 1]
3            1847571.0        1    [16, 62, 36]            [1, 1, 1]
4             236948.0        1   [108, 30, 70]            [1, 1, 1]
..                 ...      ...             ...                  ...
141           611515.0        1   [26, 27, 141]            [1, 1, 1]
142          1189295.0        1   [142, 83, 57]            [1, 1, 1]
143            77003.0        1   [12, 132, 28]            [1, 1, 1]
144          8658488.0        0  [91, 130, 144]            [0, 0, 0]
145           306647.0        1    [5, 137, 29]            [1, 1, 1]

[146 rows x 4 columns]


In [16]:
# Save the recommendations to a CSV file
high_value_customers[['BUS_EST_TOT_SPEND', 'Cluster', 'Recommendations', 'Recommended_Clusters']].to_csv('/content/high_value_recommendations.csv', index=False)

# Provide a link to download the CSV file
from google.colab import files
files.download('/content/high_value_recommendations.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
# Apply the classification
data['Customer_Segment'] = data.apply(classify_customer, axis=1)

# Filter only high-value customers
high_value_customers = data[data['Customer_Segment'] == 'High-Value']

# Normalize data for high-value customers
scaled_high_value_data = scaler.transform(high_value_customers[spend_columns])

# Calculate cosine similarity
cosine_sim = cosine_similarity(scaled_high_value_data)

# Recommend based on cosine similarity - Get top 3 similar customers for each high-value customer
def get_top_similar_customers(customer_index, sim_matrix, top_n=3):
    # Use iloc to get the row based on the customer index in the high_value_customers dataframe
    sim_scores = list(enumerate(sim_matrix[high_value_customers.index.get_loc(customer_index)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Exclude itself (index 0) and get the top N similar customers
    top_customers = sim_scores[1:top_n+1]
    return [high_value_customers.iloc[x[0]]['BUS_NAME'] for x in top_customers] # Return BUS_NAME for recommended customers

# Generate recommendations for each high-value customer
# Use the reset_index to get a new index for high_value_customers
high_value_customers = high_value_customers.reset_index()
high_value_customers['Recommendations'] = high_value_customers.index.map(
    lambda idx: [x for x in get_top_similar_customers(idx, cosine_sim)]
)


# Display the high-value customers with recommendations and their corresponding clusters
print(high_value_customers[['BUS_NAME','BUS_EST_TOT_SPEND', 'Cluster', 'Recommendations']])

                                         BUS_NAME  BUS_EST_TOT_SPEND  Cluster  \
0                   Computer Sciences Corporation           221415.0        1   
1                              Ufp Riverside, LLC           720723.0        1   
2    Expeditors International of Washington, Inc.           430264.0        1   
3                    Keybank National Association          1847571.0        1   
4                             Kttc Television Inc           236948.0        1   
..                                            ...                ...      ...   
141                          Stater Bros. Markets           611515.0        1   
142                                     Aldi Inc.          1189295.0        1   
143               Southwest Medical Imaging, Ltd.            77003.0        1   
144                                  Walmart Inc.          8658488.0        0   
145                   Independent School Dist 625           306647.0        1   

                           

In [18]:
# Apply the classification
data['Customer_Segment'] = data.apply(classify_customer, axis=1)

# Filter only high-value customers
high_value_customers = data[data['Customer_Segment'] == 'High-Value']

# Normalize data for high-value customers
scaled_high_value_data = scaler.transform(high_value_customers[spend_columns])

# Calculate cosine similarity
cosine_sim = cosine_similarity(scaled_high_value_data)

# Recommend based on cosine similarity - Get top 3 similar customers for each high-value customer
def get_top_similar_customers(customer_index, sim_matrix, top_n=3):
    # Use iloc to get the row based on the customer index in the high_value_customers dataframe
    sim_scores = list(enumerate(sim_matrix[high_value_customers.index.get_loc(customer_index)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Exclude itself (index 0) and get the top N similar customers
    top_customers = sim_scores[1:top_n+1]
    return [high_value_customers.iloc[x[0]]['BUS_NAME'] for x in top_customers] # Return BUS_NAME for recommended customers

# Generate recommendations for each high-value customer
# Use the reset_index to get a new index for high_value_customers
high_value_customers = high_value_customers.reset_index()
high_value_customers['Recommendations'] = high_value_customers.index.map(
    lambda idx: [x for x in get_top_similar_customers(idx, cosine_sim)]
)


# Display the high-value customers with recommendations and their corresponding clusters
print(high_value_customers[['BUS_NAME','BUS_EST_TOT_SPEND', 'Cluster', 'Recommendations']])

# Save recommendations to a CSV file
high_value_customers[['BUS_NAME','BUS_EST_TOT_SPEND', 'Cluster', 'Recommendations']].to_csv('recommendations.csv', index=False)
print("Recommendations saved to recommendations.csv")

                                         BUS_NAME  BUS_EST_TOT_SPEND  Cluster  \
0                   Computer Sciences Corporation           221415.0        1   
1                              Ufp Riverside, LLC           720723.0        1   
2    Expeditors International of Washington, Inc.           430264.0        1   
3                    Keybank National Association          1847571.0        1   
4                             Kttc Television Inc           236948.0        1   
..                                            ...                ...      ...   
141                          Stater Bros. Markets           611515.0        1   
142                                     Aldi Inc.          1189295.0        1   
143               Southwest Medical Imaging, Ltd.            77003.0        1   
144                                  Walmart Inc.          8658488.0        0   
145                   Independent School Dist 625           306647.0        1   

                           

In [None]:
from google.colab import files
files.download('high_value_recommendations.csv') # Changed filename to  'high_value_recommendations.csv'

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>