In [11]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [12]:
df = pd.read_csv('C:/Users/Srijan-DS/Documents/Projects/identify-profitable-customers/data/raw/raw.csv')
df.sample(5)

Unnamed: 0,id,purchase_amount,asset_amount,average_ratio,personal_id_1,personal_id_2,age,area,job_type,phone,personal_card_1,personal_card_2,personal_card_3,personal_card_4,car,purchase_score,campaign_use,card_expired,average_favorite_score,card_history_period,score_1,score_2,score_3,score_4,total_amount_1,total_amount_2,total_amount_3,important_customer
15865,15865,,69460,76.373925,107,48,40,298.0,Salaried,1,1,0.0,0,,0,704,1.0,0.0,1.747237,3.728168,0,7,0,0.0,811,31254,14299,0
6972,6972,51693.0,67397,73.293709,1664,86,44,,Self employed,1,1,0.0,1,0.0,0,689,0.0,0.0,1.0771,1.185579,0,13,0,0.0,7254,175514,1824,0
3458,3458,42615.0,85230,83.561624,2543,45,24,89.0,Self employed,1,1,1.0,0,0.0,0,710,1.0,0.0,0.004308,1.54863,0,8,0,0.0,49863,219007,835,0
16941,16941,,176396,47.273471,375,86,36,1028.0,Salaried,1,0,1.0,1,,0,304,,,1.520321,4.431609,0,1,3,0.0,17154,2848,148,1
13423,13423,54306.0,69227,66.432451,1726,86,37,1800.0,Self employed,1,0,0.0,0,0.0,0,745,0.0,0.0,0.013259,0.049654,0,5,0,0.0,53479,7186,1949,1


In [13]:
# Define the bins and labels
bins = [20, 30, 40, 50]
labels = ['21-31', '31-41', '41-51']

# Create the age_bracket column
df['age_bracket'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

In [14]:
df['age_bracket'].unique()

['41-51', '31-41', '21-31', NaN]
Categories (3, object): ['21-31' < '31-41' < '41-51']

In [15]:
df['age'].describe()

count    17500.000000
mean        35.078171
std          7.794628
min         21.000000
25%         28.000000
50%         36.000000
75%         42.000000
max         50.000000
Name: age, dtype: float64

In [16]:
df['purchase_amount'].fillna(df['purchase_amount'].median(),inplace=True)

In [17]:
df_spending = df[['id','purchase_amount','purchase_score','card_history_period']]

In [18]:
df_spending.sample(5)

Unnamed: 0,id,purchase_amount,purchase_score,card_history_period
2834,2834,73438.0,1,0.21581
2412,2412,61790.0,0,1.032918
14974,14974,54086.0,727,6.321273
7589,7589,40452.0,708,0.024895
10122,10122,46378.0,2,1.391287


In [19]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [20]:
df_spending['purchase_amount'] = sc.fit_transform(df_spending['purchase_amount'].to_numpy().reshape(-1,1))
df_spending['card_history_period'] = sc.fit_transform(df_spending['card_history_period'].to_numpy().reshape(-1,1))
df_spending['purchase_score'] = sc.fit_transform(df_spending['purchase_score'].to_numpy().reshape(-1,1))

In [21]:
df_spending.head()

Unnamed: 0,id,purchase_amount,purchase_score,card_history_period
0,0,1.117862,-1.271183,-0.687181
1,1,1.384057,0.847212,0.045915
2,2,0.755439,-0.342695,0.836423
3,3,0.890981,0.952381,0.772206
4,4,0.248816,-1.262169,-0.678747


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim1 = cosine_similarity(df_spending)

In [23]:
cosine_sim1[0]

array([ 1.00000000e+00,  1.25930685e-01,  1.66327301e-01, ...,
       -8.48054816e-05,  3.94507627e-05, -1.46692940e-05])

In [24]:
cosine_sim1.shape

(17500, 17500)

In [25]:
def recommend_customers(id, cosine_sim):
    idx = df.index[df['id']==id].tolist()[0]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key = lambda i:i[1], reverse=True)

    sim_scores = sim_scores[1:6]

    customer_index = [i[0] for i in sim_scores]

    recommendations_df = pd.DataFrame(
        {
            'Customer_id' : df['id'].iloc[customer_index],
            'SImilarity_score' : sim_scores
        }
    )

    return recommendations_df

In [26]:
recommend_customers(2, cosine_sim1)

Unnamed: 0,Customer_id,SImilarity_score
3,3,"(3, 0.8962940713586053)"
22,22,"(22, 0.8935743525678641)"
39,39,"(39, 0.8906255517790808)"
75,75,"(75, 0.8880069359793982)"
31,31,"(31, 0.8875893085011687)"


## Demographic Based Filtering

In [27]:
df.head()

Unnamed: 0,id,purchase_amount,asset_amount,average_ratio,personal_id_1,personal_id_2,age,area,job_type,phone,personal_card_1,personal_card_2,personal_card_3,personal_card_4,car,purchase_score,campaign_use,card_expired,average_favorite_score,card_history_period,score_1,score_2,score_3,score_4,total_amount_1,total_amount_2,total_amount_3,important_customer,age_bracket
0,0,62978.0,64016,87.170255,1256,86,44,,Salaried,1,1,,0,1.0,0,0,0.0,0.0,0.010922,0.04834,0,7,0,0.0,58620,10590,1392,0,41-51
1,1,66790.0,64497,83.609102,666,86,30,5318.0,Salaried,1,0,0.0,1,0.0,0,705,0.0,0.0,1.687598,1.701259,0,0,6,0.0,70809,9575,889,0,31-41
2,2,57788.0,111404,58.98618,1621,120,36,5954.0,Self employed,1,1,0.0,0,,0,309,2.0,1.0,0.001265,3.483625,0,9,6,0.0,55294,38319,779,0,31-41
3,3,59729.0,63983,72.702121,1731,51,27,7263.0,,1,0,0.0,0,0.0,0,740,0.0,0.0,0.015441,3.338835,0,1,2,0.0,1972,9314,2081,0,21-31
4,4,50533.0,98623,82.218856,801,86,40,6270.0,,1,1,0.0,0,0.0,0,3,0.0,0.0,0.001283,0.067355,0,2,0,0.0,4942,10484,1772,0,41-51


In [28]:
df_demographic = df[['id','age', 'job_type', 'phone', 'asset_amount', 'car']]

In [29]:
df_demographic['job_type'].fillna(df['job_type'].mode()[0],inplace=True)

In [30]:
df_demographic.sample(5)

Unnamed: 0,id,age,job_type,phone,asset_amount,car
12924,12924,27,Self employed,1,68466,0
3970,3970,31,Salaried,1,74311,0
61,61,37,Salaried,1,73256,0
7438,7438,44,Self employed,1,67864,0
8236,8236,39,Salaried,1,68047,0


In [31]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False,drop='first')

In [32]:
df_demographic['job_type_encoded'] = ohe.fit_transform(df_demographic[['job_type']])

In [33]:
df_demographic.drop(['job_type'],axis=1,inplace=True)

In [34]:
df_demographic.head()

Unnamed: 0,id,age,phone,asset_amount,car,job_type_encoded
0,0,44,1,64016,0,0.0
1,1,30,1,64497,0,0.0
2,2,36,1,111404,0,1.0
3,3,27,1,63983,0,1.0
4,4,40,1,98623,0,1.0


In [35]:
df_demographic['age'] = sc.fit_transform(df_demographic['age'].to_numpy().reshape(-1,1))
df_demographic['asset_amount'] = sc.fit_transform(df_demographic['asset_amount'].to_numpy().reshape(-1,1))
df_demographic['car'] = sc.fit_transform(df_demographic['car'].to_numpy().reshape(-1,1))
df_demographic['job_type_encoded'] = sc.fit_transform(df_demographic['job_type_encoded'].to_numpy().reshape(-1,1))

In [36]:
cosine_sim2 = cosine_similarity(df_demographic)

In [37]:
def recommend_customers(id, cosine_sim):
    idx = df.index[df['id']==id].tolist()[0]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key = lambda i:i[1], reverse=True)

    sim_scores = sim_scores[1:6]

    customer_index = [i[0] for i in sim_scores]

    recommendations_df = pd.DataFrame(
        {
            'Customer_id' : df['id'].iloc[customer_index],
            'SImilarity_score' : sim_scores
        }
    )

    return recommendations_df

In [38]:
recommend_customers(2, cosine_sim2)

Unnamed: 0,Customer_id,SImilarity_score
4,4,"(4, 0.8979148104945842)"
6,6,"(6, 0.8696021093781783)"
8,8,"(8, 0.788223564502356)"
17,17,"(17, 0.7734229946280602)"
15,15,"(15, 0.7629974180003768)"


## Combined Recommendation

In [39]:
def recommend_customers(id, cosine_sim1, cosine_sim2):
    idx = df.index[df['id']==id].tolist()[0]

    cosine_sim = 6*cosine_sim1 + 0.05*cosine_sim2

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key = lambda i:i[1], reverse=True)

    sim_scores = sim_scores[1:6]

    customer_index = [i[0] for i in sim_scores]

    recommendations_df = pd.DataFrame(
        {
            'Customer_id' : df['id'].iloc[customer_index],
            'SImilarity_score' : sim_scores
        }
    )

    return recommendations_df

In [40]:
recommend_customers(2, cosine_sim1, cosine_sim2)

Unnamed: 0,Customer_id,SImilarity_score
3,3,"(3, 5.4088610241873605)"
22,22,"(22, 5.397270631087289)"
39,39,"(39, 5.3771683619822275)"
75,75,"(75, 5.361521584160104)"
31,31,"(31, 5.359342747205321)"
