In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings(action="ignore")

import os


In [10]:
class ScoreCalulator():
    """Attach a weighted score to every customer and bin features for clustering.

    The customer table is read from ``path`` into ``self.data``. Per-customer
    weights and ratings live in ``self.SCORE_DATA``; they are loaded from
    ``ScoreData.csv`` when that file exists and are randomly generated
    otherwise (see ``CreateScoretable``).

    Typical flow: construct -> ``CalculateAllScores()`` -> ``preprocess()``.
    Both of those methods mutate ``self.data`` in place.
    """

    # Value used to initialise the SCORE column when the input CSV lacks one.
    DEFAULT_SCORE = 100
    # File in the working directory that caches the weight/rating table.
    SCORE_FILE = 'ScoreData.csv'

    # Each scheme is (columns, bin edges, open_ended).  A value v is assigned
    # bin i when edges[i-1] < v <= edges[i]; values <= edges[0] and NaN get 0.
    # When open_ended is True, values above the last edge get len(edges);
    # otherwise they get 0.
    _BINNING_SCHEMES = (
        (('BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
          'CASH_ADVANCE', 'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS'),
         (0, 500, 1000, 3000, 5000, 10000), True),
        (('BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
          'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
          'PRC_FULL_PAYMENT'),
         (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0), False),
        (('PURCHASES_TRX', 'CASH_ADVANCE_TRX'),
         (0, 5, 10, 15, 20, 30, 50, 100), True),
        (('SCORE',),
         (0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100), False),
    )

    def __init__(self, path, NUM_SECTORS= 10, NUM_EXPENSES=6):
        """Load the customer CSV and the weight/rating table.

        Parameters
        ----------
        path : str
            Location of the customer CSV.  If it has no ``SCORE`` column, one
            filled with ``DEFAULT_SCORE`` is added and the file is rewritten.
        NUM_SECTORS : int
            Number of ``SEC_WT*`` / ``SEC_RAT*`` column pairs.
        NUM_EXPENSES : int
            Number of ``EXP_WT*`` / ``EXP_RAT*`` column pairs.
        """
        self.NUM_SECTORS = NUM_SECTORS
        self.NUM_EXPENSES = NUM_EXPENSES
        self.data = pd.read_csv(path)
        if 'SCORE' not in self.data.columns:
            self.data['SCORE'] = self.DEFAULT_SCORE
            # index=False: otherwise the next read_csv gains an 'Unnamed: 0' column.
            self.data.to_csv(path, index=False)
            print("Score column added")
        try:
            self.SCORE_DATA = pd.read_csv(self.SCORE_FILE)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only a missing/empty cache triggers regeneration; other failures
            # (permissions, malformed file) surface instead of being replaced
            # by random data.
            self.SCORE_DATA = self.CreateScoretable()

    @staticmethod
    def _names(prefix, count):
        """Return ``[prefix + '0', ..., prefix + str(count - 1)]``."""
        return [prefix + str(i) for i in range(count)]

    @staticmethod
    def _bin_values(values, edges, open_ended):
        """Map numeric values to integer bin codes (see ``_BINNING_SCHEMES``)."""
        arr = np.asarray(values, dtype=float)
        bounds = np.asarray(edges, dtype=float)
        # side='left' returns i such that bounds[i-1] < v <= bounds[i].
        codes = np.searchsorted(bounds, arr, side='left')
        # NaN sorts past every edge, so it must be reset to bin 0 explicitly.
        reset = np.isnan(arr)
        if not open_ended:
            reset |= arr > bounds[-1]
        codes[reset] = 0
        return codes.astype(np.int64)

    def CreateScoretable(self):
        """Build a random weight/rating table with one row per row of ``self.data``.

        Weights are integers drawn from ``np.random.randint(0, 100)`` and
        ratings are floats from ``np.random.uniform(0, 1)``.  The global NumPy
        random state is used, so call ``np.random.seed`` first for
        reproducible tables.
        """
        rows = len(self.data)
        SCORE_DATA = pd.DataFrame()
        SCORE_DATA['CUST_ID'] = self.data['CUST_ID']
        SCORE_DATA[self._names('SEC_WT', self.NUM_SECTORS)] = np.random.randint(0, 100, size=(rows, self.NUM_SECTORS))
        SCORE_DATA[self._names('SEC_RAT', self.NUM_SECTORS)] = np.random.uniform(0, 1, size=(rows, self.NUM_SECTORS))
        SCORE_DATA[self._names('EXP_WT', self.NUM_EXPENSES)] = np.random.randint(0, 100, size=(rows, self.NUM_EXPENSES))
        SCORE_DATA[self._names('EXP_RAT', self.NUM_EXPENSES)] = np.random.uniform(0, 1, size=(rows, self.NUM_EXPENSES))
        return SCORE_DATA

    def Score(self, sec_wt,  sec_rat, exp_w, exp_rat):
        """Return ``sum(sec_wt * sec_rat) + sum(exp_w * exp_rat)``.

        Arguments are NumPy arrays.

        Raises
        ------
        ValueError
            If a weight array and its rating array differ in size.
        """
        if sec_wt.size != sec_rat.size or exp_w.size != exp_rat.size:
            raise ValueError("Dimensions not matching")
        return np.sum(sec_wt * sec_rat) + np.sum(exp_w*exp_rat)

    def ScoreCustomer(self, CUST_ID):
        """Return the raw (un-normalised) score for ``CUST_ID``.

        All rows of ``SCORE_DATA`` whose ``CUST_ID`` matches contribute to the
        sum; an unknown id therefore yields 0.0.
        """
        row = self.SCORE_DATA[self.SCORE_DATA['CUST_ID'] == CUST_ID]
        sec_wt = row[self._names('SEC_WT', self.NUM_SECTORS)].values
        sec_rat = row[self._names('SEC_RAT', self.NUM_SECTORS)].values
        exp_w = row[self._names('EXP_WT', self.NUM_EXPENSES)].values
        exp_rat = row[self._names('EXP_RAT', self.NUM_EXPENSES)].values
        return self.Score(sec_wt,  sec_rat, exp_w, exp_rat)

    def CalculateAllScores(self):
        """Fill ``self.data['SCORE']`` and rescale it so the maximum is 100.

        Produces the same value as calling ``ScoreCustomer`` for every row,
        but computes all row scores in one vectorised pass instead of
        re-scanning ``SCORE_DATA`` once per customer.

        Returns
        -------
        pandas.DataFrame
            ``self.data`` (mutated in place), without any ``'Unnamed: 0'``
            column.
        """
        table = self.SCORE_DATA
        sector_part = (
            table[self._names('SEC_WT', self.NUM_SECTORS)].to_numpy()
            * table[self._names('SEC_RAT', self.NUM_SECTORS)].to_numpy()
        ).sum(axis=1)
        expense_part = (
            table[self._names('EXP_WT', self.NUM_EXPENSES)].to_numpy()
            * table[self._names('EXP_RAT', self.NUM_EXPENSES)].to_numpy()
        ).sum(axis=1)
        per_row = pd.Series(sector_part + expense_part, index=table['CUST_ID'].to_numpy())
        # Duplicate ids are summed, matching what ScoreCustomer does.
        per_customer = per_row.groupby(level=0).sum()
        # Ids absent from SCORE_DATA score 0.0, as ScoreCustomer would return.
        scores = self.data['CUST_ID'].map(per_customer).fillna(0.0)

        peak = scores.max()
        # Skip rescaling when there is nothing to scale by (empty data or all zeros).
        if pd.notna(peak) and peak != 0:
            scores = scores / peak * 100
        self.data['SCORE'] = scores
        # The index column only exists when the CSV was saved with its index.
        self.data.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
        return self.data

    def WriteToFile(self):
        """Persist ``SCORE_DATA`` to ``ScoreData.csv`` in the working directory."""
        # index=False keeps repeated save/load cycles from accumulating
        # 'Unnamed: N' columns.
        self.SCORE_DATA.to_csv(self.SCORE_FILE, index=False)

    def GetScoreDataFrame(self):
        """Return the weight/rating table."""
        return self.SCORE_DATA

    def preprocess(self, normalize=True, save=None):
        """Replace raw feature columns with integer ``*_RANGE`` bins and build ``self.X``.

        Mutates ``self.data`` in place: every column named in
        ``_BINNING_SCHEMES`` is replaced by ``<column>_RANGE`` and ``CUST_ID``
        is dropped, so this method cannot be called twice on the same
        instance.

        Parameters
        ----------
        normalize : bool
            When truthy, ``self.X`` is standardised with ``StandardScaler``.
        save : str or None
            Prefix prepended to ``'processed_data_with_score.csv'``; the binned
            frame (still including ``CUST_ID``) is written there when given.
        """
        raw_columns = []
        for columns, edges, open_ended in self._BINNING_SCHEMES:
            for c in columns:
                self.data[c + '_RANGE'] = self._bin_values(self.data[c], edges, open_ended)
                raw_columns.append(c)
        self.data.drop(columns=raw_columns, inplace=True)
        if save is not None:
            self.data.to_csv(save+'processed_data_with_score.csv', index=False)
        self.data.drop(['CUST_ID'], axis=1, inplace=True)
        self.X = np.asarray(self.data)
        if normalize:
            scale = StandardScaler()
            self.X = scale.fit_transform(self.X)



---



---


# How to Use:


---



---



In [11]:
# Build the calculator from the customer CSV. The path is absolute — adjust it for your environment.
# If the CSV has no SCORE column, the constructor adds one and rewrites the file.
obj = ScoreCalulator(path = "/content/data-clustering-rackathon.csv")

In [12]:
# Compute each customer's score, rescale so the maximum is 100, and display the updated frame.
obj.CalculateAllScores()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,SCORE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12,61.549193
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12,45.331985
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12,37.922138
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,,0.000000,12,47.629289
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12,66.974884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6,32.500334
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,,0.000000,6,44.582356
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6,60.312225
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6,59.811457


In [13]:
# Replace the raw feature columns with integer *_RANGE bins, write
# /content/processed_data_with_score.csv, and build obj.X (standardized, since normalize defaults to True).
obj.preprocess(save='/content/')

In [14]:
# Inspect the binned frame: preprocess() has dropped the raw columns and CUST_ID from obj.data in place.
obj.data

Unnamed: 0,TENURE,BALANCE_RANGE,PURCHASES_RANGE,ONEOFF_PURCHASES_RANGE,INSTALLMENTS_PURCHASES_RANGE,CASH_ADVANCE_RANGE,CREDIT_LIMIT_RANGE,PAYMENTS_RANGE,MINIMUM_PAYMENTS_RANGE,BALANCE_FREQUENCY_RANGE,PURCHASES_FREQUENCY_RANGE,ONEOFF_PURCHASES_FREQUENCY_RANGE,PURCHASES_INSTALLMENTS_FREQUENCY_RANGE,CASH_ADVANCE_FREQUENCY_RANGE,PRC_FULL_PAYMENT_RANGE,PURCHASES_TRX_RANGE,CASH_ADVANCE_TRX_RANGE,SCORE_RANGE
0,12,1,1,0,1,0,2,1,1,9,2,0,1,0,0,1,0,7
1,12,4,0,0,0,5,5,4,3,10,0,0,0,3,3,0,1,5
2,12,3,2,2,0,0,5,2,2,10,10,10,0,0,0,3,0,4
3,12,3,3,3,0,1,5,0,0,7,1,1,0,1,0,1,1,5
4,12,2,1,1,0,0,3,2,1,10,1,1,0,0,0,1,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,6,1,1,0,1,0,2,1,1,10,10,0,9,0,5,2,0,4
8946,6,1,1,0,1,0,2,1,0,10,10,0,9,0,0,2,0,5
8947,6,1,1,0,1,0,2,1,1,9,9,0,7,0,3,1,0,7
8948,6,1,0,0,0,1,1,1,1,9,0,0,0,2,3,0,1,6
