In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Both holdings columns are run through ast.literal_eval while reading, so each
# cell arrives as a parsed Python object (lists in the preview below) rather
# than as raw text.
df_train = pd.read_csv(
    '../data/raw/train_go05W65.csv',
    converters={
        'Product_Holding_B1': ast.literal_eval,
        'Product_Holding_B2': ast.literal_eval,
    },
)

In [3]:
# Only Product_Holding_B1 is parsed for the test file; no converter is
# registered for Product_Holding_B2 here.
df_test = pd.read_csv(
    '../data/raw/test_VkM91FT.csv',
    converters={'Product_Holding_B1': ast.literal_eval},
)

In [4]:
# Flag each split (1 = train, 0 = test) so the rows can be told apart once the
# two frames are concatenated and separated again when the results are saved.
df_train['is_train']=1
df_test['is_train'] = 0

In [5]:
# Stack train and test into one frame so the feature columns are built once for
# both splits. Each input keeps its own index labels, so labels repeat in the
# combined frame.
df = pd.concat([df_train, df_test])

In [6]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,is_train
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8],1
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3],1
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00],1
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6],1
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]",1


In [7]:
# Inspect dtypes and non-null counts per column of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58075 entries, 0 to 20326
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Customer_ID         58075 non-null  object
 1   Gender              58075 non-null  object
 2   Age                 58075 non-null  int64 
 3   Vintage             58075 non-null  int64 
 4   Is_Active           58075 non-null  int64 
 5   City_Category       58075 non-null  object
 6   Customer_Category   58075 non-null  object
 7   Product_Holding_B1  58075 non-null  object
 8   Product_Holding_B2  37748 non-null  object
 9   is_train            58075 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 4.9+ MB


In [8]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Age,Vintage,Is_Active,is_train
count,58075.0,58075.0,58075.0,58075.0
mean,38.460146,19.585674,0.264572,0.649987
std,10.11105,10.252426,0.441109,0.476978
min,24.0,2.0,0.0,0.0
25%,29.0,13.0,0.0,0.0
50%,37.0,16.0,0.0,1.0
75%,47.0,23.0,1.0,1.0
max,59.0,80.0,1.0,1.0


In [9]:
def get_unique_values_from_list_col(df, col):
    """Return the distinct values found in a list-valued column, in first-seen order.

    Every cell of ``df[col]`` is passed through ``np.unique``, so the values of
    a single row are visited in sorted order; across rows, a value keeps the
    position of its first appearance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    col : str
        Label of the column whose cells are lists (or other array-likes).

    Returns
    -------
    list
        The distinct values, as yielded by ``np.unique`` (NumPy scalars).
        Missing cells (``None`` / ``NaN``) contribute nothing.
    """
    seen = set()      # O(1) membership test; `products` alone keeps the order
    products = []
    for holdings in df[col]:
        # A missing cell is a scalar NaN/None, not a list. NaN never compares
        # equal to itself, so letting it through would append one NaN per
        # missing row and make every later membership check slower.
        if pd.api.types.is_scalar(holdings) and pd.isna(holdings):
            continue
        for product in np.unique(holdings):
            if product not in seen:
                seen.add(product)
                products.append(product)
    return products

In [10]:
def apply_transformation(row, col, prod_list, prefix):
    """Set ``row[prefix + product]`` to 1 for every listed product the row holds.

    Written to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A single row; must contain ``'is_train'`` and ``col``. It is modified
        in place.
    col : str
        Label of the cell that holds this row's products.
    prod_list : iterable of str
        Candidate products; each one maps to the label ``prefix + product``.
    prefix : str
        Text prepended to a product to form its indicator label.

    Returns
    -------
    pandas.Series
        The same ``row`` object. Indicators of held products are set to 1;
        every other entry is left as it was.
    """
    # Rows flagged is_train == 0 are never given 'B2_' indicators
    # (presumably because the test split has no B2 holdings - confirm).
    if row['is_train'] == 0 and prefix == 'B2_':
        return row

    holdings = row[col]
    # A missing cell (NaN/None) holds nothing, and `item in NaN` would raise
    # TypeError, so return the row untouched whatever the prefix is.
    if pd.api.types.is_scalar(holdings) and pd.isna(holdings):
        return row

    for item in prod_list:
        if item in holdings:
            row[prefix + item] = 1
    return row

In [11]:
# Collect every product that appears in Product_Holding_B1, create one
# zero-filled 'B1_<product>' column per product, then set the indicators row by row.
prod_list = get_unique_values_from_list_col(df, 'Product_Holding_B1')
b1_prod_list = ['B1_' + product for product in prod_list]

for column_name in b1_prod_list:
    df[column_name] = 0

df = df.apply(
    apply_transformation,
    axis=1,
    col='Product_Holding_B1',
    prod_list=prod_list,
    prefix='B1_',
)

In [12]:
# Collect the products seen in Product_Holding_B2. Rows without B2 holdings can
# come back as NaN entries, so anything whose text form is 'nan' is discarded
# before the zero-filled 'B2_<product>' columns are created.
prod_list_b2 = [
    product
    for product in get_unique_values_from_list_col(df, 'Product_Holding_B2')
    if str(product) != 'nan'
]
b2_prod_list = ['B2_' + product for product in prod_list_b2]

for column_name in b2_prod_list:
    df[column_name] = 0

In [13]:
# List the columns now that the B1_/B2_ indicator columns have been added.
df.columns

Index(['Customer_ID', 'Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'Customer_Category', 'Product_Holding_B1', 'Product_Holding_B2',
       'is_train', 'B1_P16', 'B1_P13', 'B1_P20', 'B1_P11', 'B1_P8', 'B1_P17',
       'B1_P21', 'B1_P12', 'B1_P10', 'B1_P19', 'B1_P2', 'B1_P00', 'B1_P18',
       'B1_P15', 'B1_P6', 'B1_P9', 'B1_P7', 'B1_P3', 'B1_P5', 'B1_P4', 'B1_P1',
       'B1_P14', 'B2_P8', 'B2_P3', 'B2_P00', 'B2_P6', 'B2_P12', 'B2_P16',
       'B2_P1', 'B2_P9', 'B2_P10', 'B2_P13', 'B2_P4', 'B2_P5', 'B2_P7',
       'B2_P11', 'B2_P2', 'B2_P15', 'B2_P17', 'B2_P14', 'B2_P20', 'B2_P18'],
      dtype='object')

In [14]:
# Fill the 'B2_<product>' indicators row by row; apply_transformation returns
# rows with is_train == 0 unchanged for this prefix.
df = df.apply(
    apply_transformation,
    axis=1,
    col='Product_Holding_B2',
    prod_list=prod_list_b2,
    prefix='B2_',
)

In [15]:
# Preview the frame after the B2 indicator columns have been filled in.
df.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,is_train,...,B2_P4,B2_P5,B2_P7,B2_P11,B2_P2,B2_P15,B2_P17,B2_P14,B2_P20,B2_P18
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8],1,...,0,0,0,0,0,0,0,0,0,0
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3],1,...,0,0,0,0,0,0,0,0,0,0
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00],1,...,0,0,0,0,0,0,0,0,0,0
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6],1,...,0,0,0,0,0,0,0,0,0,0
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]",1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#prod_list_b2 = get_unique_values_from_list_col(df, 'Product_Holding_B2')
# b2_prod_list = ['B2_'+item for item in prod_list_b2]
# for item in b2_prod_list:
#     df_train[item]= 0 
    


In [17]:
# Display the whole frame (the notebook renders only the first and last rows).
df

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,is_train,...,B2_P4,B2_P5,B2_P7,B2_P11,B2_P2,B2_P15,B2_P17,B2_P14,B2_P20,B2_P18
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8],1,...,0,0,0,0,0,0,0,0,0,0
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3],1,...,0,0,0,0,0,0,0,0,0,0
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00],1,...,0,0,0,0,0,0,0,0,0,0
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6],1,...,0,0,0,0,0,0,0,0,0,0
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]",1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20322,CC303542,Female,37,11,1,C2,S3,[P16],,0,...,0,0,0,0,0,0,0,0,0,0
20323,CC266713,Male,29,14,0,C2,S2,[P16],,0,...,0,0,0,0,0,0,0,0,0,0
20324,CC393639,Male,26,12,0,C2,S3,[P13],,0,...,0,0,0,0,0,0,0,0,0,0
20325,CC285013,Female,44,19,1,C1,S3,"[P8, P9, P16]",,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Number of products listed in Product_Holding_B1 for each customer.
df['current_holdings'] = df['Product_Holding_B1'].apply(len)

In [19]:
# Drop the list-valued holdings columns before the frame is written out; the
# indicator columns and current_holdings built above remain.
df = df.drop(columns = ['Product_Holding_B1','Product_Holding_B2'])

In [21]:
# Split the combined frame back into its two parts using the is_train flag and
# write each part without the index column.
df.loc[df['is_train'] == 1].to_csv(
    '../data/processed/train_data.csv',
    index=False,
)
df.loc[df['is_train'] == 0].to_csv(
    '../data/processed/test_data.csv',
    index=False,
)

In [22]:
# Summary statistics for all numeric columns. The empty sequence is the
# `percentiles` argument: the 25% and 75% rows are left out of the table.
df.describe(percentiles=())

Unnamed: 0,Age,Vintage,Is_Active,is_train,B1_P16,B1_P13,B1_P20,B1_P11,B1_P8,B1_P17,...,B2_P5,B2_P7,B2_P11,B2_P2,B2_P15,B2_P17,B2_P14,B2_P20,B2_P18,current_holdings
count,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,...,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0,58075.0
mean,38.460146,19.585674,0.264572,0.649987,0.298356,0.448937,0.098166,0.050573,0.071459,0.189944,...,0.020852,0.043495,0.003203,0.003547,0.000706,0.000293,0.000344,6.9e-05,3.4e-05,1.686044
std,10.11105,10.252426,0.441109,0.476978,0.45754,0.49739,0.297542,0.219125,0.257593,0.39226,...,0.142891,0.203971,0.056503,0.059453,0.026561,0.017107,0.018555,0.008299,0.005868,0.878826
min,24.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,37.0,16.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,59.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0
