In [1]:
import pandas as pd
import numpy as np
import joblib
import sklearn
# import ensemble
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error,balanced_accuracy_score,recall_score
import random
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb

In [2]:
def incr_act_top10(input_df,pred_col,cm_key='customer',treated_col='ind_recommended',actual_col='activation'):
    
	#for correcting variable types
    input_df[[treated_col, actual_col, pred_col]] = input_df[[treated_col, actual_col, pred_col]].apply(pd.to_numeric, errors='coerce')
	
    input_df['rank_per_cm1'] = input_df.groupby(cm_key)[pred_col].rank(method='first', ascending=False)
    
    input_df = input_df.loc[input_df.rank_per_cm1 <= 10,:]
    
    agg_df = input_df.groupby(treated_col,as_index=False).agg({actual_col:'mean'})
    agg_df.columns = [treated_col,'avg_30d_act']
    
    print(agg_df)
    recommended_avg_30d_act = float(agg_df.loc[agg_df[treated_col]==1,'avg_30d_act'])
    not_recommended_avg_30d_act = float(agg_df.loc[agg_df[treated_col]==0,'avg_30d_act'])
    
    return (recommended_avg_30d_act-not_recommended_avg_30d_act)


def eval(df_input, df_round): 
    # round off scores to 10 decimal points
    df_input['predicted_score'] = df_input['predicted_score'].round(10)


    # groupby customer, merchant and max score
    df_input = df_input.groupby(['customer', 'merchant'], as_index = False)['predicted_score'].agg('max')


    # merging predicted file and dependent variable file
    eval_data = pd.merge(df_round,df_input,on=['customer','merchant'],how='inner').drop_duplicates()
    # deleting the rows having null value in predicted_score
    eval_data = eval_data[~(eval_data['predicted_score'].isna())]


    if df_round.shape[0] != eval_data.shape[0]:
        return('Error: Rows are missing in the output file')
    else:
        print('Input Files are Correct')

    final_score = round(incr_act_top10(input_df=eval_data,pred_col='predicted_score',cm_key='customer',treated_col='ind_recommended',actual_col='activation'), 7)

    print('Incremental Activation Rate for Top 10 ranked Merchants(dataset level): ', final_score)

In [3]:
# data = pd.read_parquet("Data/Training/Amex_Campus_Challenge_Train.parquet")
data = pd.read_csv("Data/Training/bilkulfirstaayenge.csv")


In [30]:
randomlist = random.sample(list(data.customer.unique()),int(0.1*len(data.customer.unique())))

In [31]:
testdata = data[data['customer'].isin(randomlist)]

In [4]:
#select rows whose column value is not in a list
# traindata = data[~data['customer'].isin(randomlist)]
traindata = data

datarec = traindata[traindata['ind_recommended']==1].drop(['ind_recommended'], axis=1)
datanotrec = traindata[traindata['ind_recommended']==0].drop(['ind_recommended'], axis=1)

In [5]:
def confusion_matrix_2col(df, col1, col2):
    return df.groupby([col1, col2]).size().unstack(fill_value=0)

confusion_matrix_2col(traindata, 'ind_recommended', 'activation')

activation,0,1
ind_recommended,Unnamed: 1_level_1,Unnamed: 2_level_1
0,292879,60173
1,80162,9843


In [8]:
traindata = traindata.drop(traindata[traindata['ind_recommended'] == 1].index)
traindata.groupby('activation').count()

Unnamed: 0_level_0,ind_recommended,customer_digital_activity_04,customer_spend_01,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_spend_02,customer_spend_03,...,merchant_spend_09,merchant_profile_03,customer_digital_activity_01,merchant_spend_10,customer_profile_03,customer_digital_activity_02,customer_profile_04,distance_05,customer,merchant
activation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,292879,40436,288299,232089,232089,232089,232089,232089,288299,291310,...,260851,260851,275631,284012,292079,292833,292833,292879,292879,292879
1,60173,24111,59059,53304,53304,53304,53304,53304,59059,59817,...,59964,59964,56997,60133,60016,60159,60165,60173,60173,60173


In [10]:
traindata = traindata.drop(traindata[traindata['activation'] == 0].sample(n=192879).index)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(traindata.drop(['customer','merchant','activation','ind_recommended'],axis=1), traindata['activation'], test_size=0.1, random_state=1)
act = cbt.CatBoostClassifier()

In [12]:
act.fit(X_train, y_train)
y_pred = act.predict(X_test)

Learning rate set to 0.086051
0:	learn: 0.6542422	total: 82.6ms	remaining: 1m 22s
1:	learn: 0.6218591	total: 99ms	remaining: 49.4s
2:	learn: 0.5972515	total: 115ms	remaining: 38.4s
3:	learn: 0.5781139	total: 133ms	remaining: 33.1s
4:	learn: 0.5600315	total: 151ms	remaining: 30.1s
5:	learn: 0.5460568	total: 172ms	remaining: 28.6s
6:	learn: 0.5350900	total: 189ms	remaining: 26.8s
7:	learn: 0.5254021	total: 207ms	remaining: 25.7s
8:	learn: 0.5155638	total: 229ms	remaining: 25.2s
9:	learn: 0.5088865	total: 250ms	remaining: 24.8s
10:	learn: 0.5017443	total: 269ms	remaining: 24.2s
11:	learn: 0.4963924	total: 289ms	remaining: 23.8s
12:	learn: 0.4919181	total: 305ms	remaining: 23.2s
13:	learn: 0.4870692	total: 322ms	remaining: 22.7s
14:	learn: 0.4834515	total: 339ms	remaining: 22.3s
15:	learn: 0.4798680	total: 354ms	remaining: 21.8s
16:	learn: 0.4771250	total: 378ms	remaining: 21.9s
17:	learn: 0.4751193	total: 439ms	remaining: 24s
18:	learn: 0.4727980	total: 464ms	remaining: 23.9s
19:	learn: 0

In [13]:
print("Accuracy : ",accuracy_score(y_test, y_pred),"Balanced Accuracy :",balanced_accuracy_score(y_test, y_pred)," recall :",recall_score(y_test, y_pred))

Accuracy :  0.7927956049444375 Balanced Accuracy : 0.7748465575914627  recall : 0.7026084067120785


In [14]:
# traindata = data[~data['customer'].isin(randomlist)]
traindata = data

In [15]:
traindata.groupby('ind_recommended').count()

Unnamed: 0_level_0,activation,customer_digital_activity_04,customer_spend_01,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_spend_02,customer_spend_03,...,merchant_spend_09,merchant_profile_03,customer_digital_activity_01,merchant_spend_10,customer_profile_03,customer_digital_activity_02,customer_profile_04,distance_05,customer,merchant
ind_recommended,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,353052,64547,347358,285393,285393,285393,285393,285393,347358,351127,...,320815,320815,332628,344145,352095,352992,352998,353052,353052,353052
1,90005,16048,89006,73345,73345,73345,73345,73345,89006,89639,...,82092,82092,84666,88135,89733,89997,89997,90005,90005,90005


In [16]:
traindata = traindata.drop(traindata[traindata['ind_recommended'] == 0].sample(n=253052).index)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(traindata.drop(['customer','merchant','activation','ind_recommended'],axis=1), traindata['ind_recommended'], test_size=0.1, random_state=1)
rec = cbt.CatBoostClassifier()

In [18]:
rec.fit(X_train, y_train)

Learning rate set to 0.092561
0:	learn: 0.6833605	total: 21.4ms	remaining: 21.4s
1:	learn: 0.6754015	total: 46ms	remaining: 22.9s
2:	learn: 0.6686495	total: 63.4ms	remaining: 21.1s
3:	learn: 0.6630975	total: 82.7ms	remaining: 20.6s
4:	learn: 0.6585494	total: 102ms	remaining: 20.3s
5:	learn: 0.6549631	total: 121ms	remaining: 20s
6:	learn: 0.6517403	total: 139ms	remaining: 19.7s
7:	learn: 0.6490438	total: 157ms	remaining: 19.5s
8:	learn: 0.6469145	total: 176ms	remaining: 19.3s
9:	learn: 0.6449089	total: 194ms	remaining: 19.2s
10:	learn: 0.6433092	total: 216ms	remaining: 19.4s
11:	learn: 0.6417974	total: 240ms	remaining: 19.8s
12:	learn: 0.6406515	total: 264ms	remaining: 20s
13:	learn: 0.6397026	total: 290ms	remaining: 20.4s
14:	learn: 0.6388564	total: 308ms	remaining: 20.2s
15:	learn: 0.6380820	total: 327ms	remaining: 20.1s
16:	learn: 0.6374426	total: 346ms	remaining: 20s
17:	learn: 0.6368124	total: 365ms	remaining: 19.9s
18:	learn: 0.6362238	total: 383ms	remaining: 19.8s
19:	learn: 0.63

<catboost.core.CatBoostClassifier at 0x14a4ffe50>

In [19]:
y_pred = rec.predict(X_test)
print("Accuracy : ",accuracy_score(y_test, y_pred),"Balanced Accuracy :",balanced_accuracy_score(y_test, y_pred)," recall :",recall_score(y_test, y_pred))

Accuracy :  0.6519130572075154 Balanced Accuracy : 0.6453497827774063  recall : 0.5275888975588006


In [47]:
rec_list = rec.predict_proba(testdata.drop(['ind_recommended','activation','customer','merchant'],axis=1))[:,1]
act_list = act.predict_proba(testdata.drop(['ind_recommended','activation','customer','merchant'],axis=1))[:,1]

: 

In [22]:
#add column to df 
testdata['rec'] = list(rec_list)
testdata['act'] = list(act_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['rec'] = list(rec_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['act'] = list(act_list)


In [26]:
#testdata['predicted_score'] = testdata['rec']*(abs(1-testdata['nrec']))
testdata['predicted_score'] = -testdata['act']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['predicted_score'] = -testdata['act']


In [27]:
testdata

Unnamed: 0,ind_recommended,activation,customer_digital_activity_04,customer_spend_01,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_spend_02,...,merchant_spend_10,customer_profile_03,customer_digital_activity_02,customer_profile_04,distance_05,customer,merchant,rec,act,predicted_score
7,0,0,,65.228077,170.970000,1.0,512.91,3.0,2.0,6.0,...,1330.000000,47.374683,3.666667,119.0,3.785088,372340,484511,0.45,0.04,-0.04
17,0,0,9.0,108.040000,24.992000,2.0,124.96,5.0,3.0,1.0,...,23.150000,0.000000,2.333333,3.0,11.724825,427684,95714,0.34,0.77,-0.77
22,0,0,,390.283538,41.841524,81.0,4393.36,105.0,88.0,37.0,...,28.380000,91.933386,20.166667,97.0,2.801655,428538,492233,0.23,0.42,-0.42
30,0,0,,76.628831,132.395455,7.0,2912.70,22.0,19.0,26.0,...,14.000000,76.922488,7.666667,126.0,3.273081,229884,411676,0.71,0.00,-0.00
32,0,0,,74.552833,30.110123,32.0,2438.92,81.0,65.0,7.0,...,34.460000,95.012044,11.333333,106.0,7.161980,298803,239233,0.83,0.17,-0.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443017,0,1,,173.254667,88.592308,2.0,2303.40,26.0,25.0,33.0,...,20.988333,65.319027,13.166667,187.0,23.638537,21172,211457,0.12,0.49,-0.49
443020,1,1,,104.344687,33.189154,58.0,4314.59,130.0,94.0,26.0,...,42.305000,99.984905,1.666667,341.0,1.630419,2253,20263,0.25,0.14,-0.14
443021,0,1,,60.243382,,,,,,25.0,...,17.300000,99.984905,1.666667,341.0,1.497284,2253,138383,0.58,0.33,-0.33
443041,0,1,,57.289091,,,,,,16.0,...,56.960000,71.475632,3.333333,457.0,1.869187,22660,246363,0.11,0.98,-0.98


In [28]:
eval(testdata[['customer','predicted_score','merchant']],testdata[['customer','ind_recommended','activation','merchant']])

Input Files are Correct
   ind_recommended  avg_30d_act
0                0     0.228181
1                1     0.162614
Incremental Activation Rate for Top 10 ranked Merchants(dataset level):  -0.0655675


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input['predicted_score'] = df_input['predicted_score'].round(10)


In [38]:
testdata[['customer','predicted_score','merchant']].to_csv('d_pred.csv',index=False)

In [20]:
name='AAC_newdata'

In [21]:
import os
os.makedirs('submissions/'+name+'/models', exist_ok=True)

In [22]:
joblib.dump(rec, 'submissions/'+name+'/models/rec.joblib')
joblib.dump(act, 'submissions/'+name+'/models/act.joblib')

['submissions/AAC_newdata/models/act.joblib']

In [31]:
testdata.predicted_score.mean()

0.15910713000984675

In [None]:
#form1  (rec.round-0.5)*act

In [None]:
5