In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  # TPU detection; the ValueError handler below treats that error as "no TPU attached".
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as err:
  # Raise an ordinary exception instead of BaseException: BaseException is
  # reserved for interpreter-level exits and is not caught by `except Exception`.
  # Chaining with `from err` keeps the resolver's original message in the traceback.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()


In [None]:
import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  """Run one 32-filter 7x7 convolution over a random (100, 100, 100, 3) batch on the CPU.

  Returns the sum of all output activations as a scalar tensor.
  """
  with tf.device('/cpu:0'):
    batch = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    activations = conv_layer(batch)
    return tf.math.reduce_sum(activations)

def gpu():
  """Run one 32-filter 7x7 convolution over a random (100, 100, 100, 3) batch on GPU 0.

  Returns the sum of all output activations as a scalar tensor.
  """
  with tf.device('/device:GPU:0'):
    batch = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    activations = conv_layer(batch)
    return tf.math.reduce_sum(activations)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that covers their range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are replaced in place and the same object
        is returned.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits, so values
        are rounded; pass ``False`` to stop at float32 for columns that feed
        sums or model targets.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extreme value equals the
            # dtype limit (e.g. 127 for int8) still fits in that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Columns whose min/max are NaN or infinite fail every comparison
            # below and therefore stay float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math as math
import datetime as dt
import sklearn
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import __version__
print(__version__)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings("ignore")
# Display configuration for wide/long frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Use the fully qualified key: the bare pattern 'precision' also matches
# 'styler.format.precision' on newer pandas and raises OptionError.
pd.set_option('display.precision', 5)
pd.options.display.float_format = '{:20,.2f}'.format
np.set_printoptions(suppress =True) 

import os
print(os.listdir("../input"))
import tqdm
from tqdm import tqdm


In [None]:
pd.read_csv('../input/elo-merchant-category-recommendation/historical_transactions.csv', nrows=50).head(5)

In [None]:
x = pd.read_csv('../input/elo-merchant-category-recommendation/historical_transactions.csv', nrows=50).head(5)
x.head()

In [None]:
pd.read_csv('../input/elo-merchant-category-recommendation/merchants.csv', nrows=50).head(3)

In [None]:
pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv', nrows=50).head(3)

In [None]:
pd.read_csv('../input/elo-merchant-category-recommendation/train.csv', nrows=50).head(3)

In [None]:
pd.read_csv('../input/elo-merchant-category-recommendation/test.csv', nrows=50).head(3)

In [None]:
pd.read_csv('../input/elo-merchant-category-recommendation/sample_submission.csv', nrows=50).head(3)

In [None]:
! pip install openpyxl

In [None]:
pd.read_excel('../input/elo-merchant-category-recommendation/Data_Dictionary.xlsx')

### Reading the Data

In [None]:
df_train = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/train.csv"))
df_test = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/test.csv"))
df_historical =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/historical_transactions.csv",parse_dates=['purchase_date']))
df_new =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/new_merchant_transactions.csv",parse_dates=['purchase_date']))

In [None]:
df_historical.head(2)  #historical transactions

In [None]:
len(df_historical.card_id.unique())  # number of unique card IDs

In [None]:
df_new.head(2)   # new merchant transactions

In [None]:
# Keep only authorised transactions. `.copy()` makes the filtered frame an
# independent object, so the in-place shift below never writes into a slice of
# the frame that was loaded from disk (avoids SettingWithCopy problems).
df_historical = df_historical.loc[df_historical['authorized_flag'] == "Y"].copy()
# Shift the normalised purchase amounts by a constant 0.75
# (presumably to make them non-negative -- TODO confirm against the data).
# NOTE: this cell is not idempotent; re-running it adds another 0.75 to both frames.
df_historical['purchase_amount'] += 0.75
df_new['purchase_amount'] += 0.75

### Checking for Purchase, Return and Rebate Transactions (*if any)

### As the purchase_amount variable is normalized it's not possible to check if the transaction is a Purchase/Return/Rebate one

### Here we are doing aggregation by card-id (At Customer Level). We Can also compute at Item Level RFM and Store Level RFM or even website visit & activity based RFM etc depending on the availability of data

In [None]:
def groupby_mean(x):
    """Average of the values in ``x`` (average transaction)."""
    return x.mean()


def groupby_count(x):
    """Number of non-null values in ``x`` (number of transactions)."""
    return x.count()


def purchase_duration(x):
    """Whole days between the earliest and the latest value of ``x``."""
    earliest, latest = x.min(), x.max()
    return (latest - earliest).days


def avg_frequency(x):
    """Purchase duration in days divided by the number of non-null values in ``x``."""
    span_days = (x.max() - x.min()).days
    return span_days / x.count()


# Give each helper a short __name__ (presumably so aggregated result columns
# get readable labels -- verify against the callers).
for _helper, _label in (
    (groupby_mean, 'avg'),
    (groupby_count, 'count'),
    (purchase_duration, 'purchase_duration'),
    (avg_frequency, 'purchase_frequency'),
):
    _helper.__name__ = _label

def get_max(cols):
    """Return the larger of the first two items of ``cols``.

    ``cols`` may be a list, tuple, array, or a pandas Series such as a row
    handed over by ``DataFrame.apply(..., axis=1)``. For pandas objects the
    items are read through ``.iloc``: plain ``cols[0]`` on a label-indexed
    Series is deprecated positional access and is treated as a label lookup by
    newer pandas versions.
    """
    positional = getattr(cols, 'iloc', cols)
    return max(positional[0], positional[1])

In [None]:
# Monetary: total (shifted) purchase amount per card.
# The aggregation is named by string; passing the builtin `sum` is deprecated
# in recent pandas and would eventually be called element by element.
df_agg_Monetary = df_historical.groupby('card_id').agg({'purchase_amount': 'sum'})
df_agg_Monetary.columns = ['Monetary']
print(df_agg_Monetary.shape)
df_agg_Monetary.head()

In [None]:
# Frequency: number of transactions per card, taken as the larger of the
# non-null counts of `card_id` and `purchase_date`.
df_agg_Frequency = df_historical.groupby('card_id').agg({'card_id': groupby_count,'purchase_date': groupby_count})
# Column-wise maximum instead of a Python-level `apply(get_max, axis=1)`:
# same values, but computed in one vectorised pass over all cards.
df_agg_Frequency['Frequency'] = df_agg_Frequency[['card_id','purchase_date']].max(axis=1)
print(df_agg_Frequency.shape)
df_agg_Frequency.head()

In [None]:
# Latest purchase timestamp in the history; used as the reference date for Recency.
# Series.max() is vectorised, whereas the builtin max() iterates every row in Python.
max_date = df_historical['purchase_date'].max()
print(max_date)

# x = df_historical['purchase_date'][0]
# (max_date-x).days

In [None]:
# Recency: whole days between each card's last purchase and the latest purchase
# in the data set (`max_date`, computed in the previous cell).
df_agg_Recency = df_historical.groupby(['card_id']).agg({'purchase_date': 'max'})
# Vectorised timedelta arithmetic replaces a per-card Python lambda.
df_agg_Recency['Recency'] = (max_date - df_agg_Recency['purchase_date']).dt.days
print(df_agg_Recency.shape)
df_agg_Recency.head(10)

In [None]:
# Join the three per-card aggregates on their shared `card_id` index and keep
# only the Recency / Frequency / Monetary columns.
df_RFM = (
    df_agg_Recency
    .merge(df_agg_Frequency, left_index=True, right_index=True)
    .merge(df_agg_Monetary, left_index=True, right_index=True)
    .drop(columns=['purchase_date_x', 'card_id', 'purchase_date_y'])
)
df_RFM.head(5)

In [None]:
# Distribution of log1p(Recency) over 10 bins: per-bin share (pdf) and running total (cdf).
plt.figure(figsize=(15, 8))
log_recency = np.log1p(df_RFM['Recency'])
counts, bin_edges = np.histogram(log_recency, bins=10, density=True)
pdf = counts / (sum(counts))  # re-normalise so the bin shares add up to 1
cdf = np.cumsum(pdf)
print(pdf)
print(bin_edges)
right_edges = bin_edges[1:]  # both curves are drawn at the right edge of each bin
plt.plot(right_edges, pdf)
plt.plot(right_edges, cdf)
plt.xlabel('log_Recency', fontsize=13)
plt.ylabel('Percentage', fontsize=13)
plt.title('Recency Cumulative Frequency Distribution', fontsize=15)

In [None]:
# Distribution of log1p(Frequency) over 10 bins: per-bin share (pdf) and running total (cdf).
plt.figure(figsize=(15, 8))
log_frequency = np.log1p(df_RFM['Frequency'])
counts, bin_edges = np.histogram(log_frequency, bins=10, density=True)
pdf = counts / (sum(counts))  # re-normalise so the bin shares add up to 1
cdf = np.cumsum(pdf)
print(pdf)
print(bin_edges)
right_edges = bin_edges[1:]  # both curves are drawn at the right edge of each bin
plt.plot(right_edges, pdf)
plt.plot(right_edges, cdf)
plt.xlabel('log_Frequency', fontsize=12)
plt.ylabel('Percentage', fontsize=12)
plt.title('Frequency Cumulative Frequency Distribution', fontsize=15)

In [None]:
# Distribution of log1p(Monetary) over 10 bins: per-bin share (pdf) and running total (cdf).
plt.figure(figsize=(15, 8))
log_monetary = np.log1p(df_RFM['Monetary'])
counts, bin_edges = np.histogram(log_monetary, bins=10, density=True)
pdf = counts / (sum(counts))  # re-normalise so the bin shares add up to 1
cdf = np.cumsum(pdf)
print(pdf)
print(bin_edges)
right_edges = bin_edges[1:]  # both curves are drawn at the right edge of each bin
plt.plot(right_edges, pdf)
plt.plot(right_edges, cdf)
plt.xlabel('log_Monetary', fontsize=12)
plt.ylabel('Percentage', fontsize=12)
plt.title('Monetary Cumulative Frequency Distribution', fontsize=15)

In [None]:
# Pairwise scatter plots of the RFM columns.
plt.close();
sns.set_style("whitegrid");
# `height` is the current name of the per-facet size argument; the former
# `size` keyword was renamed in seaborn and is rejected by newer releases.
sns.pairplot(df_RFM, height=4);
plt.show()

## 1. Simple Rule Based Approach - The Business Analyst Way

In [None]:
df_RFM.quantile(q=[0.1,0.25,0.4,0.5,0.75,0.9])

### Thresholds for R | F | M

In [None]:
# Monetary threshold: median rounded up to the next integer.
# The median is taken on the Monetary column only; `df_RFM.median()` over the
# whole frame fails once non-numeric columns (IsLoyal, Segment, card_id) have
# been added, e.g. when this cell is re-run later in the session.
threshold_M = int(df_RFM['Monetary'].median())+1
print(threshold_M)
# Frequency threshold: 75th percentile.
threshold_F = df_RFM.Frequency.quantile(0.75)
print(threshold_F)
# Recency threshold: 40th percentile.
threshold_R = df_RFM.Recency.quantile(0.40)
print(threshold_R)

In [None]:
# 0/1 flags: recent (Recency below threshold), frequent and high-value
# (Frequency / Monetary above threshold). The comparisons are vectorised;
# the previous element-wise lambdas and row-wise `astype` produced the same
# integers at a far higher cost on a frame with one row per card.
df_RFM['threshold_R'] = (df_RFM['Recency'] < threshold_R).astype(int)
df_RFM['threshold_F'] = (df_RFM['Frequency'] > threshold_F).astype(int)
df_RFM['threshold_M'] = (df_RFM['Monetary'] > threshold_M).astype(int)
# Placeholders that are filled by Loyalty_assign / Segment_assign.
df_RFM['IsLoyal'] = 'NA'
df_RFM['Segment'] = 'NA'
df_RFM.head(10)

In [None]:
def _rfm_flags(x):
    """Return the ``(threshold_R, threshold_F, threshold_M)`` flags of one row.

    Labelled rows (a pandas Series from ``DataFrame.apply(..., axis=1)``) are
    read by column name, so the result does not depend on column order or on
    deprecated positional ``x[3]`` access. Plain sequences fall back to the
    original positional layout: index 3 = R, 4 = F, 5 = M.
    """
    try:
        return x['threshold_R'], x['threshold_F'], x['threshold_M']
    except (KeyError, TypeError, IndexError):
        return x[3], x[4], x[5]


def Loyalty_assign(x):
    """Classify one RFM row as ``'Loyal'`` or ``'Not Loyal'``.

    A row is loyal when it is high-value and frequent (M=1, F=1), or when it is
    recent but not frequent (R=1, F=0).
    """
    r_flag, f_flag, m_flag = _rfm_flags(x)
    if m_flag == 1 and f_flag == 1:
        return 'Loyal'
    if r_flag == 1 and f_flag == 0:
        return 'Loyal'
    return 'Not Loyal'


def Segment_assign(x):
    """Map the (M, R, F) flag combination of one RFM row to a segment name.

    Returns ``None`` when the flags are not 0/1 values.
    """
    r_flag, f_flag, m_flag = _rfm_flags(x)
    if m_flag == 1:
        if r_flag == 1 and f_flag == 1:
            return 'Champions'
        if r_flag == 1 and f_flag == 0:
            return 'Future Champions'
        if r_flag == 0 and f_flag == 1:
            return 'Very Valuable'
        if r_flag == 0 and f_flag == 0:
            return 'Hibernating'
    elif m_flag == 0:
        if r_flag == 1 and f_flag == 1:
            return 'Active'
        if r_flag == 1 and f_flag == 0:
            return 'About to Sleep'
        if r_flag == 0:
            # Low value and not recent: lost regardless of frequency.
            return 'Lost'
    return None

In [None]:
df_RFM['Segment'] = df_RFM.apply(Segment_assign, axis=1)
df_RFM['IsLoyal'] = df_RFM.apply(Loyalty_assign, axis=1)
df_RFM.head()

In [None]:
df_RFM['IsLoyal'].value_counts()

In [None]:
df_RFM['Segment'].value_counts()

In [None]:
df_RFM.reset_index(inplace=True)
df_RFM.head(1)

In [None]:
# Per-segment totals: summed Monetary, number of cards, summed Frequency.
# Aggregations are named by string; passing the builtin `sum` is deprecated in
# recent pandas, and 'count' does not depend on a helper from another cell.
df_Segment = df_RFM.groupby('Segment', as_index=False).agg({'Monetary': 'sum', 'card_id': 'count', 'Frequency': 'sum'})
df_Segment.columns = ['Segment','Monetary', 'No_Cards', 'Frequency']
df_Segment

In [None]:
# Per-loyalty-class totals: summed Monetary, number of cards, summed Frequency.
# Aggregations are named by string; passing the builtin `sum` is deprecated in
# recent pandas, and 'count' does not depend on a helper from another cell.
df_Loyal = df_RFM.groupby('IsLoyal', as_index=False).agg({'Monetary': 'sum', 'card_id': 'count', 'Frequency': 'sum'})
df_Loyal.columns = ['Loyality','Monetary', 'No_Cards', 'Frequency']
df_Loyal

In [None]:
# Pie chart: total Monetary value per rule-based segment.
groups = df_Segment['Segment'].tolist()
amount = df_Segment['Monetary'].tolist()

slice_outline = dict(color='#000000', width=3)
trace = go.Pie(
    labels=groups,
    values=amount,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=25),
    pull=.4,
    hole=.2,
    marker=dict(line=slice_outline),
)

iplot([trace])

In [None]:
# Pie chart: total Monetary value per loyalty class.
groups = df_Loyal['Loyality'].tolist()
amount = df_Loyal['Monetary'].tolist()

slice_outline = dict(color='#000000', width=3)
trace = go.Pie(
    labels=groups,
    values=amount,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=25),
    pull=.4,
    hole=.2,
    marker=dict(line=slice_outline),
)

iplot([trace])

In [None]:
# Stacked bars per segment: summed Frequency with the number of cards on top.
groups = df_Segment['Segment'].tolist()
No_Company = df_Segment['No_Cards'].tolist()
Distinct_Frequency = df_Segment['Frequency'].tolist()

trace1 = go.Bar(x=groups, y=Distinct_Frequency, name='Frequency')
trace2 = go.Bar(x=groups, y=No_Company, name='Cards/Customers')
data = [trace1, trace2]
layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')

In [None]:
# Stacked bars per loyalty class: summed Frequency with the number of cards on top.
groups = df_Loyal['Loyality'].tolist()
No_Company = df_Loyal['No_Cards'].tolist()
Distinct_Frequency = df_Loyal['Frequency'].tolist()

trace1 = go.Bar(x=groups, y=Distinct_Frequency, name='Frequency')
trace2 = go.Bar(x=groups, y=No_Company, name='Cards/Customers')
data = [trace1, trace2]
layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')

## 2. Clustering Algorithms Based Segmentation

In [None]:
# df_rfm.set_index('card_id', inplace=True)
# df_rfm.head(1)

In [None]:
rank_df = df_RFM[['Recency','Frequency', 'Monetary']].rank(method='first')
rank_df.head(2)

In [None]:
normalized_df = (rank_df - rank_df.mean()) / rank_df.std()
normalized_df.head(2)

In [None]:
from sklearn.cluster import KMeans
data = normalized_df[['Recency', 'Frequency', 'Monetary']]

# Elbow method: fit k-means for k = 1..9 and record the inertia of each fit.
# The labels are deliberately NOT written back into `data`: doing so would add
# a `clusters` column that every later fit would treat as a fourth feature,
# distorting the SSE curve for k >= 2.
sse = {}
for k in range(1, 10):
    # Fixed seed so the curve is reproducible across runs.
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=42).fit(data)
    sse[k] = kmeans.inertia_ # Inertia: sum of squared distances of samples to their closest cluster center
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
print(normalized_df.shape)

In [None]:
from sklearn.cluster import KMeans

# Final segmentation with 4 clusters on the rank-normalised RFM features.
# A fixed seed keeps the cluster ids (and therefore the colours and counts
# reported below) stable between runs.
kmeans = KMeans(n_clusters=4,max_iter=1000, random_state=42).fit(normalized_df[['Recency', 'Frequency', 'Monetary']])
print(kmeans.labels_)
print(kmeans.cluster_centers_)

In [None]:
from collections import Counter
z = kmeans.labels_
Counter(z)

In [None]:
normalized_df['Cluster'] = kmeans.labels_
normalized_df.head()

In [None]:
df_kmeans_RFM = df_RFM[['Recency','Frequency', 'Monetary']].copy()
df_kmeans_RFM['Cluster'] = kmeans.labels_
print(df_kmeans_RFM['Cluster'].value_counts())
df_kmeans_RFM.head(5)

In [None]:
# One colour per cluster (4 clusters were fitted above).
colors = ['red', 'yellow', 'green', 'orange']

# Pie chart of Monetary value by k-means cluster.
# NOTE(review): one label/value pair per card is passed to Plotly here;
# presumably Plotly sums values that share a label -- confirm, and consider
# aggregating with groupby('Cluster') first to keep the figure payload small.
for i in ['Monetary']:
    trace = go.Pie(labels=df_kmeans_RFM['Cluster'], values=df_kmeans_RFM[i], 
           hoverinfo='label+percent', textinfo='value', textfont=dict(size=15),
           marker=dict(colors=colors, line=dict(color='#000000', width=3)))
    iplot([trace])

In [None]:
df_kmeans_RFM.head(5)

In [None]:
# One box plot per (rank-normalised) RFM feature, split by k-means cluster.
for col in ("Recency", "Frequency", "Monetary"):
    print()
    plt.figure(figsize=(14, 8))
    ax = sns.boxplot(data=normalized_df, x="Cluster", y=col)
    plt.title('Cluster/Segment wise Difference in Purchase {}'.format(col), fontsize=15)
    plt.show()
    print()

# ********** CLV Prediction using RFM**************

For the CLV models, the following components are used:

* Recency - This represents the age of the customer when they made their latest transactions. (Current_date - last_transaction_date)
* Frequency - This represents the total number of transactions/number of visits a customer has made. (Count of total transactions)
* Monetary - This represents the total purchase amount that a specified customer has made. (Sum of purchase_amt)
* Time - This represents the age of the customer. Time span between a customer’s first and last transaction.

In [None]:
import numpy as np 
import pandas as pd

# hist = pd.read_csv('../input/elo-merchant-category-recommendation/historical_transactions.csv')

In [None]:
# Sort the history by card and purchase time for the CLV features below.
# All columns are kept: the one-hot-encoding cell at the end of the notebook
# still needs `category_2` / `category_3` from this frame, and the CLV
# aggregations only read card_id, purchase_date and purchase_amount.
df_historical = df_historical.sort_values(by=['card_id', 'purchase_date'], ascending=[True, True])

In [None]:
df_historical.head()

In [None]:
## Time and Recency
from datetime import datetime

# Last (Max) and first (Min) purchase timestamp per card.
z = df_historical.groupby('card_id')['purchase_date'].max().reset_index()
q = df_historical.groupby('card_id')['purchase_date'].min().reset_index()

z.columns = ['card_id', 'Max']
q.columns = ['card_id', 'Min']

## Extracting current timestamp (truncated to whole seconds)
# NOTE(review): Recency is measured against the wall clock, so its values grow
# every day the notebook is re-run; the RFM section above uses the latest
# purchase date instead -- confirm which reference date is intended.
now = datetime.now()
curr_date = pd.Timestamp(now).floor('s')

rec = pd.merge(z,q,how = 'left',on = 'card_id')
rec['Min'] = pd.to_datetime(rec['Min'])
rec['Max'] = pd.to_datetime(rec['Max'])

# `.dt.days` gives whole days; `.astype('timedelta64[D]')` is not a supported
# conversion on pandas >= 2.0.

## Recency value
rec['Recency'] = (curr_date - rec['Max']).dt.days ## current date - most recent date

## Time value
rec['Time'] = (rec['Max'] - rec['Min']).dt.days ## Age of customer, MAX - MIN

rec = rec[['card_id','Time','Recency']]
rec.head()

In [None]:
## Frequency: number of transactions per card
freq = (
    df_historical
    .groupby('card_id')
    .size()
    .reset_index(name='Frequency')
)
freq.head()

In [None]:
## Monetary: total purchase amount per card
mon = (
    df_historical
    .groupby('card_id', as_index=False)['purchase_amount']
    .sum()
    .rename(columns={'purchase_amount': 'Monitary'})
)
mon.head()

In [None]:
# Combine the per-card Frequency, Monetary ('Monitary') and Time/Recency tables on card_id.
final = pd.merge(freq,mon,how = 'left', on = 'card_id')
final = pd.merge(final,rec,how = 'left', on = 'card_id')

# historic_CLV is computed here as transaction count x total amount.
# NOTE(review): confirm this product is the intended definition of historic CLV.
final['historic_CLV'] = final['Frequency'] * final['Monitary'] 
final['AOV'] = final['Monitary']/final['Frequency'] ## AOV - Average order value (i.e) total_purchase_amt/total_trans
# Predictive_CLV is the plain product Time x AOV x Monitary x Recency.
# NOTE(review): Recency depends on the date the notebook is run, so this column does too.
final['Predictive_CLV'] = final['Time']*final['AOV']*final['Monitary']*final['Recency'] 

final.head()

Above features may boost our model performance

# ****************Some EDA with the Date Variables*************

In [None]:
import warnings
import datetime
import calendar
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import time
from dateutil.relativedelta import relativedelta

# to ignore future warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

# set size of seaborn plots
sns.set(rc={'figure.figsize':(10, 7)})

In [None]:
train = pd.read_csv('../input/elo-merchant-category-recommendation/train.csv', sep = ',')
test = pd.read_csv('../input/elo-merchant-category-recommendation/test.csv', sep = ',')
merchants = pd.read_csv('../input/elo-merchant-category-recommendation/merchants.csv', sep = ',')
new_merchant = pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv', sep = ',')

In [None]:
#df_train = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/train.csv"))
#df_test = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/test.csv"))
#df_historical =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/historical_transactions.csv",parse_dates=['purchase_date']))
#df_new =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/new_merchant_transactions.csv",parse_dates=['purchase_date']))

In [None]:
#train = df_train
#test = df_test
#merchants = pd.read_csv('../input/elo-merchant-category-recommendation/merchants.csv', sep = ',')
#new_merchant = 

In [None]:
## prepare data
# this is not a valid approach if you want to build models from the data
# drop some redundant columns
dropping = ['merchant_category_id', 'subsector_id', 'category_1', 'city_id', 'state_id',
            'category_2']
merchants = merchants.drop(columns = dropping)

# merge merchants with new_merchants, then attach the train data by card
data = (
    merchants
    .merge(new_merchant, on = 'merchant_id')
    .merge(train, on = 'card_id')
)

The variables in question here are first_active_month and purchase_date. Let's take a brief look at their number of unique values and their first five values:

In [None]:
print(len(data['first_active_month'].unique()))
data['first_active_month'][:5]

In [None]:
print(len(data['purchase_date'].unique()))
data['purchase_date'][:5]

In [None]:
#This shows us that the purchase_date variable actually carries more information than its name suggests. It contains not only the date but also the specific time. Let's recode this into two variables:

In [None]:
# recode purchase_date: split "<date> <time>" once, then store the two halves
date_and_time = data['purchase_date'].str.split(' ')
data['purchase_time'] = date_and_time.str[1]
data['purchase_date'] = date_and_time.str[0]

Now there are two different strategies to use these time variables in further models.

1. Recode them to a linear variable where the lowest number is the day furthest in the past and the highest number is the most recent day.
2. Recode them to ordered categorical variables.

Let's start with number 1:

In [None]:
def dates_to_numeric(series, kind = 'month'):
    """Number every calendar month (or day) between the earliest and latest value of ``series``.

    Parameters
    ----------
    series : pandas.Series of str
        Dates written as ``'YYYY-MM'`` (``kind='month'``) or ``'YYYY-MM-DD'``
        (``kind='day'``). Missing values are ignored.
    kind : {'month', 'day'}, default 'month'
        Granularity of the numbering.

    Returns
    -------
    dict
        Maps each date string from the earliest to the latest value, both
        inclusive, to a running integer starting at 0. Months/days that do not
        occur in ``series`` still get a number, so distances stay linear in time.
        An empty dict is returned when ``series`` has no non-missing values.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'month'`` nor ``'day'``.
    """
    # Local import under a private alias: the bare name `datetime` is bound to
    # both the module and the class in different cells of this notebook.
    import datetime as _dt

    formats = {'month': "%Y-%m", 'day': "%Y-%m-%d"}
    if kind not in formats:
        raise ValueError("kind must be 'month' or 'day', got {!r}".format(kind))
    date_string = formats[kind]

    parsed = [_dt.datetime.strptime(ts, date_string).date()
              for ts in series.dropna().unique()]
    if not parsed:
        return {}
    cur_date, end = min(parsed), max(parsed)

    new_dict = {}
    # `<=` so that the most recent month/day is part of the mapping as well.
    while cur_date <= end:
        if kind == 'month':
            new_dict[cur_date.isoformat()[:7]] = len(new_dict)
            # Month values are parsed to the 1st of the month, so replacing
            # year/month can never produce an invalid day.
            cur_date = cur_date.replace(year = cur_date.year + cur_date.month // 12,
                                        month = cur_date.month % 12 + 1)
        else:
            new_dict[cur_date.isoformat()] = len(new_dict)
            cur_date += _dt.timedelta(days = 1)
    return new_dict

# Series.map with a dict is a vectorised lookup (unmatched values become NaN);
# it replaces a Python-level lambda call per row.
new_dict = dates_to_numeric(data['first_active_month'])
data['first_active_month_numeric'] = data['first_active_month'].map(new_dict)

new_dict = dates_to_numeric(data['purchase_date'], kind = 'day')
data['purchase_date_numeric'] = data['purchase_date'].map(new_dict)

# recode timestamp to number of seconds passed since 00:00:00
def timestamp_to_seconds(time):
    """Convert an ``'HH:MM:SS'`` string into the number of seconds since 00:00:00.

    Parts are weighted 3600 / 60 / 1 from left to right; parts beyond the third
    are ignored and missing trailing parts count as zero.
    """
    weights = (3600, 60, 1)
    total = 0
    for weight, part in zip(weights, time.split(':')):
        total += weight * int(part)
    return total

data['purchase_seconds'] = data['purchase_time'].apply(lambda x: timestamp_to_seconds(x))

In [None]:
ax = sns.regplot(x = data['first_active_month_numeric'], y = data['target'], marker = "+",
                 lowess = True, line_kws = {'color': 'black'})
ax.set_title('Relationship of the target variable and linear first active month')
ax.set_xlabel('first active month linear')

In [None]:
ax = sns.regplot(x = data['purchase_date_numeric'], y = data['target'], marker = "+",
                 lowess = True, line_kws = {'color': 'black'})
ax.set_title('Relationship of the target variable and linear purchase date')
ax.set_xlabel('purchase date linear')

In [None]:
#ax = sns.regplot(x = data['purchase_seconds'], y = data['target'], marker = "+",
                 #lowess = True, line_kws = {'color': 'black'})
#ax.set_xlabel('purchase seconds linear')
# This takes incredibly long and is therefore commented out. It however looks very similar to the plot above

These plots show no relationship between the numeric versions of the three date variables and the target variable. By using the LOWESS smoother instead of a linear regression, we also made sure that there is no nonlinear relationship between the variables.

Now that this is out of the way, let's dive into the more reasonable data transformations. For the purchase_date variable we will create a new variable that contains the name of the corresponding weekday (e.g. monday, tuesday ..). For the purchase_time variable we will create a new categorical variable with 4 categories: Morning, Afternoon, Evening and Night. These correspond to:

Morning: 5am to 12pm (05:00 to 11:59)

Afternoon: 12pm to 5pm (12:00 to 16:59)

Evening: 5pm to 9pm (17:00 to 20:59)

Night: 9pm to 5am (21:00 to 04:59)

We will also create two variables containing the corresponding month (e.g. January, February ...) for purchase_date and first_active_month. For the latter we will create a first_active_year variable as well. Let's start!

EDIT: I forgot that the time of the month itself (beginning, end etc.) may have an effect on the target variable as well. This will be explored too in this second version. We will look both at a categorical version and a numeric version just using the day.

In [None]:
def get_weekday(date_string):
    """Return the weekday name (from ``calendar.day_name``) of a ``'YYYY-MM-DD'`` date string."""
    parsed = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    weekday_index = parsed.weekday()  # Monday == 0 ... Sunday == 6
    return calendar.day_name[weekday_index]

# get weekday for date variable
data['purchase_weekday'] = data['purchase_date'].apply(lambda x: get_weekday(x))

# for plotting recode to ordered categorical
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data['purchase_weekday'] = pd.Categorical(data['purchase_weekday'], categories = day_labels, 
                                          ordered = True)

def get_month(date_string, kind = 'month'):
    """Return the full month name (e.g. ``'March'``) of a date string.

    Parameters
    ----------
    date_string : str
        ``'YYYY-MM'`` when ``kind='month'``, ``'YYYY-MM-DD'`` when ``kind='day'``.
    kind : {'month', 'day'}, default 'month'
        Format of ``date_string``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values, or if ``date_string``
        does not match the expected format.
    """
    formats = {'month': '%Y-%m', 'day': '%Y-%m-%d'}
    if kind not in formats:
        # Previously an unknown kind fell through and failed with an
        # UnboundLocalError on `date`.
        raise ValueError("kind must be 'month' or 'day', got {!r}".format(kind))
    date = datetime.datetime.strptime(date_string, formats[kind])
    return date.strftime("%B")

data['purchase_month'] = data['purchase_date'].apply(lambda x: get_month(x, kind = 'day'))
data['first_active_month2'] = data['first_active_month'].apply(lambda x: get_month(x))
data['first_active_year'] = data['first_active_month'].str[:4]

month_labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                'September', 'October', 'November', 'December']
data['purchase_month'] = pd.Categorical(data['purchase_month'], categories = month_labels, 
                                          ordered = True)
data['first_active_month2'] = pd.Categorical(data['first_active_month2'], categories = month_labels, 
                                          ordered = True)

year_labels = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
data['first_active_year'] = pd.Categorical(data['first_active_year'], categories = year_labels, 
                                          ordered = True)

# get time of the day
data['temp'] = data['purchase_time'].str.split(':')

def get_session(time_list):
    """Classify a purchase time into ``'Morning'``, ``'Afternoon'``, ``'Evening'`` or ``'Night'``.

    Parameters
    ----------
    time_list : sequence
        Time split on ``':'``; only the first item (the hour) is read.
        The sequence is not modified.

    Returns
    -------
    str
        Morning for hours 5-11, Afternoon for 12-16, Evening for 17-20,
        Night otherwise (21-4).
    """
    # Work on a local value: writing the int back into time_list[0] would
    # alter the caller's list (the lists stored in data['temp']).
    hour = int(time_list[0])
    if 4 < hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'
    
data['purchase_session'] = data['temp'].apply(lambda x: get_session(x))

session_labels = ['Morning', 'Afternoon', 'Evening', 'Night']
data['purchase_session'] = pd.Categorical(data['purchase_session'], categories = session_labels, 
                                          ordered = True)
## time of month
# as categorical variable, thressholds are arbitrary and could be different
def get_time_of_month_cat(date):
    """Bucket a ``'YYYY-MM-DD'`` date string by its day of month.

    Days 1-9 are ``'Beginning'``, 10-19 ``'Middle'`` and 20 onwards ``'End'``.
    """
    day = int(date.split('-')[2])
    if day < 10:
        return 'Beginning'
    if day < 20:
        return 'Middle'
    return 'End'

data['time_of_month_cat'] = data['purchase_date'].apply(lambda x: get_time_of_month_cat(x))

tof_labels = ['Beginning', 'Middle', 'End']
data['time_of_month_cat'] = pd.Categorical(data['time_of_month_cat'], categories = tof_labels, 
                                           ordered = True)

data['time_of_month_num'] = data['purchase_date'].str[8:].astype(int)

In [None]:
ax = sns.lineplot(x = "purchase_weekday", y = "target", 
                  markers = True, dashes = False, data = data)
plt.xticks(rotation = 45)
ax.set_title('Target Variable Changes over Purchase Week')
ax.set_xlabel('Purchase Weekday')

There is a pattern here. The target variable follows a non-linear curve over the weekdays. The differences may be small, but they are statistically significant.

In [None]:
ax = sns.lineplot(x = "purchase_month", y = "target", 
                  markers = True, dashes = False, data = data)
plt.xticks(rotation = 45)
ax.set_title('Target Variable Changes over Purchase Month')
ax.set_xlabel('Purchase Month')

Now this is more like it! There are pretty big differences in the mean of the target variable between the months of purchase. In January the mean is close to -2, while in April it is at -0.25. Adding the purchase month as dummy variables to your model looks promising.

In [None]:
ax = sns.lineplot(x = "first_active_month2", y = "target", 
                  markers = True, dashes = False, data = data)
plt.xticks(rotation = 45)
ax.set_title('Target Variable Changes over the First Active Month')
ax.set_xlabel('First Active Month')

There are some clear variations in this plot as well, which follow almost a step-wise linear pattern. The differences might not be nearly as strong as in the purchase month plot, but this still might help improve your model.

In [None]:
ax = sns.lineplot(x = "first_active_year", y = "target", 
                  markers = True, dashes = False, data = data)
plt.xticks(rotation = 45)
ax.set_title('Target Variable Changes over the First Active Year')
ax.set_xlabel('First Active Year')

The first active year shows some big differences! The target variable increases with each year. Definitely an interesting pattern.

In [None]:
ax = sns.lineplot(x = "purchase_session", y = "target", 
                  markers = True, dashes = False, data = data)
plt.xticks(rotation = 45)
ax.set_title('Target Variable Changes over Purchase Time of Day')
ax.set_xlabel('Purchase Time of Day')

Differences over the day are rather small, but follow a clear pattern as well. Let's see if that pattern holds up if we look at it by different week days:

In [None]:
ax = sns.catplot(x = 'purchase_weekday', y = 'target', hue = 'purchase_session', data = data,
                kind = 'bar', height = 5, aspect = 2)
ax.despine(left = True)
plt.xticks(rotation = 45)
ax.set_ylabels("target")
ax.set_xlabels('Weekday')

It seems like it does. The pattern seems to be nearly the same on all weekdays. There are some differences between saturday and tuesday though.

All in all there are some interesting relationships in the date variables of this data set

In [None]:
ax = sns.regplot(x = data['time_of_month_num'], y = data['target'], marker = "+",
                 lowess = True, line_kws = {'color': 'black'})
ax.set_title('Relationship of the target variable and purchase time of month')
ax.set_xlabel('time of purchase inside month')

Using just the day itself doesn't seem very useful. No direct relationship can be observed. Now let's look at the categorical version:

In [None]:
ax = sns.lineplot(x = "time_of_month_cat", y = "target", 
                  markers = True, dashes = False, data = data)
plt.xticks(rotation = 45)
ax.set_title('Target Variable Changes over Purchase Time of Month')
ax.set_xlabel('Purchase Time of Month')

The pattern is really small, with the biggest deviation being between -0.52 and -0.66, but it does exist. I am not sure if this will be particularly useful, but it's definitely worth a try. Let's also check whether this pattern is the same in each month:

In [None]:
ax = sns.catplot(x = 'purchase_month', y = 'target', hue = 'time_of_month_cat', data = data,
                kind = 'bar', height = 5, aspect = 2)
ax.despine(left = True)
plt.xticks(rotation = 45)
ax.set_ylabels("Target")
ax.set_xlabels('Purchase Time of Month')

The pattern obviously isn't really stable at all. The only consistent finding is that the end of the month seemingly always has a higher score on the target variable than other parts. If only the very end of the month has an effect on the target variable, we might still be able to utilize this. Let's try by creating a dummy variable that only looks at the very last days of the month:

In [None]:
def get_end_of_month(date):
    """Flag whether a purchase date falls on day 25 or later of its month.

    Parameters
    ----------
    date : str
        Date whose third ``-``-separated field starts with the day of month,
        e.g. ``'2018-02-25'``. A trailing time component such as
        ``'2018-02-25 13:45:00'`` is tolerated. Non-string values are
        converted with ``str()`` first.

    Returns
    -------
    str
        ``'Yes'`` when the day of month is >= 25, otherwise ``'No'``.
    """
    day_field = str(date).split('-')[2]
    # Only the first two characters can belong to the day; anything after
    # them (" 13:45:00", "T13:45:00") is a time component.
    day_of_month = int(day_field[:2])
    return 'Yes' if day_of_month >= 25 else 'No'

# Derive the Yes/No end-of-month flag for every purchase and compare the
# mean target of the two groups.
end_of_month_flags = data['purchase_date'].apply(get_end_of_month)
data['end_of_month'] = end_of_month_flags

ax = sns.barplot(data = data, x = 'end_of_month', y = 'target')

This shows bigger differences than before.

# Implementing Models

In [None]:
new_transactions = df_new#pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/new_merchant_transactions.csv',
                               #parse_dates=['purchase_date'])

historical_transactions = df_historical#pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/historical_transactions.csv',
                                      #parse_dates=['purchase_date'])

def read_data(input_file):
    """Load a train/test CSV and add an ``elapsed_time`` feature.

    Parameters
    ----------
    input_file : str or file-like
        Anything accepted by ``pd.read_csv``; must contain a
        ``first_active_month`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``first_active_month`` parsed to datetime and
        ``elapsed_time`` holding the whole days between that month and
        2018-02-01.
    """
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    # Subtract on datetime64 values directly: going through `.dt.date`
    # produces an object-dtype Series on which `.dt.days` is not reliable.
    reference_date = pd.Timestamp(2018, 2, 1)
    df['elapsed_time'] = (reference_date - df['first_active_month']).dt.days
    return df
#_________________________________________
train = df_train#read_data('/kaggle/input/elo-merchant-category-recommendation/train.csv')
test = df_test#read_data('/kaggle/input/elo-merchant-category-recommendation/test.csv')

target = train['target']
print(target.shape)
print(test.shape)

In [None]:
#df_train = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/train.csv"))
#df_test = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/test.csv"))
#df_historical =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/historical_transactions.csv",parse_dates=['purchase_date']))
#df_new =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/new_merchant_transactions.csv",parse_dates=['purchase_date']))

In [None]:
historical_transactions.head()

**Preprocessing**

One Hot Encoding



In [None]:
# One-hot encode 'category_2' and 'category_3' in both transaction tables.
# Both results are built from the raw frames (df_historical / df_new), so the
# cell can be re-run: encoding the already-encoded `new_transactions` again
# would fail because its 'category_2' / 'category_3' columns are gone.
historical_transactions = pd.get_dummies(df_historical, columns=['category_2', 'category_3'])
new_transactions = pd.get_dummies(df_new, columns=['category_2', 'category_3'])

Date processing



In [None]:
# Calendar features and 0/1 flags, added in place to both transaction tables.
yes_no_to_int = {'Y': 1, 'N': 0}
for df in (historical_transactions, new_transactions):
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt
    df['year'] = purchase_dt.year
    df['weekofyear'] = purchase_dt.weekofyear
    df['month'] = purchase_dt.month
    df['dayofweek'] = purchase_dt.dayofweek
    # Saturday (5) and Sunday (6) count as weekend.
    df['weekend'] = (purchase_dt.weekday >= 5).astype(int)
    df['hour'] = purchase_dt.hour
    for flag_col in ('authorized_flag', 'category_1'):
        df[flag_col] = df[flag_col].map(yes_no_to_int)
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    # Months between the purchase and "now" (30-day months), shifted by month_lag.
    months_since_purchase = ((dt.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] = months_since_purchase + df['month_lag']

Reduce memory usage

In [None]:
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

Feature Engineering


helper function to apply aggregations on existing features to create new features



In [None]:
def aggregate_transactions(history):
    """Aggregate transaction rows into one feature row per ``card_id``.

    Parameters
    ----------
    history : pandas.DataFrame
        Transaction-level frame containing ``card_id``, a datetime
        ``purchase_date`` and every column named in ``agg_func`` below.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id`` with ``transactions_count`` followed by the
        aggregated columns, named ``<column>_<aggregation>``.
    """
    # Work on a shallow copy: only the 'purchase_date' column is replaced, and
    # only on the copy. Overwriting it on the caller's frame would make a
    # second call reinterpret the float seconds as nanosecond timestamps.
    history = history.copy(deep=False)
    # Seconds since the Unix epoch, as float.
    history['purchase_date'] = pd.DatetimeIndex(history['purchase_date']).\
                               astype(np.int64) * 1e-9

    agg_func = {
    # Listed once: a repeated dict key would silently discard the first entry.
    'authorized_flag': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'category_2_1.0': ['mean'],
    'category_2_2.0': ['mean'],
    'category_2_3.0': ['mean'],
    'category_2_4.0': ['mean'],
    'category_2_5.0': ['mean'],
    'category_3_A': ['mean'],
    'category_3_B': ['mean'],
    'category_3_C': ['mean'],
    'merchant_id': ['nunique'],
    'merchant_category_id': ['nunique'],
    'state_id': ['nunique'],
    'city_id': ['nunique'],
    'subsector_id': ['nunique'],
    'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
    'installments': ['sum', 'mean', 'max', 'min', 'std'],
    # np.ptp = max - min, i.e. the time span covered by the card's purchases.
    'purchase_date': [np.ptp, 'min', 'max'],
    'month_lag': ['mean', 'max', 'min', 'std'],
    'month_diff': ['mean'],
    'month': ['nunique'],
    'hour': ['nunique'],
    'weekofyear': ['nunique'],
    'dayofweek': ['nunique'],
    'year': ['nunique'],
    'weekend': ['sum', 'mean']
    }

    agg_history = history.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, aggregation) MultiIndex into 'column_aggregation'.
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)

    # Number of transactions per card.
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))

    agg_history = pd.merge(df, agg_history, on='card_id', how='left')

    return agg_history

history stores aggregated results from historical transactions



In [None]:
# Per-card aggregates of the historical transactions; every feature column
# gets a 'hist_' prefix while the join key 'card_id' keeps its name.
history = aggregate_transactions(historical_transactions)
history = history.rename(columns=lambda name: name if name == 'card_id' else 'hist_' + name)
history.head(5)

new stores aggregated results from new merchant transactions



In [None]:
# Per-card aggregates of the new-merchant transactions; every feature column
# gets a 'new_' prefix while the join key 'card_id' keeps its name.
new = aggregate_transactions(new_transactions)
new = new.rename(columns=lambda name: name if name == 'card_id' else 'new_' + name)
new.head(5)

**Combine dataframes to train and test dataframes**

joining datasets on the common id, card_id for both train and test



In [None]:
# Left-join both aggregate tables onto train and test by card_id, so cards
# without any transactions are kept (their aggregate columns become NaN).
train = (train.merge(history, on='card_id', how='left')
              .merge(new, on='card_id', how='left'))
test = (test.merge(history, on='card_id', how='left')
            .merge(new, on='card_id', how='left'))

# Every fifth row of the historical aggregates.
history.iloc[::5]

Impute missing values



In [None]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()

In [None]:
# get features and remove any that have the incorrect data type for a data frame 
# Everything except the label, the datetime column and the string id is a feature.
non_feature_cols = ('target', 'first_active_month', 'card_id')
feature_cols = [col for col in train.columns if col not in non_feature_cols]

# Feature matrix with missing values imputed; the imputer is fitted here on
# the training features.
X = my_imputer.fit_transform(train[feature_cols])

# Target vector.
y = train['target']

Split test and training set from train dataframe

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=6)

**Training several regression models**

Importing training models



In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

reg_predictions = []

Train using KNeighborsRegressor



In [None]:
# 3-nearest-neighbours regressor: fit on the training split, predict the hold-out split.
myKNeighborsReg = KNeighborsRegressor(n_neighbors=3)
y_predict_myKNeighborsReg = myKNeighborsReg.fit(X_train, y_train).predict(X_test)

# Collected for the RMSE comparison further down.
reg_predictions += [y_predict_myKNeighborsReg]

Train using DecisionTreeRegressor



In [None]:
# Decision tree (fixed seed): fit on the training split, predict the hold-out split.
myDecisionTreeReg = DecisionTreeRegressor(random_state=5)
y_predict_myDecisionTreeReg = myDecisionTreeReg.fit(X_train, y_train).predict(X_test)

# Collected for the RMSE comparison further down.
reg_predictions += [y_predict_myDecisionTreeReg]

Train using LinearRegression

In [None]:
# Ordinary least squares: fit on the training split, predict the hold-out split.
myLinearReg = LinearRegression()
y_predict_myLinearReg = myLinearReg.fit(X_train, y_train).predict(X_test)

# Collected for the RMSE comparison further down.
reg_predictions += [y_predict_myLinearReg]

Train using RandomForestRegressor

In [None]:
# Random forest of 9 bootstrapped trees (fixed seed): fit on the training
# split, predict the hold-out split.
myRandomForestReg = RandomForestRegressor(n_estimators=9, bootstrap=True, random_state=3)
y_predict_myRandomForestReg = myRandomForestReg.fit(X_train, y_train).predict(X_test)

# Collected for the RMSE comparison further down.
reg_predictions += [y_predict_myRandomForestReg]

print(X.shape)

**Check RMSE**

In [None]:
from sklearn import metrics

# Labels are matched to predictions purely by position, so they must follow
# the order in which the models appended to reg_predictions.
model_labels = ['K Nearest Neighbor: ', 'Decision Tree: ', 'Linear Regression: ', 'Random Forest: ']
for model, y_prediction in zip(model_labels, reg_predictions):
    mse = metrics.mean_squared_error(y_test, y_prediction)
    rmse = np.sqrt(mse)
    print(model + str(rmse))

# **Dimensionality Reduction**

In [None]:
from sklearn.decomposition import PCA
n = 45  # number of principal components (new features) kept after dimensionality reduction
my_pca = PCA(n_components = n)
# X_train / X_test are the feature matrices before reduction; X_Train_new /
# X_Test_new are their n-component projections. The PCA is fitted on the
# training split only and the same fitted projection is then applied to the
# hold-out split.
X_Train_new = my_pca.fit_transform(X_train)
X_Test_new = my_pca.transform(X_test)

In [None]:
my_pca.explained_variance_ratio_

In [None]:
my_pca.n_components_

In [None]:
X_train.shape

In [None]:
X_Train_new.shape

In [None]:
X_test.shape

In [None]:
# NOTE(review): `final_test_new` is only assigned further down, in the
# "Training and testing using dimensionality reduction" section
# (final_test_new = my_pca.transform(final_test)). On a fresh top-to-bottom run
# this cell presumably raises NameError - confirm and move it below that cell.
final_test_new.shape

In [None]:
X_Test_new.shape

In [None]:
reg_predictions_new = []

Train using KNeighborsRegressor



# 3-nearest-neighbours regressor on the PCA-reduced split.
myKNeighborsReg = KNeighborsRegressor(n_neighbors=3)
y_predict_myKNeighborsReg = myKNeighborsReg.fit(X_Train_new, y_train).predict(X_Test_new)

# Collected for the RMSE comparison further down.
reg_predictions_new += [y_predict_myKNeighborsReg]

Train using DecisionTreeRegressor

In [None]:
# Decision tree (fixed seed) on the PCA-reduced split.
myDecisionTreeReg = DecisionTreeRegressor(random_state=5)
y_predict_myDecisionTreeReg = myDecisionTreeReg.fit(X_Train_new, y_train).predict(X_Test_new)

# Collected for the RMSE comparison further down.
reg_predictions_new += [y_predict_myDecisionTreeReg]

Train using LinearRegression

In [None]:
# Ordinary least squares on the PCA-reduced split.
myLinearReg = LinearRegression()
y_predict_myLinearReg = myLinearReg.fit(X_Train_new, y_train).predict(X_Test_new)

# Collected for the RMSE comparison further down.
reg_predictions_new += [y_predict_myLinearReg]

Train using RandomForestRegressor

In [None]:
# Random forest of 9 bootstrapped trees (fixed seed) on the PCA-reduced split.
myRandomForestReg = RandomForestRegressor(n_estimators=9, bootstrap=True, random_state=3)
y_predict_myRandomForestReg = myRandomForestReg.fit(X_Train_new, y_train).predict(X_Test_new)

# Collected for the RMSE comparison further down.
reg_predictions_new += [y_predict_myRandomForestReg]

print(X.shape)

In [None]:
# Labels are matched to predictions purely by position, so they must follow
# the order in which the models appended to reg_predictions_new.
model_labels = ['K Nearest Neighbor: ', 'Decision Tree: ', 'Linear Regression: ', 'Random Forest: ']
for model, y_prediction in zip(model_labels, reg_predictions_new):
    mse = metrics.mean_squared_error(y_test, y_prediction)
    rmse = np.sqrt(mse)
    print(model + str(rmse))

**Repeating above same steps on actual test dataframe**

Training and Testing on all new features



In [None]:
test_feature_cols = [col for col in test.columns if col not in ['target', 'first_active_month', 'card_id']]
# Select exactly the columns (and column order) the imputer and the models
# were fitted on.
final_test = test[feature_cols]
# Re-use the statistics learned from the training features. Re-fitting the
# imputer on the test set would fill gaps with test-set means and could drop
# a different set of all-NaN columns than the one dropped for X.
final_test = my_imputer.transform(final_test)

# Maps a model label to its predictions for the test set.
reg_predictions_final = {}

Train using KNeighborsRegressor



In [None]:
# 3-nearest-neighbours regressor trained on all labelled rows, applied to the test set.
myKNeighborsReg = KNeighborsRegressor(n_neighbors=3)
y_predict_myKNeighborsReg = myKNeighborsReg.fit(X, y).predict(final_test)

reg_predictions_final.update({'K Nearest Neighbor: ': y_predict_myKNeighborsReg})

Train using DecisionTreeRegressor



In [None]:
# Decision tree (fixed seed) trained on all labelled rows, applied to the test set.
myDecisionTreeReg = DecisionTreeRegressor(random_state=5)
y_predict_myDecisionTreeReg = myDecisionTreeReg.fit(X, y).predict(final_test)

reg_predictions_final.update({'Decision Tree: ': y_predict_myDecisionTreeReg})

Train using LinearRegression

In [None]:
# Ordinary least squares trained on all labelled rows, applied to the test set.
myLinearReg = LinearRegression()
y_predict_myLinearReg = myLinearReg.fit(X, y).predict(final_test)

reg_predictions_final.update({'Linear Regression: ': y_predict_myLinearReg})

Train using RandomForestRegressor

In [None]:
# Random forest of 9 bootstrapped trees (fixed seed) trained on all labelled
# rows, applied to the test set.
myRandomForestReg = RandomForestRegressor(n_estimators=9, bootstrap=True, random_state=3)
y_predict_myRandomForestReg = myRandomForestReg.fit(X, y).predict(final_test)

reg_predictions_final.update({'Random Forest: ': y_predict_myRandomForestReg})

print(X.shape)

In [None]:
# NOTE(review): `target` holds the labels of the training rows, whereas these
# predictions were made for the rows of `test`. The first len(prediction)
# training labels are compared with predictions for different cards, so this
# number presumably does not measure model accuracy - confirm intent.
for model, y_prediction in reg_predictions_final.items():
    reference = target.iloc[:len(y_prediction)]
    mse = metrics.mean_squared_error(reference, y_prediction)
    rmse = np.sqrt(mse)
    print(model + str(rmse))

Training and testing using dimensionality reduction



In [None]:
# Maps a model label to its test-set predictions made on PCA-reduced features.
reg_predictions_final_dr = {}

# Number of principal components kept.
n = 45

my_pca = PCA(n_components = n)

# Fit the projection on the full (imputed) training features, then apply the
# same fitted projection to the test features.
X_new = my_pca.fit_transform(X)
final_test_new = my_pca.transform(final_test)

Train using KNeighborsRegressor

In [None]:
# 3-nearest-neighbours regressor on the PCA-reduced full training data.
myKNeighborsReg = KNeighborsRegressor(n_neighbors=3)
y_predict_myKNeighborsReg = myKNeighborsReg.fit(X_new, y).predict(final_test_new)

reg_predictions_final_dr.update({'K Nearest Neighbor: ': y_predict_myKNeighborsReg})

Train using DecisionTreeRegressor



In [None]:
# Decision tree (fixed seed) on the PCA-reduced full training data.
myDecisionTreeReg = DecisionTreeRegressor(random_state=5)
y_predict_myDecisionTreeReg = myDecisionTreeReg.fit(X_new, y).predict(final_test_new)

reg_predictions_final_dr.update({'Decision Tree: ': y_predict_myDecisionTreeReg})

Train using LinearRegression



In [None]:
# Ordinary least squares on the PCA-reduced full training data.
myLinearReg = LinearRegression()
y_predict_myLinearReg = myLinearReg.fit(X_new, y).predict(final_test_new)

reg_predictions_final_dr.update({'Linear Regression: ': y_predict_myLinearReg})

Train using RandomForestRegressor



In [None]:
# Random forest of 9 bootstrapped trees (fixed seed) on the PCA-reduced full
# training data.
myRandomForestReg = RandomForestRegressor(n_estimators=9, bootstrap=True, random_state=3)
y_predict_myRandomForestReg = myRandomForestReg.fit(X_new, y).predict(final_test_new)

reg_predictions_final_dr.update({'Random Forest: ': y_predict_myRandomForestReg})

print(X.shape)

In [None]:
# NOTE(review): `target` holds the labels of the training rows, whereas these
# predictions were made for the rows of `test`. The first len(prediction)
# training labels are compared with predictions for different cards, so this
# number presumably does not measure model accuracy - confirm intent.
for model, y_prediction in reg_predictions_final_dr.items():
    reference = target.iloc[:len(y_prediction)]
    mse = metrics.mean_squared_error(reference, y_prediction)
    rmse = np.sqrt(mse)
    print(model + str(rmse))

# **Applying Advanced Models for Boosting Prediction**

In [None]:
import pandas as pd
import numpy as np
import math
import datetime

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from mlxtend.regressor import StackingCVRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')
sns.set(style='white', context='notebook', palette='deep')

In [None]:
# Load the four competition tables and pass each through reduce_mem_usage.
# NOTE(review): df_train and df_test are loaded again in the next cell (there
# with first_active_month parsed as a date), so these two reads look
# redundant - confirm before removing.
df_train = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/train.csv"))
df_test = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/test.csv"))
# Transaction tables, with purchase_date parsed as datetime on load.
df_historical =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/historical_transactions.csv",parse_dates=['purchase_date']))
df_new =reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/new_merchant_transactions.csv",parse_dates=['purchase_date']))

In [None]:
df_train = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/train.csv",parse_dates=['first_active_month']))
df_test = reduce_mem_usage(pd.read_csv("../input/elo-merchant-category-recommendation/test.csv",parse_dates=['first_active_month']))

In [None]:
# Historical and new transactions data
# Working names for the loaded tables. These are aliases, not copies.
hist_trans, new_trans = df_historical, df_new
train, test = df_train, df_test

# Row counts; train_idx is later used to split the merged frame back apart.
train_idx, test_idx = len(train), len(test)

print("--------------------------")
print("Train shape: ", train.shape)
print("Test shape: ", test.shape)
print("--------------------------")
print("Historical transactions shape: ", hist_trans.shape)
print("New transactions shape: ", new_trans.shape)

# Description

In [None]:
# Column dtypes, non-null counts and memory usage of each table.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which added a stray "None").
separator = "----------------------------------------------------------------"
described_frames = [
    ("Train", train),
    ("Test", test),  # previously reported train.info() under the "Test" heading
    ("Historical transactions", hist_trans),
    ("New transactions", new_trans),
]
for position, (label, frame) in enumerate(described_frames):
    # Blank line between sections, none before the first one.
    print(separator if position == 0 else "\n" + separator)
    print(label)
    print(separator)
    frame.info()

In [None]:
train.head()

In [None]:
test.head()


# Target column


In [None]:
print("Target description:\n\n", train['target'].describe())
print("\n--------------------------------------------------------------------------------------------")
print("\nTarget values:\n\n", train['target'].value_counts())

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,6))
ax1, ax2 = axes.flatten()

# Distribution
sns.distplot(train['target'], ax=ax1, color='Green')

# Sorted correlations with target.
# Restrict to numeric columns explicitly: `train` also holds the string
# 'card_id' and the datetime 'first_active_month', which cannot be correlated.
numeric_train = train.select_dtypes(include=[np.number])
sorted_corrs = numeric_train.corr()['target'].sort_values(ascending=False)
sns.heatmap(numeric_train[sorted_corrs.index].corr(), ax=ax2)

ax1.set_title('Target Distribution')
ax2.set_title('Correlations')
plt.show()
del sorted_corrs
del numeric_train

The target column appears to follow a normal distribution, except that there seem to be 2207 values around -33. Let's take that into account. Also, 'feature_3' correlates more strongly with 'target' than 'feature_2' and 'feature_1' do.

Let's confirm the number of values under -30.

In [None]:
# Count the training rows whose target lies below -30.
is_under_30 = train['target'] < -30
under_30 = train.loc[is_under_30, 'target'].count()
print("Under -30:", under_30, "values.")

# Cleaning

**Missing**

In [None]:
print("MISSING VALUES BEFORE CLEANING\n")
print("--------------------------------------------------\nTrain:\n--------------------------------------------------\n", train.isnull().sum())
print("\n--------------------------------------------------\nTest:\n--------------------------------------------------\n", test.isnull().sum())
print("\n--------------------------------------------------\nHistorical transactions:\n--------------------------------------------------\n", hist_trans.isnull().sum())
print("\n--------------------------------------------------\nNew transactions:\n--------------------------------------------------\n", new_trans.isnull().sum())

# Train/Test

There are no null values in train. However, there seems to be one observation with a missing 'first_active_month' in test.



In [None]:
# Rows of test without a 'first_active_month', and their index labels.
month_is_missing = test['first_active_month'].isnull()
test_missing = test[month_is_missing]
idx_test_missing = test_missing.index
test_missing

We can look for all the observations that match the same 'feature_1', 'feature_2' and 'feature_3' values as that, and replace it with the 'first_active_month' that corresponds to their mode.

In [None]:
# For each row without 'first_active_month', take the most frequent month among
# the test rows sharing its 'feature_1', 'feature_2' and 'feature_3' values.
# The feature values are read from the row itself instead of being hard-coded,
# and the loop is a no-op once nothing is missing, so the cell can be re-run.
match_cols = ['feature_1', 'feature_2', 'feature_3']
for missing_label in idx_test_missing:
    wanted_features = test.loc[missing_label, match_cols]
    same_feature_rows = test[test[match_cols].eq(wanted_features, axis='columns').all(axis=1)]
    # mode() ignores missing values, so the incomplete row does not vote.
    test.loc[missing_label, 'first_active_month'] = same_feature_rows['first_active_month'].mode()[0]

# Show the repaired row(s).
test.loc[idx_test_missing]

Historical transactions


Similarly, in this case let's drop the missing rows for 'category_3' and 'merchant_id' (less than 1% of total).



In [None]:
hist_trans.dropna(subset=['category_3', 'merchant_id'], inplace=True)


For 'category_2', since it has about 10% of missing values, let's replace them with the rounded average value (since values for this column include [1.0, 2.0, 3.0, 4.0, 5.0]), as seen below.

In [None]:
hist_trans['category_2'].describe()

In [None]:
print(hist_trans[hist_trans['category_2'].isnull()])

In [None]:
hist_trans['category_2'] = pd.to_numeric(hist_trans['category_2'], errors='coerce')

In [None]:
hist_trans = hist_trans.dropna(subset=['category_2'])

In [None]:
hist_trans['category_2'] = hist_trans['category_2'].astype(int)

In [None]:
# Fill missing 'category_2' values with the floored column mean.
# Assigned back explicitly: calling fillna(inplace=True) on a column selection
# is not guaranteed to update the frame itself.
# NOTE(review): rows with a missing 'category_2' were already dropped two cells
# above (dropna(subset=['category_2'])), so this presumably changes nothing -
# confirm whether dropping or filling is intended.
hist_trans['category_2'] = hist_trans['category_2'].fillna(math.floor(hist_trans['category_2'].mean()))

# New transactions
Once more, since there are missing values in 'category_3', 'merchant_id', 'category_2' and they add up to no more than about 5%, let's drop the corresponding rows.

In [None]:
new_trans.dropna(inplace=True)

Lastly, let's confirm no null values are present after cleaning.


In [None]:
print("MISSING VALUES AFTER CLEANING\n")
print("--------------------------------------------------\nTrain:\n--------------------------------------------------\n", train.isnull().sum())
print("\n--------------------------------------------------\nTest:\n--------------------------------------------------\n", test.isnull().sum())
#print("\n--------------------------------------------------\nMerchant:\n--------------------------------------------------\n", merchants.isnull().sum())
print("\n--------------------------------------------------\nHistorical transactions:\n--------------------------------------------------\n", hist_trans.isnull().sum())
print("\n--------------------------------------------------\nNew transactions:\n--------------------------------------------------\n", new_trans.isnull().sum())

# Feature engineering

In [None]:
# Merge train and test for data processing
# Stack train on top of test so both go through the same feature processing;
# the first len(train) rows of `data` are the training rows.
data = pd.concat([train, test], ignore_index=True)

# Check shapes match
n_train_rows, n_test_rows = train.shape[0], test.shape[0]
print("Train ({}) + Test ({}) observations: {}".format(n_train_rows, n_test_rows, n_train_rows + n_test_rows))
print("Merged shape:", data.shape)

del train, test

# Train/Test

In [None]:
# Year and month, separately
# Year and month of first activity, separately
data['year'] = data['first_active_month'].dt.year
data['month'] = data['first_active_month'].dt.month

# Elapsed time in days until 2018-02-01 (taken as the latest date of the
# dataset). Subtract on datetime64 values directly: going through `.dt.date`
# yields an object-dtype Series on which `.dt.days` is not reliable.
data['elapsed_time'] = (pd.Timestamp(2018, 2, 1) - data['first_active_month']).dt.days

# Categorical features 'feature_1', 'feature_2' and 'feature_3': replace each
# with its one-hot columns, prefixed f1_, f2_ and f3_ respectively.
for cont, col in enumerate(['feature_1', 'feature_2', 'feature_3'], start=1):
    dummy_col = pd.get_dummies(data[col], prefix='f{}'.format(cont))
    data = pd.concat([data, dummy_col], axis=1)
    data.drop(col, axis=1, inplace=True)

data.head()

# Historical + New transactions
Let's create a column called 'new' on 'hist_trans' and 'new_trans' such that, before concatenating them, they have the age reference:

* 1: New
* 0: Historical

In [None]:
# Tag the origin of every row before stacking: 1 = new merchant, 0 = historical.
hist_trans['new'] = 0
new_trans['new'] = 1

# New transactions first, historical ones after; original index labels are kept.
trans_data = pd.concat([new_trans, hist_trans])

del new_trans, hist_trans

More preprocessing: 'category_1', 'category_2' and 'category_3'.



In [None]:
# Change Yes/No for 0/1 in 'authorized_flag' and 'category_1'
# 'authorized_flag' and 'category_1' are Y/N strings: encode them as 1/0.
yes_no_dict = {'Y':1, 'N':0}
for flag_col in ('authorized_flag', 'category_1'):
    trans_data[flag_col] = trans_data[flag_col].map(yes_no_dict)

# Replace 'category_2' and 'category_3' by one-hot columns: 'category_2' keeps
# its name as prefix, 'category_3' (A/B/C) uses the prefix 'cat3'.
for source_col, dummy_prefix in (('category_2', 'category_2'), ('category_3', 'cat3')):
    dummy_col = pd.get_dummies(trans_data[source_col], prefix=dummy_prefix)
    trans_data = pd.concat([trans_data, dummy_col], axis=1)
    trans_data.drop(source_col, axis=1, inplace=True)

trans_data.head()

Aggregate function
Aggregate function, grouped by 'card_id': min, max, mean, median, std, sum, nunique, range.

Added:
* Count on 'installments' and 'purchase_amount'.
* Mode() on 'new' column (previously created).
* Mean on new trans_data's category_2 dummy columns.
* Mean on trans_data's category_4.
* Mean on 'cat3_A', 'cat3_B' and 'cat3_C' (old 'category_3').
* Mean on merchants' new dummy columns.

In [None]:
def aggregate_historical_transactions(trans_data):
    """Aggregate the combined transaction rows into one row per ``card_id``.

    Parameters
    ----------
    trans_data : pandas.DataFrame
        Transaction-level frame containing ``card_id``, a datetime
        ``purchase_date`` and every column named in ``agg_func`` below.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id`` with ``transactions_count`` followed by the
        aggregated columns, named ``<column>_<aggregation>``. The mode of
        ``new`` ends up in ``new_<lambda>``.
    """
    # Work on a shallow copy: only the 'purchase_date' column is replaced, and
    # only on the copy. Overwriting it on the caller's frame would make a
    # second call reinterpret the float seconds as nanosecond timestamps.
    trans_data = trans_data.copy(deep=False)
    # Seconds since the Unix epoch, as float.
    trans_data['purchase_date'] = pd.DatetimeIndex(trans_data['purchase_date']).astype(np.int64)*1e-9

    agg_func = {
        'authorized_flag': ['sum', 'mean'],
        'category_1': ['mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'cat3_A': ['mean'],
        'cat3_B': ['mean'],
        'cat3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['count', 'sum', 'median', 'max', 'min', 'std'],
        'installments': ['count', 'sum', 'median', 'max', 'min', 'std'],
        # np.ptp = max - min, i.e. the time span covered by the card's purchases.
        'purchase_date': [np.ptp],
        'month_lag': ['min', 'max'],
        # Mode: value_counts() sorts by frequency, so index[0] is the most common value.
        'new':[lambda x:x.value_counts().index[0]]
        }

    agg_history = trans_data.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, aggregation) MultiIndex into 'column_aggregation'.
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)

    # Number of transactions per card.
    df = (trans_data.groupby('card_id').size().reset_index(name='transactions_count'))

    agg_history = pd.merge(df, agg_history, on='card_id', how='left')

    return agg_history

trans_data = aggregate_historical_transactions(trans_data)
trans_data.head()

# Data preparation

# Merge

In [None]:
# Merge data (train + test) with trans_data (historical + new transactions)
processed_data = pd.merge(data, trans_data, on='card_id', how='left')
del data
del trans_data
print(processed_data.shape)
processed_data.head()

# Final Train/Test


In [None]:
# Train and Test
# Split the merged frame back into train (first train_idx rows) and test.
# Explicit copies: the frames are modified below, and writing into slices of
# `processed_data` is not guaranteed to take effect.
train = processed_data[:train_idx].copy()
test = processed_data[train_idx:].copy()

del processed_data

# There are some nan values after feature eng in 'purchase_amount_std' and 'installments_std'
cols = ['purchase_amount_std', 'installments_std']

for col in cols:
    # Fill with each frame's most frequent value (value_counts() sorts by
    # frequency). Assigned back explicitly instead of a chained inplace fillna.
    train[col] = train[col].fillna(train[col].value_counts().index[0])
    test[col] = test[col].fillna(test[col].value_counts().index[0])

target = train['target']

# Columns that must not be fed to the models: the label, the id and the raw date.
cols_2_remove = ['target', 'card_id', 'first_active_month']
train = train.drop(columns=cols_2_remove)
test = test.drop(columns=cols_2_remove)

# Check on shapes
print("--------------------------")
print("Train shape: ", train.shape)
print("Test shape: ", test.shape)
print("--------------------------")

# Modeling and Testing

# **LightGBM**

In [None]:
# LightGBM parameters passed to lgb.train for every fold.
lgb_params = {'num_leaves': 50,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1}

# 5-fold cross-validation, shuffled with a fixed seed.
FOLDs = KFold(n_splits=5, shuffle=True, random_state=1989)

# oof_lgb: out-of-fold prediction for every training row.
# predictions_lgb: test predictions, averaged over the folds.
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

features_lgb = list(train.columns)
feature_importance_df_lgb = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train)):
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx])

    print("LGB " + str(fold_) + "-" * 50)
    num_round = 2000
    # NOTE(review): early_stopping_rounds equals num_round (both 2000), so
    # early stopping presumably can never cut training short - confirm intent.
    clf = lgb.train(lgb_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 2000)
    # Predict only the rows held out in this fold.
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    # Per-fold feature importances, appended to the running table.
    fold_importance_df_lgb = pd.DataFrame()
    fold_importance_df_lgb["feature"] = features_lgb
    fold_importance_df_lgb["importance"] = clf.feature_importance()
    fold_importance_df_lgb["fold"] = fold_ + 1
    feature_importance_df_lgb = pd.concat([feature_importance_df_lgb, fold_importance_df_lgb], axis=0)
    # Each fold contributes 1/n_splits of the final test prediction.
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / FOLDs.n_splits
    

# Drop the per-fold temporaries.
del fold_importance_df_lgb
del trn_data
del val_data

# RMSE of the out-of-fold predictions against the training target.
print(np.sqrt(mean_squared_error(oof_lgb, target)))

# Xgboost

In [None]:
import xgboost as xgb

In [None]:
# Rename the aggregated mode column 'new_<lambda>' to 'new_mode'.
# `index=str` additionally converts every index label to a string.
train.rename(index=str, columns={"new_<lambda>": "new_mode"}, inplace=True)
test.rename(index=str, columns={"new_<lambda>": "new_mode"}, inplace=True)

# NOTE(review): 'eta' (learning rate) is 1 here, while the XGBRegressor defined
# later in this notebook uses eta=0.001 - confirm which value is intended.
xgb_params = { 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.8, 
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True, 'eta':1}

# 5-fold cross-validation with the same seed as the LightGBM run above.
FOLDs = KFold(n_splits=5, shuffle=True, random_state=1989)

# oof_xgb: out-of-fold prediction for every training row.
# predictions_xgb: test predictions, averaged over the folds.
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train)):
    trn_data = xgb.DMatrix(data=train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=train.iloc[val_idx], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("xgb " + str(fold_) + "-" * 50)
    num_round = 2000
    xgb_model = xgb.train(xgb_params, trn_data, num_round, watchlist, early_stopping_rounds=100, verbose_eval=200)
    # NOTE(review): predictions use 50 trees more than best_ntree_limit -
    # TODO confirm this offset is intended.
    oof_xgb[val_idx] = xgb_model.predict(xgb.DMatrix(train.iloc[val_idx]), ntree_limit=xgb_model.best_ntree_limit+50)

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions_xgb += xgb_model.predict(xgb.DMatrix(test), ntree_limit=xgb_model.best_ntree_limit+50) / FOLDs.n_splits

# Drop the per-fold temporaries.
del trn_data
del val_data
del watchlist

# RMSE of the out-of-fold predictions against the training target (cell output).
np.sqrt(mean_squared_error(oof_xgb, target))

# Summary of results


In [None]:
print("-----------------\nScores on train\n-----------------")
print('lgb:', np.sqrt(mean_squared_error(oof_lgb, target)))
print('xgb:', np.sqrt(mean_squared_error(oof_xgb, target)))

total_sum = 0.5*oof_lgb + 0.5*oof_xgb

print("CV score: {:<8.5f}".format(mean_squared_error(total_sum, target)**0.5))

# Feature importance


In [None]:
cols = (feature_importance_df_lgb[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

best_features = feature_importance_df_lgb.loc[feature_importance_df_lgb.feature.isin(cols)]

plt.figure(figsize=(14,14))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')
del feature_importance_df_lgb

# Ensembled model: averaged and stacked
The following tests the individual performance of the lgbm, xgb, catboost, random forest, decision tree, knn, ridge and lasso models, and compares them with the averaged and stacked models.

# Model definition


In [None]:
# Model definition
train_y = target

# scikit-learn wrappers for LightGBM and XGBoost.
# NOTE(review): these do not use the same hyper-parameters as the lgb_params /
# xgb_params dictionaries of the K-fold loops above (e.g. learning rate 0.001
# and max_depth 7 here) - confirm whether they are meant to match.
lgbm_model = LGBMRegressor(
                objective="regression", metric="rmse", 
                max_depth=7, min_child_samples=20, 
                reg_alpha= 1, reg_lambda=1,
                num_leaves=64, learning_rate=0.001, 
                subsample=0.8, colsample_bytree=0.8, 
                verbosity=-1
)

xgb_model = XGBRegressor(
                eta=0.001, max_depth=7, 
                subsample=0.8, colsample_bytree=0.8, 
                objective='reg:linear', eval_metric='rmse', 
                silent=True
)


# Test catboost, random forest, decision tree, knn, ridge and lasso models individual performance, for averaged and stacked model
# NOTE(review): rf_model and tree_model set no random_state, so their scores
# presumably vary from run to run.
catboost_model = CatBoostRegressor(iterations=150)
rf_model = RandomForestRegressor(n_estimators=25, min_samples_leaf=25, min_samples_split=25)
tree_model = DecisionTreeRegressor(min_samples_leaf=25, min_samples_split=25)
knn_model = KNeighborsRegressor(n_neighbors=25, weights='distance')
ridge_model = Ridge(alpha=75.0)
lasso_model = Lasso(alpha=0.75)

# ------------------------------------------------------------------------------------------------
# Average regressor
class AveragingRegressor(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble that predicts the unweighted mean of its members' predictions.

    Parameters
    ----------
    regressors : list
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. They are fitted
        in place by ``fit``.

    Attributes
    ----------
    predictions : numpy.ndarray or None
        Column-stacked member predictions from the most recent ``predict``
        call (one column per member); ``None`` before the first call.
    """

    def __init__(self, regressors):
        self.regressors = regressors
        self.predictions = None

    def fit(self, X, y):
        """Fit every member on the same ``X`` and ``y``; return ``self``."""
        for member in self.regressors:
            member.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of all members' predictions for ``X``."""
        member_outputs = [member.predict(X) for member in self.regressors]
        self.predictions = np.column_stack(member_outputs)
        return self.predictions.mean(axis=1)
    
# Averaged & stacked models 
averaged_model = AveragingRegressor([catboost_model, xgb_model, rf_model, lgbm_model])


stacked_model = StackingCVRegressor(
    regressors=[catboost_model, xgb_model, rf_model, lgbm_model],
    meta_regressor=Ridge()
)

# Test performance
def rmse_fun(predicted, actual):
    """Root-mean-squared error between two equally shaped arrays.

    The result does not depend on which argument holds the predictions,
    because the difference is squared.
    """
    residual = predicted - actual
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

rmse = make_scorer(rmse_fun, greater_is_better=False)

models = [
     ('CatBoost', catboost_model),
     ('XGBoost', xgb_model),
     ('LightGBM', lgbm_model),
     ('DecisionTree', tree_model),
     ('RandomForest', rf_model),
     ('Ridge', ridge_model),
     ('Lasso', lasso_model),
     ('KNN', knn_model),
     ('Averaged', averaged_model),
     ('Stacked', stacked_model),
]


scores = [
    -1.0 * cross_val_score(model, train.values, train_y.values, scoring=rmse).mean()
    for _,model in models
]

In [None]:
dataz = pd.DataFrame({ 'Model': [name for name, _ in models], 'Error (RMSE)': scores })
dataz.plot(x='Model', kind='bar')
plt.savefig('stacked_scores.png')

# Results

In [None]:
dataz

# Predictions

In [None]:
# Stacked model predictions (best score)
stacked_model.fit(train.values, target.values)    
predictions_stacked = stacked_model.predict(test.values)

Submission

In [None]:
# LightGBM/Xgboost
# The sample submission provides the card_id column and row order expected by
# the competition. It is read once, from the same competition folder as the
# other input files, and copied for each submission.
sample_submission = pd.read_csv('../input/elo-merchant-category-recommendation/sample_submission.csv')

# LightGBM/Xgboost: equal-weight blend of the two fold-averaged predictions.
sub_df = sample_submission.copy()
sub_df["target"] = 0.5 * predictions_lgb + 0.5 * predictions_xgb
sub_df.to_csv("submission_lgbxgboost.csv", index=False)

# Stacked
sub_df = sample_submission.copy()
sub_df["target"] = predictions_stacked
sub_df.to_csv("submission_stacked.csv", index=False)