In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [29]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [3]:
# Load the raw order history and the sample submission file from the working directory.
df = pd.read_csv('train.csv')
sub = pd.read_csv('sample_submission.csv')
# Split date for the order history: orders strictly before it go to df_train,
# orders on or after it go to df_test.
cuttoff_date = '2020-03-05'

In [4]:
# Collapse (user_id, cart) into a single composite "user_id;cart" key and
# drop the two source columns.
df = (
    df
    .assign(id=df['user_id'].astype(str) + ";" + df["cart"].astype(str))
    .drop(columns=['user_id', 'cart'])
)
# Parse the order timestamp so it can be compared against the cutoff date.
df['order_completed_at'] = pd.to_datetime(df['order_completed_at'])

# Time-based split: rows strictly before the cutoff form df_train,
# rows on or after the cutoff form df_test.
df_train = df.loc[df['order_completed_at'] < cuttoff_date].reset_index(drop=True)
df_test = df.loc[df['order_completed_at'] >= cuttoff_date].reset_index(drop=True)

In [5]:
# One row per distinct id present in df_train; per-id features are merged
# onto this frame in the following cells.
clients = pd.DataFrame({'id': df_train['id'].unique()})

In [6]:
# Last purchase date per id within df_train.
tx_max_purchase = (
    df_train.groupby('id')['order_completed_at'].max()
    .reset_index(name='MaxPurchaseDate')
)
# First purchase date per id, with the last purchase date attached alongside.
tx_min_purchase = (
    df_train.groupby('id')['order_completed_at'].min()
    .reset_index(name='MinPurchaseDate')
    .merge(tx_max_purchase, on='id', how='left')
)
# Recency: whole days between the latest purchase seen in df_train (over all ids)
# and this id's own last purchase.
tx_max_purchase['Recency'] = (
    tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']
).dt.days
clients = clients.merge(tx_max_purchase[['id', 'Recency']], on='id')

In [7]:
# Whole days between each id's first and last purchase inside df_train.
# NOTE(review): despite the column name, this is not a gap to a *future* purchase;
# df_test is not used here. Confirm this is the intended prediction target.
tx_min_purchase['NextPurchaseDay'] = (
    tx_min_purchase['MaxPurchaseDate'] - tx_min_purchase['MinPurchaseDate']
).dt.days
clients = clients.merge(tx_min_purchase[['id', 'NextPurchaseDay']], on='id')

In [8]:
# Frequency: number of non-null order timestamps per id in df_train.
tx_frequency = (
    df_train.groupby('id')['order_completed_at'].count()
    .reset_index(name='Frequency')
)
clients = clients.merge(tx_frequency, on='id')

In [9]:
# Function order_cluster
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel cluster ids so that they are ordered by the cluster's mean target value.

    Parameters
    ----------
    cluster_field_name : str
        Column in ``df`` holding the current (arbitrary) cluster labels.
    target_field_name : str
        Column in ``df`` whose per-cluster mean decides the new ordering.
    df : pandas.DataFrame
        Frame containing both columns. It is not modified.
    ascending : bool
        If True, the cluster with the smallest mean gets label 0;
        if False, the cluster with the largest mean gets label 0.

    Returns
    -------
    pandas.DataFrame
        A new frame (produced by a merge) in which ``cluster_field_name`` holds
        the rank 0..k-1 of each row's cluster. The relabelled column is placed last.
    """
    # Mean target per cluster, sorted as requested; after the second
    # reset_index the positional index is the cluster's rank.
    ranked = (
        df.groupby(cluster_field_name)[target_field_name]
        .mean()
        .reset_index()
        .sort_values(by=target_field_name, ascending=ascending)
        .reset_index(drop=True)
    )
    ranked['index'] = ranked.index

    # Attach the rank to every row, then swap it in for the old label column.
    relabelled = df.merge(ranked[[cluster_field_name, 'index']], on=cluster_field_name)
    return (
        relabelled
        .drop([cluster_field_name], axis=1)
        .rename(columns={"index": cluster_field_name})
    )

In [10]:
# Cluster clients into 4 groups by Recency.
# random_state pins the centroid initialisation so a re-run reproduces the same
# cluster assignments (44 matches the seed used for the train/test split).
# n_init is set explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=44)
kmeans.fit(clients[['Recency']])
clients['RecencyCluster'] = kmeans.predict(clients[['Recency']])
# ascending=False: label 0 is the cluster with the highest mean Recency,
# the highest label is the cluster with the lowest mean Recency.
clients = order_cluster('RecencyCluster', 'Recency', clients, False)

In [11]:
# Cluster clients into 4 groups by Frequency.
# random_state pins the centroid initialisation so a re-run reproduces the same
# cluster assignments (44 matches the seed used for the train/test split).
# n_init is set explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=44)
kmeans.fit(clients[['Frequency']])
clients['FrequencyCluster'] = kmeans.predict(clients[['Frequency']])
# ascending=True: label 0 is the cluster with the lowest mean Frequency,
# the highest label is the cluster with the highest mean Frequency.
clients = order_cluster('FrequencyCluster', 'Frequency', clients, True)

In [12]:
# Overall score: sum of the two ordered cluster ranks.
clients['OverallScore'] = clients['RecencyCluster'] + clients['FrequencyCluster']

# Name the segments from the score. Rules are applied in order, so a later
# rule overrides an earlier one: default Low, >2 Mid, >4 High.
clients['Segment'] = 'Low-Value'
clients.loc[clients['OverallScore'].gt(2), 'Segment'] = 'Mid-Value'
clients.loc[clients['OverallScore'].gt(4), 'Segment'] = 'High-Value'

In [13]:
# Work on an id-indexed copy so the class label is added without touching `clients`.
tx_class = clients.copy().set_index('id')

# Bucket NextPurchaseDay into three classes; later rules override earlier ones:
#   3 -> up to 180 days (default), 2 -> more than 180 days, 1 -> more than 365 days
tx_class['NextPurchaseDayRange'] = 3
tx_class.loc[tx_class['NextPurchaseDay'].gt(180), 'NextPurchaseDayRange'] = 2
tx_class.loc[tx_class['NextPurchaseDay'].gt(365), 'NextPurchaseDayRange'] = 1

In [14]:
# Drop the column the label was derived from and the string-valued Segment
# column, leaving the feature columns plus the NextPurchaseDayRange label.
tx_class = tx_class.drop(columns=['NextPurchaseDay', 'Segment'])

# Features / label, then a seeded 80/20 train/test split.
X = tx_class.drop(columns='NextPurchaseDayRange')
y = tx_class['NextPurchaseDayRange']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

In [15]:
# Fit an XGBoost classifier with the library's default hyperparameters.
# NOTE(review): the labels are 1..3; newer XGBoost releases appear to require
# class labels starting at 0 - confirm against the installed version.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)





In [24]:
# Predicted NextPurchaseDayRange class for each row of the held-out test split.
y_pred = xgb_model.predict(X_test)

In [31]:
# Per-class precision / recall / F1 and support on the held-out 20% split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.60      0.15      0.25      1406
           2       0.49      0.06      0.10      3355
           3       0.94      1.00      0.97     70736

    accuracy                           0.94     75497
   macro avg       0.68      0.40      0.44     75497
weighted avg       0.92      0.94      0.92     75497



[3 3 3 ... 3 3 3]
