In [1]:
import numpy as np
from datetime import datetime
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, precision_score, precision_recall_curve, recall_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer

In [2]:
#Import the data 
# read all tables 
customer_final = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
engagement_final = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing_final = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions_final = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')

In [3]:
# deal with gender null 
# deal with age null
customer_final.dropna(subset=['gender', 'age'], inplace=True)

In [4]:
customer_final['join_date'] = pd.to_datetime(customer_final['join_date'])
customer_final['last_purchase_date'] = pd.to_datetime(customer_final['last_purchase_date'])
transactions_final['transaction_date'] = pd.to_datetime(transactions_final['transaction_date'])
marketing_final['campaign_date'] = pd.to_datetime(marketing_final['campaign_date'])

feature engineering 

In [5]:
clv_final = transactions_final.groupby('customer_id')['transaction_amount'].sum().reset_index()
clv_final.rename(columns={'transaction_amount': 'clv'}, inplace=True)

In [6]:
clv_final

Unnamed: 0,customer_id,clv
0,1,3509.48
1,2,6081.32
2,3,1454.87
3,4,7874.68
4,5,15524.55
...,...,...
9995,9996,5498.20
9996,9997,5848.30
9997,9998,3503.13
9998,9999,6721.86


In [7]:
customer_final['customer_lifetime'] = (customer_final['last_purchase_date'] - customer_final['join_date']).dt.days

In [8]:
customer_final

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,customer_lifetime
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,118
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,243
5,6,2022-01-01,2023-02-10,55.0,Female,North Richardfort,405
7,8,2023-02-09,2023-08-09,68.0,Male,Marquezton,181
8,9,2022-01-24,2023-08-15,68.0,Female,West Franciscobury,568
...,...,...,...,...,...,...,...
9995,9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,240
9996,9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,200
9997,9998,2023-09-17,2024-01-30,39.0,Male,New John,135
9998,9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,66


In [16]:
transactions_aggr = transactions_final.groupby('customer_id').aggregate({'transaction_id':'count','transaction_amount':'sum'})

In [17]:
transactions_aggr

Unnamed: 0_level_0,transaction_id,transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3509.48
2,9,6081.32
3,6,1454.87
4,20,7874.68
5,24,15524.55
...,...,...
9996,10,5498.20
9997,12,5848.30
9998,3,3503.13
9999,12,6721.86


In [22]:

engagement_final.set_index('customer_id', inplace=True)
marketing_agg = marketing_final.groupby('customer_id').size().reset_index(name='number_of_campaigns')
marketing_agg.set_index('customer_id', inplace=True)


In [25]:
#join 
joined_df = customer_final.join(clv_final).join(transactions_aggr).join(engagement_final).join(marketing_agg)


In [26]:
joined_df

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,customer_lifetime,clv,transaction_id,transaction_amount,number_of_site_visits,number_of_emails_opened,number_of_clicks,number_of_campaigns
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,118,3509.48,6,3509.48,10,15,1,4
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,243,7874.68,20,7874.68,110,30,17,4
6,2022-01-01,2023-02-10,55.0,Female,North Richardfort,405,10254.44,17,10254.44,74,59,7,3
8,2023-02-09,2023-08-09,68.0,Male,Marquezton,181,11710.62,15,11710.62,60,22,7,3
9,2022-01-24,2023-08-15,68.0,Female,West Franciscobury,568,10382.96,22,10382.96,49,43,21,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,240,5498.20,10,5498.20,119,47,16,3
9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,200,5848.30,12,5848.30,3,33,14,1
9998,2023-09-17,2024-01-30,39.0,Male,New John,135,3503.13,3,3503.13,53,17,5,1
9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,66,6721.86,12,6721.86,23,5,4,2


In [28]:
print(joined_df.isnull().sum())

join_date                  0
last_purchase_date         0
age                        0
gender                     0
location                   0
customer_lifetime          0
clv                        0
transaction_id             0
transaction_amount         0
number_of_site_visits      0
number_of_emails_opened    0
number_of_clicks           0
number_of_campaigns        0
dtype: int64


In [29]:
joined_df['normalized_clv'] = joined_df['clv'] / joined_df['customer_lifetime']


In [30]:
joined_df

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,customer_lifetime,clv,transaction_id,transaction_amount,number_of_site_visits,number_of_emails_opened,number_of_clicks,number_of_campaigns,normalized_clv
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,118,3509.48,6,3509.48,10,15,1,4,29.741356
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,243,7874.68,20,7874.68,110,30,17,4,32.406091
6,2022-01-01,2023-02-10,55.0,Female,North Richardfort,405,10254.44,17,10254.44,74,59,7,3,25.319605
8,2023-02-09,2023-08-09,68.0,Male,Marquezton,181,11710.62,15,11710.62,60,22,7,3,64.699558
9,2022-01-24,2023-08-15,68.0,Female,West Franciscobury,568,10382.96,22,10382.96,49,43,21,2,18.279859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,240,5498.20,10,5498.20,119,47,16,3,22.909167
9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,200,5848.30,12,5848.30,3,33,14,1,29.241500
9998,2023-09-17,2024-01-30,39.0,Male,New John,135,3503.13,3,3503.13,53,17,5,1,25.949111
9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,66,6721.86,12,6721.86,23,5,4,2,101.846364


don't need clv datafram can just chanfe transaction amount name  to clv

In [53]:
x = joined_df['age', 'gender', 'number_of_site_visits', 'number_of_emails_opened',
            'number_of_clicks', 'number_of_campaigns', 'transaction_id',]
y = ['normalized_clv']




KeyError: ('age', 'gender', 'number_of_site_visits', 'number_of_emails_opened', 'number_of_clicks', 'number_of_campaigns', 'transaction_id')