In [1]:
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, precision_score, precision_recall_curve, recall_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from scipy.stats.mstats import winsorize

In [2]:
# Load the four raw tables (customers, engagements, marketing, transactions)
# from the delinai/schulich_ds1_2024 GitHub repository, in that order.
customer_final, engagement_final, marketing_final, transactions_final = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv',
        'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv',
        'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv',
        'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv',
    )
)

In [3]:
# Discard customers whose gender or age is missing; the drop happens in place
# on customer_final.
required_demographics = ['gender', 'age']
customer_final.dropna(subset=required_demographics, inplace=True)

In [4]:
# Parse every date column to datetime64 so that date arithmetic works later.
for frame, date_columns in (
    (customer_final, ['join_date', 'last_purchase_date']),
    (transactions_final, ['transaction_date']),
    (marketing_final, ['campaign_date']),
):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column])

In [5]:
# CLV table: one row per customer, with `clv` defined as the sum of all of
# that customer's transaction amounts.
clv_final = (
    transactions_final
    .groupby('customer_id')['transaction_amount']
    .sum()
    .reset_index()
    .rename(columns={'transaction_amount': 'clv'})
)


In [6]:
# Per-customer transaction summary: number of transactions and total spend.
# The output columns keep their source names (transaction_id holds the count).
transaction_summaries = {'transaction_id': 'count', 'transaction_amount': 'sum'}
transactions_agg = transactions_final.groupby('customer_id').agg(transaction_summaries)

In [7]:
# Count the marketing rows recorded for each customer; the result is a
# one-column frame indexed by customer_id.
campaigns_per_customer = marketing_final.groupby('customer_id').size()
marketing_agg = campaigns_per_customer.to_frame(name='number_of_campaigns')

In [8]:
# Key both customer-level tables by customer_id so they can be joined on the
# index. The guard keeps the cell safe to re-run: once the column has become
# the index, calling set_index('customer_id') again would raise a KeyError.
for customer_table in (engagement_final, customer_final):
    if customer_table.index.name != 'customer_id':
        customer_table.set_index('customer_id', inplace=True)

In [9]:
# Key the CLV table by customer_id so it can be joined on the index. Guarded
# so that re-running the cell is a no-op rather than a KeyError once
# customer_id is already the index.
if clv_final.index.name != 'customer_id':
    clv_final.set_index('customer_id', inplace=True)

In [10]:
# Build the customer-level modelling table: start from the cleaned customer
# records and left-join (DataFrame.join's default) each table on the index.
joint_dataa = customer_final
for right_table in (engagement_final, transactions_agg, marketing_agg, clv_final):
    joint_dataa = joint_dataa.join(right_table)

In [11]:
# Customer lifetime = whole days between the join date and the last purchase.
lifetime_span = joint_dataa['last_purchase_date'] - joint_dataa['join_date']
joint_dataa['customer_lifetime'] = lifetime_span.dt.days

In [12]:
# Preview the joined customer-level table (rendered as the cell output).
joint_dataa

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,number_of_campaigns,clv,customer_lifetime
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,4,3509.48,118
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,4,7874.68,243
6,2022-01-01,2023-02-10,55.0,Female,North Richardfort,74,59,7,17,10254.44,3,10254.44,405
8,2023-02-09,2023-08-09,68.0,Male,Marquezton,60,22,7,15,11710.62,3,11710.62,181
9,2022-01-24,2023-08-15,68.0,Female,West Franciscobury,49,43,21,22,10382.96,2,10382.96,568
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,119,47,16,10,5498.20,3,5498.20,240
9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,3,33,14,12,5848.30,1,5848.30,200
9998,2023-09-17,2024-01-30,39.0,Male,New John,53,17,5,3,3503.13,1,3503.13,135
9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,23,5,4,12,6721.86,2,6721.86,66


Note: `transaction_amount` and `clv` are duplicate columns; both hold each customer's total transaction amount.

In [18]:
# CLV per day of customer lifetime.
clv_per_day = joint_dataa['clv'] / joint_dataa['customer_lifetime']

# Winsorize the ratio: the lowest and highest 1% of values are clipped to the
# nearest retained value, which tames extreme ratios (a zero-day lifetime
# would produce an infinite ratio).
# NOTE(review): winsorize runs with its default NaN handling here; confirm the
# ratio contains no NaN (e.g. customers without any transactions).
joint_dataa['normalized_clv'] = winsorize(clv_per_day, limits=[0.01, 0.01])


In [19]:
# Sanity check: customers whose lifetime is exactly zero days.
has_zero_lifetime = joint_dataa['customer_lifetime'] == 0
zero_lifetime_customers = joint_dataa.loc[has_zero_lifetime]

# Show how many such customers there are
zero_lifetime_count = len(zero_lifetime_customers)
zero_lifetime_count

0

In [20]:
# Initial feature selection: the response column and the candidate predictors.
# NOTE: `features` is reassigned further down, after 'gender' has been
# one-hot encoded into 'gender_Male'.
target = 'normalized_clv'
features = [
    'age',
    'gender',
    'number_of_site_visits',
    'number_of_emails_opened',
    'number_of_clicks',
    'number_of_campaigns',
    'transaction_id',
]



In [16]:
# One-hot encode the 'gender' column, dropping the first category so the
# indicators are not redundant. Guarded so that re-running the cell is a no-op
# instead of a KeyError once 'gender' has already been replaced by its dummy
# column(s).
if 'gender' in joint_dataa.columns:
    joint_dataa = pd.get_dummies(joint_dataa, columns=['gender'], drop_first=True)

In [17]:
# get_dummies may emit a boolean indicator; store it as 0/1 integers instead.
male_indicator = joint_dataa['gender_Male'].astype(int)
joint_dataa['gender_Male'] = male_indicator

In [21]:
# Final feature selection: the raw 'gender' column is replaced by its
# one-hot indicator 'gender_Male'.
target = 'normalized_clv'
features = [
    'age',
    'number_of_site_visits',
    'number_of_emails_opened',
    'number_of_clicks',
    'number_of_campaigns',
    'transaction_id',
    'gender_Male',
]

In [22]:
# Scale the model inputs to [0, 1] without leaking hold-out information.
# The scaler's min/max are learned only from the rows that the later
# train_test_split(X, y, test_size=0.3, random_state=42) call assigns to the
# training set. Splitting row positions with the same arguments reproduces
# that partition, because the shuffle depends only on the row count and seed.
train_positions, holdout_positions = train_test_split(
    np.arange(len(joint_dataa)), test_size=0.3, random_state=42
)
scaler = MinMaxScaler()
scaler.fit(joint_dataa[features].iloc[train_positions])

# Every row is transformed with the training-set statistics, so hold-out rows
# may land slightly outside [0, 1].
joint_dataa[features] = scaler.transform(joint_dataa[features])

In [23]:
# Separate the modelling table into the predictor matrix X and the response y.
X, y = joint_dataa[features], joint_dataa[target]

In [24]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [25]:
# Fit a linear regression on the training rows; fit() returns the estimator
# itself, which is then displayed as the cell output.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model