In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import colors as mcolors
import plotly.graph_objects as go
from scipy.stats import linregress
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.metrics import silhouette_samples
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans
from tabulate import tabulate
import plotly.figure_factory as ff
from collections import Counter
%matplotlib inline

In [4]:
# load the raw campaign data; the file is tab-separated, hence sep='\t'
df = pd.read_csv('data/marketing_campaign.csv', sep='\t')

FileNotFoundError: [Errno 2] No such file or directory: 'marketing_campaign.csv'

In [None]:
df.head()

In [None]:
# give three raw columns more descriptive names
df = df.rename(
    columns={
        'Recency': 'Last_Purchase',
        'Dt_Customer': 'Enrollment_Date',
        'Response': 'AcceptedCmp6',
    }
)

In [None]:
# parse the enrollment date strings (day-month-year) into datetime values
df['Enrollment_Date'] = pd.to_datetime(df['Enrollment_Date'], format='%d-%m-%Y')

In [None]:
# remove the Z_CostContact and Z_Revenue columns
df = df.drop(['Z_CostContact', 'Z_Revenue'], axis='columns')

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df

In [None]:
# generating profile report
# profile = ProfileReport(df)

In [None]:
# profile

#### FEATURE ENGINEERING

In [None]:
# age relative to the fixed reference year 2014
# NOTE(review): 2014 is hard-coded — presumably the year the data was collected; confirm
df['Age'] = 2014- df['Year_Birth']

In [None]:
df['Age'] 

In [None]:
df['Age'].describe()

#### Feature Transformations

In [None]:
# encode education as an ordinal level; '2n Cycle' and 'Master' share level 2
df['Education'] = df['Education'].replace(
    {
        'Basic': 0,
        'Graduation': 1,
        '2n Cycle': 2,
        'Master': 2,
        'PhD': 3,
    }
)

In [None]:
# encode marital status as a count: 2 for Married/Together, 1 for every other status
df['Marital_Status'] = df['Marital_Status'].replace(
    {
        'Married': 2,
        'Together': 2,
        'Widow': 1,
        'Divorced': 1,
        'Alone': 1,
        'Absurd': 1,
        'YOLO': 1,
        'Single': 1,
    }
)

#### Feature Extraction

In [None]:
# household size: kids at home + teens at home + the numeric Marital_Status value
df['Members'] = df[['Kidhome', 'Teenhome', 'Marital_Status']].sum(axis=1)

In [None]:
# report the span of enrollment dates present in the data
first_enrollment = df['Enrollment_Date'].min()
last_enrollment = df['Enrollment_Date'].max()
print("Customers' enrollment date starts wih {} and end at {} in the data set.".format(first_enrollment.strftime('%Y-%m-%d'),  last_enrollment.strftime('%Y-%m-%d')))

# days enrolled, measured back from the most recent enrollment date in the data.
# The reference is computed with max() instead of being read from the hard-coded
# row label 200, so the result never goes negative and does not depend on row
# order or on that label existing.
df['Enrollment_Duration'] = (last_enrollment - df['Enrollment_Date']).dt.days

In [None]:
# number of campaigns accepted: row-wise sum of the six acceptance flags
df['Campaign_Acceptance'] = df[
    [
        'AcceptedCmp1',
        'AcceptedCmp2',
        'AcceptedCmp3',
        'AcceptedCmp4',
        'AcceptedCmp5',
        'AcceptedCmp6',
    ]
].sum(axis=1)

In [None]:
# total purchases across all channels (deals, web, catalog, store)
df['Num_of_Transactions'] = df[
    [
        'NumDealsPurchases',
        'NumWebPurchases',
        'NumCatalogPurchases',
        'NumStorePurchases',
    ]
].sum(axis=1)

In [None]:
# total amount spent across all product categories
df['Total_Expenditure'] = df[
    [
        'MntWines',
        'MntFruits',
        'MntMeatProducts',
        'MntFishProducts',
        'MntSweetProducts',
        'MntGoldProds',
    ]
].sum(axis=1)

In [None]:
# average amount spent per transaction, rounded to one decimal place
# (rows with zero transactions divide by zero here and yield inf/NaN)
df['Average_Expenditure'] = (df['Total_Expenditure'] / df['Num_of_Transactions']).round(1)

In [None]:
# average number of enrolled days per transaction, rounded to one decimal place
# (rows with zero transactions divide by zero here and yield inf/NaN)
df['Average_Days'] = (df['Enrollment_Duration'] / df['Num_of_Transactions']).round(1)

In [None]:
# customers who have zero recorded transactions
df[df['Num_of_Transactions'] == 0]

In [None]:
df = df.dropna()

In [None]:
df.shape

In [None]:
# keep only customers with at least one transaction; zero-transaction rows
# would carry division-by-zero results in the per-transaction averages
df = df.loc[df['Num_of_Transactions'] > 0]

In [None]:
# discard rows whose income is 300000 or more (extreme values far from the rest)
df = df.loc[df['Income'] < 300000]

In [None]:
# working frame for modelling: the ID plus the engineered and selected features
data = pd.DataFrame(
    df,
    columns=[
        'ID',
        'Age',
        'Education',
        'Members',
        'Income',
        'Last_Purchase',
        'Enrollment_Duration',
        'Campaign_Acceptance',
        'Average_Days',
        'Average_Expenditure',
        'Num_of_Transactions',
        'Total_Expenditure',
        'Complain',
    ],
)

#### Dealing with Outliers 

In [None]:
# automated outlier detection with an Isolation Forest.
# random_state is fixed so the same rows are flagged on every run; without it the
# set of removed customers (and every result derived from it) changes between runs.
model = IsolationForest(
    n_estimators=150,
    max_samples='auto',
    contamination=0.03,
    max_features=1.0,
    random_state=42,
)

# every column except the first one is used as a feature
outlier_features = data.iloc[:, 1:]

# fit IsolationForest
model.fit(outlier_features)

# anomaly score per row, from decision_function
scores = model.decision_function(outlier_features)

# predicted label per row; rows labelled -1 are treated as outliers downstream
anomaly = model.predict(outlier_features)

# adding the anomalies and score into dataframe
data['scores'] = scores
data['anomaly'] = anomaly

data.head()

In [None]:
# rows flagged as outliers (label -1) and their index labels
anomaly = data[data['anomaly'].eq(-1)]
anomaly_index = anomaly.index.tolist()
print('Total number of outliers is:', len(anomaly_index))

In [None]:
# remove the flagged rows and the two helper columns, then renumber the index from 0
data = (
    data.drop(index=anomaly_index)
    .drop(columns=['scores', 'anomaly'])
    .reset_index(drop=True)
)

In [None]:
data.shape

#### FEATURE SCALING

In [None]:
# work on a copy of the cleaned data
df = data.copy()

# 'ID' is an identifier, so it is left unscaled; everything else is standardised
cols_except = ['ID']
columns_to_scale = df.columns.difference(cols_except)

# fit the scaler on the feature columns, then write the standardised values into a fresh copy
df_scaled = df.copy()
scaler = StandardScaler()
scaler.fit(df_scaled[columns_to_scale])
df_scaled[columns_to_scale] = scaler.transform(df_scaled[columns_to_scale])

# display the first few rows of the scaled data
df_scaled.head()

In [None]:
# use the customer ID as the index so it is excluded from the PCA input.
# Guarded so the cell can be re-run: an unconditional in-place set_index raises
# KeyError on a second execution because 'ID' is no longer a column.
if 'ID' in df_scaled.columns:
    df_scaled = df_scaled.set_index('ID')

# full PCA, kept to inspect how much variance each component explains
pca = PCA().fit(df_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# project the scaled features onto the leading principal components
n_components = 6
pca = PCA(n_components=n_components)
reduced_dim = pca.fit_transform(df_scaled)

# column names pc1..pcN are derived from n_components so the two cannot drift apart
df_scaled_pca = pd.DataFrame(
    data=reduced_dim,
    columns=[f'pc{i}' for i in range(1, n_components + 1)],
)

# checking the dataset
df_scaled_pca.head()

In [None]:
pca_df = df_scaled_pca.copy()

In [None]:
# darkgrid theme with a teal axes background
sns.set(rc={'axes.facecolor': '#78C3B0'}, style='darkgrid')

# single-colour palette for the plot
sns.set_palette(['green'])

# figure and axes for the elbow plot
fig, ax = plt.subplots(figsize=(12, 6))

# KMeans with a fixed seed; the visualizer refits it for each candidate k
km = KMeans(random_state=42)

# elbow visualizer over the candidate k values given by k=(2, 10), without fit timings
visualizer = KElbowVisualizer(km, k=(2, 10), timings=False, locate_elbow=True)

# fit on the PCA-reduced data
visualizer.fit(df_scaled_pca)

# finalize and render the figure
visualizer.show()

In [None]:
# two-cluster KMeans on the PCA components
kmeans = KMeans(n_clusters=2, n_init=20, max_iter=300, random_state=180).fit(df_scaled_pca)

# store each row's cluster label alongside its PCA coordinates
df_scaled_pca['labels2'] = kmeans.labels_

# cluster sizes
Counter(kmeans.labels_)

In [None]:
colors = ['#e8000b','#33ff36']

In [None]:
sns.set(style='darkgrid', rc={'axes.facecolor': '#78C3B0'})

# one frame per cluster label
cluster_0 = df_scaled_pca.loc[df_scaled_pca['labels2'] == 0]
cluster_1 = df_scaled_pca.loc[df_scaled_pca['labels2'] == 1]

# 3D scatter of the first three principal components, one trace per cluster
fig = go.Figure()

for cluster_points, marker_color, trace_name in (
    (cluster_0, colors[0], 'Cluster 0'),
    (cluster_1, colors[1], 'Cluster 1'),
):
    fig.add_trace(
        go.Scatter3d(
            x=cluster_points['pc1'],
            y=cluster_points['pc2'],
            z=cluster_points['pc3'],
            mode='markers',
            marker=dict(color=marker_color, size=5, opacity=0.4),
            name=trace_name,
        )
    )

# the three axes share the same background and grid colours
axis_style = dict(backgroundcolor="#00C795", gridcolor='white')

fig.update_layout(
    title=dict(text='3D Visualization of Customer Clusters in PCA Space', x=0.5),
    scene=dict(
        xaxis=dict(title='PC1', **axis_style),
        yaxis=dict(title='PC2', **axis_style),
        zaxis=dict(title='PC3', **axis_style),
    ),
    width=900,
    height=800,
)

# show the plot
fig.show()

In [None]:
# separate the cluster labels from the PCA coordinates they were fitted on
label = df_scaled_pca['labels2']
X = df_scaled_pca.drop(columns='labels2')
num_observations = len(df_scaled_pca)

# silhouette score of the two-cluster solution
sil_score = silhouette_score(X, label)

In [None]:
# summary table of the clustering metrics
metric_names = ["Number of Observations", "Silhouette Score"]
metric_values = [num_observations, sil_score]
table_data = [[metric, value] for metric, value in zip(metric_names, metric_values)]

print(tabulate(table_data, headers=["Metric", "Value"], tablefmt='pretty'))

#### Choosing the model for Clusters

In [None]:
# attach each customer's cluster label to the unscaled data. Assignment aligns on the
# index; data was re-indexed from 0 after outlier removal and df_scaled_pca was built
# from an array, so both carry the same default 0..n-1 index.
# NOTE(review): stored as object dtype — presumably so plotly treats it as a discrete category; confirm
data['labels2'] = df_scaled_pca['labels2'].astype('object')

In [None]:
import plotly.express as px
import plotly.io as pio

# total expenditure against number of transactions, one facet row per cluster label
axis_labels = {
    'labels2': 'Labels',
    'Num_of_Transactions': 'Number of Transactions',
    'Total_Expenditure': 'Total Expenditure',
}

barchart = px.bar(
    data,
    x='Num_of_Transactions',
    y='Total_Expenditure',
    color='labels2',
    facet_row='labels2',
    barmode='overlay',
    orientation='v',
    opacity=1,
    labels=axis_labels,
    title='Relationship of Transaction and Expenditure',
    width=900,
    height=500,
    template='plotly_dark',
)

# no bar outlines, small white font, no grid lines
barchart.update_traces(marker_line_width=0)
barchart.update_layout(
    legend=dict(x=10, y=1.0),
    font=dict(size=9, color="white"),
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)
pio.show(barchart)


Cluster 0: top-tier customers — both revenue and number of transactions are high.

Cluster 1: contributes less revenue and makes fewer transactions.

#### Experimentation with Clusters

In [None]:
import plotly.express as px
import plotly.io as pio

# total expenditure per education level, bars grouped by cluster label
barchart = px.bar(
    data,
    x='Education',
    y='Total_Expenditure',
    color='labels2',
    hover_name='Age',
    barmode='group',
    template='plotly_dark',
    title='Relationship of Education and Total_Expenditure',
    width=900,
    height=500,
)

# remove the outline around each bar
barchart.update_traces(selector=dict(type="bar"), marker_line_width=0)

pio.show(barchart)

In [None]:
data.groupby('Education')['Total_Expenditure'].sum()

##### From the above graph and code output it is evident that more than half of the revenue (transaction value) comes from Graduates, and the larger share of these customers belongs to cluster 0. After Graduates, the next largest shares of revenue come from customers who hold a Master's degree or a PhD.

In [None]:
import plotly.express as px
import plotly.io as pio

# income against total expenditure, coloured by cluster label.
# The title and label mapping now describe the plotted columns: the original title
# was copied from the transactions chart and the mapping used a 'labels3' key that
# matches no column, so the legend was never renamed.
scatter = px.scatter(
    data_frame = data,
    x = 'Income',
    y = 'Total_Expenditure',
    color = 'labels2',
    opacity = 1,
    orientation = 'v',
    labels = {'labels2': 'Labels', 'Income': 'Income',
              'Total_Expenditure':'Total Expenditure'},
    title='Relationship of Income and Expenditure',
    width= 900, height= 500,
    template= 'plotly_dark'
)
scatter.update_layout(legend = {'x':10, 'y':1.0},font=dict(size=9,color="white"))
pio.show(scatter)

Cluster 0: Higher income and spending: top tier customers.

Cluster 1: Lower income and lower spending: less profitable customers.

In [None]:
# total expenditure per household size, bars grouped by cluster label.
# The title now names the plotted x column; the original was copied from the
# education chart and said "Education".
barchart = px.bar(
    data_frame = data,
    x = 'Members',
    y = 'Total_Expenditure',
    color = 'labels2',
    barmode ='group',
    title='Relationship of Members and Total_Expenditure',
    width= 900, height= 500,
    template= 'plotly_dark',
    hover_name='Age', 
)

# remove the outline around each bar
barchart.update_traces(marker_line_width = 0,
                  selector=dict(type="bar"))

pio.show(barchart)

Cluster 0: Couples and singles are the most revenue-generating customers, and they make up the larger portion of this cluster.

Cluster 1: A mix of household sizes, peaking at three-member families. Generates less revenue.

In [None]:
import plotly.express as px
import plotly.io as pio

# enrollment duration against total expenditure, coloured by cluster label.
# The title and label mapping now describe the plotted columns: the original title
# was copied from the transactions chart and the mapping used a 'labels3' key that
# matches no column, so the legend was never renamed.
scatter = px.scatter(
    data_frame = data,
    x = 'Enrollment_Duration',
    y = 'Total_Expenditure',
    color = 'labels2',
    opacity = 1,
    orientation = 'v',
    labels = {'labels2': 'Labels', 'Enrollment_Duration': 'Enrollment Duration (days)',
              'Total_Expenditure':'Total Expenditure'},
    title='Relationship of Enrollment Duration and Expenditure',
    width= 900, height= 500,
    template= 'plotly_dark'
)
scatter.update_layout(legend = {'x':10, 'y':1.0},font=dict(size=9,color="white"))
pio.show(scatter)

### PROFILING THE CLUSTERS

As a result of the cluster analysis, we obtained two groups of buyers (clusters):

Cluster label 0:

High income people (average income equals approximately 69000)

Average age is 46 years

Most of the customers whose highest education level is Graduation belong to this cluster.

Some have families with children, but single people and couples are concentrated in this cluster.

Purchase frequency is higher. Purchase volume is also higher. Responds to campaign programs. The average time between transactions is 18 days.

Cluster label 1:

Average income equals 37000 which is quite lower than cluster 0.

Average age is 42 years.

Have an education (Graduation, Masters, PhD)

Purchase frequency is lower. Average expenditure is considerably lower than in cluster 0. The average time between purchases is around 42 days. This group of customers does not purchase frequently. They are not loyal or profitable for the company.

In [None]:
data.head(5)

### MODEL BUILDING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer
import pickle

In [None]:
# cluster label as an integer target; all remaining columns except the ID are features
data['labels2'] = data['labels2'].astype(int)
y = data['labels2']
X = data.drop(columns=['ID', 'labels2'])

In [None]:
# hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# candidate classifiers for predicting the cluster label.
# The tree-based models are seeded so their scores are repeatable, matching the
# fixed seeds already used for the train/test split and the CV folds.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

In [None]:
from sklearn.pipeline import make_pipeline

# best model so far, selected by mean cross-validated accuracy
best_model = None
best_name = None
best_score = 0

# stratified 5-fold cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    # X holds the raw (unscaled) features, while the hold-out fit below uses scaled
    # data. Scaling is therefore done inside a pipeline so that each fold is
    # standardised with statistics from its own training part, and the CV score is
    # comparable with the hold-out score. cross_val_score clones the estimator,
    # so `model` itself is still unfitted after this call.
    cv_scores = cross_val_score(make_pipeline(StandardScaler(), model), X, y, cv=cv, scoring='accuracy')

    # average accuracy across folds
    mean_score = np.mean(cv_scores)
    print(f"Model: {name}")
    print(f"Cross-validated Accuracy: {mean_score:.2f}")

    # fit on the scaled training split and evaluate on the scaled hold-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    print(confusion_matrix(y_test, y_pred))
    print()

    # remember the model with the highest cross-validated accuracy
    # (these variables were initialised but never updated before)
    if mean_score > best_score:
        best_score = mean_score
        best_name = name
        best_model = model

print(f"Best model by cross-validated accuracy: {best_name} ({best_score:.2f})")

#### Using Random Forest For Training

In [None]:
# final classifier; seeded so the persisted model is the same on every run
RFmodel = RandomForestClassifier(random_state=42)

In [None]:
# train on the scaled training split, then persist the fitted model to disk
RFmodel.fit(X_train, y_train)
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(RFmodel, model_file)

In [None]:
# persist the fitted scaler so new inputs can be standardised the same way
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# reload the model written earlier in this notebook.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.
with open("best_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# a single hypothetical customer, with the same feature columns in the same order as
# the training features
new_data = pd.DataFrame({
    "Age": [25],
    "Education": [2],
    "Members": [2],
    "Income": [56000],
    "Last_Purchase": [300],
    "Enrollment_Duration": [30],
    "Campaign_Acceptance": [1],
    "Average_Days": [20.0],
    "Average_Expenditure": [15.0],
    "Num_of_Transactions": [15],
    "Total_Expenditure": [350],
    "Complain": [0]
})

# standardise with the scaler fitted on the training split; the result goes in its own
# variable so the raw input frame is not overwritten
new_data_scaled = scaler.transform(new_data)
new_prediction = loaded_model.predict(new_data_scaled)
print("Prediction for new data:", new_prediction)

# predict() returns an array with one label per input row. The single row's label is
# read explicitly; testing the whole array in an `if` raises as soon as more than
# one row is passed.
predicted_cluster = new_prediction[0]

if predicted_cluster == 0:
    print("High income people (average income equals approximately 69000)\nAverage age is 46 years\nMost of the customers who are graduate only belongs to here.\nHave a family with offsprings but single pe ople and couples are concentrated in this cluster.\nPurchase frequency is higher.\nPurchase volume is also higher.\nResponds campaining programs.\nAverage days take for each transaction 18 days.")
else:
    print("Average income equals 37000 which is quite lower than cluster 0.\nAverage age is 42 years.\nHave an education (Graduation, Masters, PhD)\nPurchase frequency is lower. Average expenditure is quite lower than cluster 0. Average days take for one purchase is 42 around . These group of customers do not purchase so frequently. They are not loyal or profitable for company.")