In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [None]:
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Ask for a file upload when the notebook runs on Google Colab.
# Outside Colab the google.colab package cannot be imported; in that case
# `uploaded` becomes an empty mapping so the following cells still run and the
# workbook is read from the current working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

In [None]:
# Echo the name of every entry in `uploaded`
for uploaded_name in uploaded:
    print(f'Uploaded file: {uploaded_name}')

In [None]:
# Load the Excel workbook from the working directory into a DataFrame
data = pd.read_excel(io='marketing_campaign.xlsx')

In [None]:
# Show the loaded DataFrame as this cell's output
data

In [None]:
# Print the first 5 rows of the DataFrame
print(data.head(n=5))

In [None]:
# Print the final 5 rows of the DataFrame
print(data.tail(n=5))

In [None]:
# List the column labels of the dataset
column_labels = data.columns
print(column_labels)

In [None]:
# Show the dtype of every column in the dataset
column_dtypes = data.dtypes
print(column_dtypes)

In [None]:
# Display basic information about the dataset.
# info() writes its report itself and returns None, so it is called directly
# instead of being wrapped in print(), which would append a stray "None" line.
data.info()

In [None]:
# Print the describe() summary table for the dataset
describe_table = data.describe()
print(describe_table)

In [None]:
# Summary statistics extended with an extra 'mode' row
first_mode_row = data.mode().iloc[0]  # first row of the mode table
statistics = data.describe()
statistics.loc['mode'] = first_mode_row
print(statistics)

In [None]:
# Handle missing values in 'Income' by filling them with the column median.
# Assign the result back to the column: calling fillna(inplace=True) on the
# selected column is a chained in-place update, which pandas deprecates and
# which leaves `data` unchanged when Copy-on-Write is enabled.
data['Income'] = data['Income'].fillna(data['Income'].median())

In [None]:
# Parse 'Dt_Customer' into pandas datetime values.
# NOTE(review): if the column arrives as day-first strings, the default
# parsing may swap day and month — confirm the source format.
data['Dt_Customer'] = data['Dt_Customer'].pipe(pd.to_datetime)

In [None]:
# `data` is assumed to already hold the merged records from all sources.
# Preview the first 5 rows.
data.head(n=5)

In [None]:
# Feature segmentation for RFM analysis
# Assume today is the last date in the dataset: take the most recent
# 'Dt_Customer' value as the reference date rather than a hard-coded calendar
# date, so the code matches the stated assumption for whatever file is loaded.
today = data['Dt_Customer'].max()

In [None]:
# Recency: whole days between the reference date and each row's 'Dt_Customer'.
# NOTE(review): if the source file already ships a 'Recency' column, this
# assignment replaces it — confirm that is intended.
elapsed = today - data['Dt_Customer']
data['Recency'] = elapsed.dt.days

In [None]:
# Frequency: row-wise total of the three purchase-count columns
purchase_count_cols = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
data['Frequency'] = data[purchase_count_cols].sum(axis='columns')

In [None]:
# Monetary: row-wise total of the six 'Mnt*' amount columns
amount_cols = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
data['Monetary'] = data[amount_cols].sum(axis='columns')

In [None]:
# Preview the ID alongside the freshly computed RFM columns
data.loc[:, ['ID', 'Recency', 'Frequency', 'Monetary']].head()

In [None]:
# Standardize the three RFM columns with a freshly fitted StandardScaler
rfm_features = data[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
rfm_scaled = scaler.fit(rfm_features).transform(rfm_features)

In [None]:
# One-hot encode 'Education' and 'Marital_Status', dropping the first level of each
categorical_cols = ['Education', 'Marital_Status']
data = pd.get_dummies(data=data, columns=categorical_cols, drop_first=True)

In [None]:
# Exploratory Data Analysis: histogram of Recency (30 bins)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Recency'], bins=30, ax=ax)
ax.set_title('Distribution of Recency')
plt.show()

In [None]:
# Histogram of Frequency (30 bins)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Frequency'], bins=30, ax=ax)
ax.set_title('Distribution of Frequency')
plt.show()

In [None]:
# Histogram of Monetary (30 bins)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Monetary'], bins=30, ax=ax)
ax.set_title('Distribution of Monetary')
plt.show()

In [None]:
# Pairwise scatter/histogram grid over the three RFM columns
rfm_columns = ['Recency', 'Frequency', 'Monetary']
sns.pairplot(data[rfm_columns])
plt.show()

In [None]:
# Correlation analysis: annotated heatmap of the RFM correlation matrix
corr = data[['Recency', 'Frequency', 'Monetary']].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Elbow method: record the K-means inertia (SSE) for k = 1..10 clusters
cluster_counts = range(1, 11)
sse = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(rfm_scaled)
    sse.append(kmeans.inertia_)

In [None]:
# Draw the elbow curve: SSE versus number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 11), sse, marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('SSE')
ax.set_title('Elbow Method')
plt.show()

In [None]:
# Fit the final K-means model; 4 clusters is the value assumed from the elbow plot
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42).fit(rfm_scaled)
# Store each row's assigned cluster label on the frame
data['Cluster'] = kmeans.labels_

In [None]:
# Preview the RFM columns together with the assigned cluster label
data.loc[:, ['ID', 'Recency', 'Frequency', 'Monetary', 'Cluster']].head()

In [None]:
# Per-cluster profile: mean Recency/Frequency/Monetary plus the number of IDs
cluster_summary = (
    data.groupby('Cluster')
    .agg(
        Recency=('Recency', 'mean'),
        Frequency=('Frequency', 'mean'),
        Monetary=('Monetary', 'mean'),
        CustomerCount=('ID', 'count'),
    )
    .reset_index()
)

print(cluster_summary)

In [None]:
# Scatter Recency against Monetary, colored by cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='Recency', y='Monetary', hue='Cluster', palette='Set1', ax=ax)
ax.set_title('Customer Segments by Recency and Monetary')
plt.show()

In [None]:
# Quartile-based RFM scores.
# Recency is labelled in reverse (lowest quartile -> 4); Frequency is ranked
# with method='first' before binning; Monetary is binned on its raw values.
ascending_labels = [1, 2, 3, 4]
descending_labels = [4, 3, 2, 1]
frequency_rank = data['Frequency'].rank(method='first')
data['R_Score'] = pd.qcut(data['Recency'], q=4, labels=descending_labels)
data['F_Score'] = pd.qcut(frequency_rank, q=4, labels=ascending_labels)
data['M_Score'] = pd.qcut(data['Monetary'], q=4, labels=ascending_labels)

In [None]:
# Concatenate the three scores, as text, into one 'RFM_Score' code
r_text = data['R_Score'].astype(str)
f_text = data['F_Score'].astype(str)
m_text = data['M_Score'].astype(str)
data['RFM_Score'] = r_text + f_text + m_text

In [None]:
# Preview the RFM columns together with the combined score
data.loc[:, ['ID', 'Recency', 'Frequency', 'Monetary', 'RFM_Score']].head()

In [None]:
# Per-RFM_Score profile: mean Recency/Frequency/Monetary plus the number of IDs
rfm_summary = (
    data.groupby('RFM_Score')
    .agg(
        Recency=('Recency', 'mean'),
        Frequency=('Frequency', 'mean'),
        Monetary=('Monetary', 'mean'),
        CustomerCount=('ID', 'count'),
    )
    .reset_index()
)

print(rfm_summary)

In [None]:
# Scatter Frequency against Monetary, colored by RFM_Score (legend suppressed)
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=data,
    x='Frequency',
    y='Monetary',
    hue='RFM_Score',
    palette='tab20',
    legend=None,
    ax=ax,
)
ax.set_title('RFM Segments by Frequency and Monetary')
plt.show()

In [None]:
# Re-fit a StandardScaler on the three RFM columns and standardize them
rfm_features = data[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
rfm_scaled = scaler.fit(rfm_features).transform(rfm_features)

In [None]:
# Build and train a K-means clustering model
from sklearn.cluster import KMeans

In [None]:
# Elbow method: record the K-means inertia (SSE) for k = 1..10 clusters
cluster_counts = range(1, 11)
sse = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(rfm_scaled)
    sse.append(kmeans.inertia_)

In [None]:
# Fit the final K-means model; 4 clusters is the value assumed from the elbow plot
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42).fit(rfm_scaled)
# Store each row's assigned cluster label on the frame
data['Cluster'] = kmeans.labels_

In [None]:
# Per-cluster profile: mean Recency/Frequency/Monetary plus the number of IDs
cluster_summary = (
    data.groupby('Cluster')
    .agg(
        Recency=('Recency', 'mean'),
        Frequency=('Frequency', 'mean'),
        Monetary=('Monetary', 'mean'),
        CustomerCount=('ID', 'count'),
    )
    .reset_index()
)

In [None]:
# Print the per-cluster summary table
print(cluster_summary)

In [None]:
# Install Flask with the %pip magic so the package lands in the environment
# of the running kernel.
%pip install flask

In [None]:
%%writefile app.py
"""Flask service that maps a customer's RFM values to a K-means cluster.

This cell is written to app.py (the file the packaging step bundles) instead
of being executed inside the notebook.
"""
import os
import pickle
import sys

import numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)

# A PyInstaller --onefile bundle unpacks files added with --add-data into the
# directory named by sys._MEIPASS; when running from source, look next to
# this file instead.
BASE_DIR = getattr(sys, '_MEIPASS', os.path.dirname(os.path.abspath(__file__)))

# Feature order must match the column order the scaler and model were fitted on.
FEATURES = ('Recency', 'Frequency', 'Monetary')


def _load_pickle(filename):
    """Return the object unpickled from `filename` inside BASE_DIR."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only ship artifacts produced by a trusted training run.
    with open(os.path.join(BASE_DIR, filename), 'rb') as f:
        return pickle.load(f)


# Load the trained model
kmeans = _load_pickle('kmeans_model.pkl')

# Load the scaler
scaler = _load_pickle('scaler.pkl')


@app.route('/predict', methods=['POST'])
def predict():
    """Predict the cluster for one customer.

    Expects a JSON object with numeric 'Recency', 'Frequency' and 'Monetary'
    fields. Returns {'Cluster': <int>} on success, or a 400 response with an
    'error' message when the body is not a JSON object, a field is missing,
    or a value is not a finite number.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    missing = [name for name in FEATURES if name not in data]
    if missing:
        return jsonify({'error': f'Missing fields: {missing}'}), 400

    try:
        rfm = np.array([float(data[name]) for name in FEATURES]).reshape(1, -1)
    except (TypeError, ValueError):
        return jsonify({'error': 'Recency, Frequency and Monetary must be numeric'}), 400
    if not np.isfinite(rfm).all():
        return jsonify({'error': 'Recency, Frequency and Monetary must be finite'}), 400

    rfm_scaled = scaler.transform(rfm)
    cluster = kmeans.predict(rfm_scaled)
    return jsonify({'Cluster': int(cluster[0])})


if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which allows code execution
    # from the browser; keep it off in the packaged application.
    app.run(debug=False)


In [None]:
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Persist the already-fitted model and scaler, one pickle file per object
artifacts = (
    ('kmeans_model.pkl', kmeans),
    ('scaler.pkl', scaler),
)
for artifact_path, fitted_object in artifacts:
    with open(artifact_path, 'wb') as sink:
        pickle.dump(fitted_object, sink)

In [None]:
# Bundle app.py and both pickled artifacts into a single executable.
# The leading "!" hands the line to the system shell; without it the cell is
# parsed as Python and fails with a SyntaxError.
!pyinstaller --onefile --add-data "kmeans_model.pkl:." --add-data "scaler.pkl:." app.py