In [1]:
import pandas as pd
# Load the raw order table once; the cells below read it through `data`.
data = pd.read_csv('Cluster_Customer.csv')

# Prepare the data

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# Work on a copy so the raw `data` frame stays untouched for the later cells.
df = data.copy()

# Decide train/test membership *before* anything is fitted, so the scaler below
# learns its min/max from training rows only (fitting it on every row would leak
# test-set statistics into the features and into the saved scaler).
# Row positions are split rather than the frames themselves; test_size and
# random_state are the same values the final split has always used.
train_pos, test_pos = train_test_split(
    list(range(len(df))), test_size=0.2, random_state=42
)

# 1. Convert the date column
# Parse 'Order Date' as datetime; values that cannot be parsed become NaT.
df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce')

# Extract calendar features from 'Order Date'
df['Year'] = df['Order Date'].dt.year
df['Month'] = df['Order Date'].dt.month
df['Day'] = df['Order Date'].dt.day
df['Weekday'] = df['Order Date'].dt.weekday

# 2. Normalize numerical columns
# Fit on the training rows, then apply the same transform to every row.
# Test rows may therefore fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
numerical_columns = ['Quantity Ordered', 'Price Each', 'Sales', 'Hour']
scaler.fit(df[numerical_columns].iloc[train_pos])
df[numerical_columns] = scaler.transform(df[numerical_columns])

# 3. Encode categorical columns
# Label encoding for binary categorical columns
binary_columns = ['Is Expensive', 'Holiday Season', 'Repeat Customer']
for col in binary_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# One-hot encode 'Product Category', 'Region' and 'City' (first level dropped)
df = pd.get_dummies(df, columns=['Product Category', 'Region', 'City'], drop_first=True)

# 4. Handle missing values (if any)
# This also zero-fills the date features of rows whose 'Order Date' was NaT.
df = df.fillna(0)

# 5. Split the data into training and testing sets
# Drop the target and the columns that are not used as features.
X = df.drop(columns=['Sales', 'Order Date', 'Order ID', 'Purchase Address'])
y = df['Sales']

# None of the steps above add, remove or reorder rows, so the positions chosen
# at the top still address the same records.
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Save the fitted scaler for future use
import joblib
joblib.dump(scaler, 'numerical_scaler.pkl')


['numerical_scaler.pkl']

# 1. Forecast future revenue or order quantity (LSTM)

In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing
# Refit the existing `scaler` on the Sales column alone and write the scaled
# values back into `data`.
# NOTE(review): this overwrites the raw Sales values in place. Running the cell
# a second time fits the scaler on already-scaled values, so the scaler saved
# at the end of this cell would no longer map back to the original units.
# Reload `data` before re-running.
sales_column = data[['Sales']]
data['Sales'] = scaler.fit_transform(sales_column)

# Create time samples for LSTM
def create_sequences(data, seq_length=10):
    """Cut a series into overlapping windows and their next-step targets.

    Args:
        data: Array-like of values in sequence order (the first axis is the
            sequence axis).
        seq_length: Number of consecutive values in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X[i]`` equals
        ``data[i:i + seq_length]`` and ``y[i]`` equals ``data[i + seq_length]``.
        With ``n = len(data) - seq_length`` samples, ``X`` has shape
        ``(n, seq_length, ...)`` and ``y`` has shape ``(n, ...)``. When the
        input is too short to yield a sample, both arrays are empty but keep
        those shapes, so ``X.shape[1]`` is still ``seq_length``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    values = np.asarray(data)
    n_samples = len(values) - seq_length
    if n_samples <= 0:
        # Not enough points for a single (window, target) pair.
        trailing = values.shape[1:]
        return (
            np.empty((0, seq_length) + trailing, dtype=values.dtype),
            np.empty((0,) + trailing, dtype=values.dtype),
        )

    # One strided view replaces the per-row Python loop. The window axis comes
    # out last, so move it next to the sample axis.
    windows = np.lib.stride_tricks.sliding_window_view(values, seq_length, axis=0)
    windows = np.moveaxis(windows, -1, 1)

    # The final window has no following value to predict, so it is dropped.
    # Copies are returned because the strided view is read-only.
    return windows[:n_samples].copy(), values[seq_length:].copy()

# Each training sample is a window of `seq_length` consecutive Sales values;
# the target is the value that follows the window.
seq_length = 10
X, y = create_sequences(data['Sales'].values, seq_length)

# LSTM layers expect (samples, timesteps, features): add a feature axis of size 1.
X = X[:, :, np.newaxis]

# Build the LSTM model: two stacked LSTM layers with dropout in between,
# followed by a single linear output unit.
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(seq_length, 1)))
model.add(Dropout(0.2))
model.add(LSTM(25))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')

# Stop once the training loss has not improved for 5 epochs and roll back to
# the best weights seen. Only the training loss is monitored here.
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
model.fit(X, y, batch_size=32, epochs=30, callbacks=[early_stop])

# Save the model together with the scaler that was fitted on Sales
model.save('sales_forecast_lstm_model.keras')
joblib.dump(scaler, 'sales_scaler.pkl')


Epoch 1/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 16ms/step - loss: 0.0096
Epoch 2/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 16ms/step - loss: 0.0097
Epoch 3/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 16ms/step - loss: 0.0096
Epoch 4/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 16ms/step - loss: 0.0095
Epoch 5/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 23ms/step - loss: 0.0095
Epoch 6/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 16ms/step - loss: 0.0097
Epoch 7/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 15ms/step - loss: 0.0096
Epoch 8/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 16ms/step - loss: 0.0096
Epoch 9/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 16ms/step - loss: 0.0096
Epoch 10/30
[1m5811/5811[0m [32m━━━━━━━━━━━━━━━━━━

['sales_scaler.pkl']

# 2. Customer segmentation and product recommendation (Autoencoder)

In [4]:
import pandas as pd
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import joblib

# Re-read the CSV into `df` (this replaces the `df` built in the data-preparation cell)
df = pd.read_csv('Cluster_Customer.csv')

# Keep only the float64 / int64 columns; everything else is ignored here
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
df_numeric = df.loc[:, numeric_columns]

# Min-max scale every numeric column and keep the result as a labelled DataFrame
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_numeric), columns=numeric_columns)

# Build the Autoencoder model: input -> 16-unit ReLU layer -> sigmoid
# reconstruction with the same width as the input
input_dim = len(df_scaled.columns)

input_layer = Input(shape=(input_dim,))
encoder = Dense(16, activation='relu')(input_layer)  # reduced number of units
decoder = Dense(input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(loss='mse', optimizer='adam')

# Stop when the training loss has stalled for 5 epochs; keep the best weights
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Train the model to reproduce its own input
autoencoder.fit(x=df_scaled, y=df_scaled, batch_size=64, epochs=50, callbacks=[early_stop])

# Save the model and the fitted scaler
autoencoder.save('customer_segmentation_autoencoder.keras')
joblib.dump(scaler, 'customer_scaler.pkl')


Epoch 1/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0374
Epoch 2/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.0013
Epoch 3/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 7.9163e-04
Epoch 4/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 6.5107e-04
Epoch 5/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 5.8860e-04
Epoch 6/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 5.6230e-04
Epoch 7/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 5.5363e-04
Epoch 8/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 5.5092e-04
Epoch 9/50
[1m2906/2906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 5.5025e-04
Epoch 10/50
[1m2906/2906[0m [32m━━━━━━━━━━

['customer_scaler.pkl']

# 3. Classify potential customers for marketing campaigns (CNN)

In [5]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv1D, Flatten, Dropout, Dense, Input
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

# joblib is imported here so this cell does not rely on an earlier cell's import.
import joblib

# Select input columns
X = data['Product']  # the product name is the only input feature
y = (data['Is Expensive'] == 'Yes').astype(int)  # 1 where 'Is Expensive' equals 'Yes', otherwise 0

# Label encoding for the 'Product' column: each distinct product gets an integer code.
# NOTE(review): these codes carry no real order, yet the network receives them
# as a numeric value. Consider a one-hot or embedding input instead.
label_encoder = LabelEncoder()
X_encoded = label_encoder.fit_transform(X)

# Split the dataset (one feature column, 20% held out)
X_train, X_test, y_train, y_test = train_test_split(X_encoded.reshape(-1, 1), y, test_size=0.2, random_state=42)

# Transform data for CNN: Conv1D expects (samples, steps, channels)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build the CNN model
model = Sequential([
    Input(shape=(X_train.shape[1], 1)),
    Conv1D(16, 1, activation='relu'),  # kernel size 1 because each sample has a single step
    Dropout(0.1),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=64)

# Save the model
model.save('marketing_campaign_cnn_model.keras')

# Save the fitted encoder as well, as the other cells do for their scalers.
# Without it the product-to-code mapping is lost and the saved model cannot be
# fed new products.
joblib.dump(label_encoder, 'marketing_campaign_label_encoder.pkl')


Epoch 1/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - accuracy: 0.9745 - loss: 0.1469
Epoch 2/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9746 - loss: 0.0936
Epoch 3/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9759 - loss: 0.0659
Epoch 4/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9959 - loss: 0.0235
Epoch 5/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9985 - loss: 0.0121
Epoch 6/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9988 - loss: 0.0100
Epoch 7/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9988 - loss: 0.0094
Epoch 8/20
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9988 - loss: 0.0078
Epoch 9/20
[1m2325/

# 4. Sentiment analysis and purchase behavior (Embedding + LSTM)

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# joblib is imported here so this cell does not rely on an earlier cell's import.
import joblib

# Vocabulary size and padded length are shared by the tokenizer, the padding
# step and the Embedding layer. Defining them once keeps the three in agreement.
VOCAB_SIZE = 10000
MAX_LEN = 50

# Data: product names as text input, 'Is Expensive' as the label
X = data['Product'].values
y = data['Is Expensive'].values

# Convert labels y to integer class ids
le = LabelEncoder()
y = le.fit_transform(y)

# Convert text to integer token sequences
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad every sequence to a fixed length of MAX_LEN, then hold out 20% for testing
X_sequences = pad_sequences(X_sequences, maxlen=MAX_LEN)
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y, test_size=0.2, random_state=42)

# Build the model: token embedding -> dropout -> LSTM -> sigmoid output
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_LEN),
    SpatialDropout1D(0.2),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=64)

# Save the model
model.save('customer_behavior_lstm_model.keras')

# Save the fitted tokenizer and label encoder as well, as the other cells do
# for their scalers. Without them the saved model cannot turn new text into
# token ids or map its output back to the original labels.
joblib.dump(tokenizer, 'customer_behavior_tokenizer.pkl')
joblib.dump(le, 'customer_behavior_label_encoder.pkl')


Epoch 1/5
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 39ms/step - accuracy: 0.9900 - loss: 0.0374
Epoch 2/5
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 38ms/step - accuracy: 0.9994 - loss: 0.0048
Epoch 3/5
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 41ms/step - accuracy: 0.9995 - loss: 0.0037
Epoch 4/5
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 40ms/step - accuracy: 0.9994 - loss: 0.0049
Epoch 5/5
[1m2325/2325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 39ms/step - accuracy: 0.9994 - loss: 0.0041
