In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Load the dataset
dataset = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual dataset filename

# Feature selection - Assuming 'Quantity' and 'TotalAmount' are relevant features
features = ['Quantity', 'TotalAmount']

# Create a target variable ('Revenue') based on Quantity * TotalAmount
dataset['Revenue'] = dataset['Quantity'] * dataset['TotalAmount']

# Initialize the models
models = {
    'Random Forest': RandomForestRegressor(),
    'Linear Regression': LinearRegression()
}

# Dropdown widget for selecting a customer
customer_dropdown = widgets.Dropdown(
    options=sorted(dataset['CustomerID'].unique()),
    value=dataset['CustomerID'].unique()[0],
    description='Select Customer:',
)

# Function to train, predict, and evaluate multiple models
def train_predict_evaluate_models(X, y):
    results = {}
    for name, model in models.items():
        model.fit(X, y)
        predictions = model.predict(X)
        mse = mean_squared_error(y, predictions)
        results[name] = {
            'model': model,
            'mse': mse
        }
    return results

# Function to update and display the plot based on the selected customer
def update_plot(selected_customer):
    plt.clf()  # Clear previous plot

    # Filter data for the specific customer
    customer_data = dataset[dataset['CustomerID'] == selected_customer]

    # Use only relevant features
    X = customer_data[features]
    y = customer_data['Revenue']

    # Train, predict, and evaluate models
    results = train_predict_evaluate_models(X, y)

    # Visualize predictions for the selected customer from each model
    for model_name, result in results.items():
        model = result['model']
        new_data = pd.DataFrame({
            'Quantity': customer_data['Quantity'].values,
            'TotalAmount': customer_data['TotalAmount'].values,
        })
        predictions = model.predict(new_data)
        plt.plot(predictions, marker='o', label=f'{model_name} - MSE: {result["mse"]:.2f}')

    plt.title(f'Predicted Revenue for Customer {selected_customer} in the Next 6 Months')
    plt.xlabel('Month')
    plt.ylabel('Revenue')
    plt.legend()
    plt.show()
    return results

# Connect the dropdown to the update function
results = widgets.interactive(update_plot, selected_customer=customer_dropdown)
display(results)


interactive(children=(Dropdown(description='Select Customer:', options=(1, 2, 3, 4, 5), value=1), Output()), _…

<Figure size 800x550 with 0 Axes>

<Figure size 800x550 with 0 Axes>

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Load the dataset
dataset = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual dataset filename

# Feature selection - Assuming 'Quantity' and 'TotalAmount' are relevant features
features = ['Quantity', 'TotalAmount']

# Create a target variable ('Revenue') based on Quantity * TotalAmount
dataset['Revenue'] = dataset['Quantity'] * dataset['TotalAmount']

# Initialize the models
models = {
    'Random Forest': RandomForestRegressor(),
    'Linear Regression': LinearRegression()
}

# Dropdown widget for selecting a customer
customer_dropdown = widgets.Dropdown(
    options=sorted(dataset['CustomerID'].unique()),
    value=dataset['CustomerID'].unique()[0],
    description='Select Customer:',
)

# Function to train, predict, and evaluate multiple models
def train_predict_evaluate_models(X, y):
    results = {}
    for name, model in models.items():
        model.fit(X, y)
        predictions = model.predict(X)
        mse = mean_squared_error(y, predictions)
        results[name] = {
            'model': model,
            'mse': mse
        }
    return results

# Function to update and display the plot based on the selected customer
def update_plot(selected_customer):
    plt.clf()  # Clear previous plot

    # Filter data for the specific customer
    customer_data = dataset[dataset['CustomerID'] == selected_customer]

    # Use only relevant features
    X = customer_data[features]
    y = customer_data['Revenue']

    # Train, predict, and evaluate models
    results = train_predict_evaluate_models(X, y)

    # Find the best model based on MSE
    best_model_name = min(results, key=lambda k: results[k]['mse'])
    best_model = results[best_model_name]['model']

    # Visualize predictions for the selected customer using the best model
    new_data = pd.DataFrame({
        'Quantity': customer_data['Quantity'].values,
        'TotalAmount': customer_data['TotalAmount'].values,
    })
    predictions = best_model.predict(new_data)
    plt.plot(predictions, marker='o', label=f'{best_model_name} - MSE: {results[best_model_name]["mse"]:.2f}')

    plt.title(f'Predicted Revenue for Customer {selected_customer} in the Next 6 Months')
    plt.xlabel('Month')
    plt.ylabel('Revenue')
    plt.legend()
    plt.show()
    return best_model

# Connect the dropdown to the update function
best_model = widgets.interactive(update_plot, selected_customer=customer_dropdown)
display(best_model)


interactive(children=(Dropdown(description='Select Customer:', options=(1, 2, 3, 4, 5), value=1), Output()), _…

In [3]:
import pandas as pd
from pycaret.regression import *
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Load the dataset
url = 'your_dataset.csv'  # Replace with your dataset URL
dataset = pd.read_csv(url)

# Feature selection - Assuming 'Quantity' and 'TotalAmount' are relevant features
features = ['Quantity', 'TotalAmount']  # Include CustomerID for filtering

# Create a target variable ('Revenue') based on Quantity * TotalAmount
dataset['Revenue'] = dataset['Quantity'] * dataset['TotalAmount']

# Initialize PyCaret setup for regression
reg_setup = setup(data=dataset, target='Revenue', ignore_features=['PurchaseID', 'PurchaseDate'])

# Compare all regression models
best_model = compare_models(fold=5, sort='RMSE')  # Change the sorting metric if needed

# Dropdown widget for selecting a customer
customer_dropdown = widgets.Dropdown(
    options=sorted(dataset['CustomerID'].unique()),
    value=dataset['CustomerID'].unique()[0],
    description='Select Customer:'
)

# Function to update and display the plot based on the selected customer
def update_plot(selected_customer):
    plt.clf()  # Clear previous plot

    # Filter data for the specific customer
    customer_data = dataset[dataset['CustomerID'] == selected_customer]

    # Use only relevant features
    new_data = customer_data[features]

    # Make predictions for the next 6 months for the selected customer
    predictions = predict_model(best_model, data=new_data)

    # Create a range of months (assuming 6 months) for x-axis
    months = range(1, 7)  # Adjust as needed for the number of predicted months

    # Visualize predictions for the selected customer
    plt.plot(months, predictions, marker='o', label=f'Customer {selected_customer}')

    plt.title(f'Predicted Revenue for Customer {selected_customer} in the Next 6 Months')
    plt.xlabel('Month')
    plt.ylabel('Revenue')
    plt.legend()
    plt.show()

# Connect the dropdown to the update function
widgets.interactive(update_plot, selected_customer=customer_dropdown)


Unnamed: 0,Description,Value
0,Session id,2236
1,Target,Revenue
2,Target type,Regression
3,Original data shape,"(50, 11)"
4,Transformed data shape,"(50, 26)"
5,Transformed train set shape,"(35, 26)"
6,Transformed test set shape,"(15, 26)"
7,Ignore features,2
8,Numeric features,4
9,Categorical features,4


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,92.0386,130435.9157,203.7372,-0.4459,0.7069,0.5443,0.062
par,Passive Aggressive Regressor,108.9668,182043.938,237.7266,-1.6469,0.5388,0.5959,0.046
knn,K Neighbors Regressor,137.7429,422686.3714,329.8277,-1.2774,0.6853,0.5325,0.07
lr,Linear Regression,210.4541,367937.687,379.3044,-48.486,1.4425,3.462,1.808
lar,Least Angle Regression,202.4801,394111.0325,380.3876,-48.2931,1.3692,3.1706,0.052
ridge,Ridge Regression,214.1923,384228.8207,386.4197,-48.4302,1.4625,3.511,0.052
et,Extra Trees Regressor,208.1129,438768.7608,389.6402,-37.7234,1.4217,2.9698,0.096
br,Bayesian Ridge,218.0945,401521.0659,393.7727,-48.5927,1.4849,3.5627,0.046
ada,AdaBoost Regressor,213.0233,452422.1646,399.8612,-43.4398,1.4686,3.1541,0.074
en,Elastic Net,222.9135,421278.0219,402.3255,-49.0381,1.517,3.6393,0.048


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

interactive(children=(Dropdown(description='Select Customer:', options=(1, 2, 3, 4, 5), value=1), Output()), _…

In [4]:
import pandas as pd
from pycaret.regression import *

import matplotlib.pyplot as plt
import ipywidgets as widgets

# Load dataset
dataset = pd.read_csv('your_dataset.csv')

# Select features  
features = ['CustomerID', 'Quantity', 'TotalAmount']

# Create target variable 
dataset['Revenue'] = dataset['Quantity'] * dataset['TotalAmount']  

# Set up regression model
reg_setup = setup(data = dataset, 
                  target = 'Revenue',
                  session_id = 123)

# Get best model
best_model = compare_models()

# Widget for customer selection
customer_dropdown = widgets.Dropdown(
    options = dataset['CustomerID'].unique(),
    description = 'Select Customer ID:'  
)

# Function to plot predictions
def plot_predictions(customer_id):
    
    # Filter data 
    data = dataset[dataset['CustomerID'] == customer_id]
    data = data[features]
    
    # Get predictions 
    predictions = predict_model(best_model, data=data) 
    
    # Plot 
    plt.plot(range(1,len(data)+len(predictions)+1), 
             list(data['Revenue']) + list(predictions['Label']),
             marker='o')

    plt.title(f'Predictions for Customer {customer_id}')
    plt.xlabel('Time Period')
    plt.ylabel('Revenue')
    plt.show()
    
# Connect widget 
widgets.interactive(plot_predictions, customer_id=customer_dropdown)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Revenue
2,Target type,Regression
3,Original data shape,"(50, 11)"
4,Transformed data shape,"(50, 28)"
5,Transformed train set shape,"(35, 28)"
6,Transformed test set shape,"(15, 28)"
7,Numeric features,5
8,Categorical features,5
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,42.4667,15409.6,62.8719,0.1923,0.5041,0.4112,0.05
par,Passive Aggressive Regressor,44.2727,5009.5534,54.2777,-1.2093,0.6091,0.82,0.04
et,Extra Trees Regressor,70.0446,20220.3058,90.0984,-1.5537,0.9777,1.5809,0.096
ada,AdaBoost Regressor,72.9681,20663.9468,92.6653,-1.7159,1.0039,1.6232,0.059
rf,Random Forest Regressor,68.3554,19940.1224,89.0806,-1.7247,0.9871,1.5834,0.092
gbr,Gradient Boosting Regressor,69.2154,20563.2378,90.4901,-1.7786,0.9993,1.579,0.052
huber,Huber Regressor,71.4658,20128.6114,91.8082,-1.7971,0.9879,1.5722,0.05
en,Elastic Net,72.0036,20204.4211,92.3924,-1.8589,0.994,1.5891,0.038
ridge,Ridge Regression,72.6475,20574.9395,93.237,-1.9103,1.0037,1.6069,0.04
lightgbm,Light Gradient Boosting Machine,72.9995,21030.7789,94.1607,-1.9608,1.015,1.621,0.055


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

interactive(children=(Dropdown(description='Select Customer ID:', options=(1, 2, 3, 4, 5), value=1), Output())…

In [5]:
import pandas as pd
from pycaret.regression import *
import matplotlib.pyplot as plt
import numpy as np

# Load sample data 
data = pd.DataFrame({
    'CustomerID': [1, 1, 1, 2, 2, 2],
    'Date': [
        '2023-01-01', '2023-02-01', '2023-03-01', 
        '2023-01-01', '2023-02-01', '2023-03-01'], 
    'Revenue': [100, 150, 200, 50, 75, 100]
})

# Convert Date to datetime
data['Date'] = pd.to_datetime(data['Date'])

# Setup regression  
reg_setup = setup(data, 
                  target = 'Revenue', 
                  session_id=123) 

# Best model
best_model = compare_models()

# Predict next 3 months revenue for each customer 
new_data = data.drop_duplicates(subset=['CustomerID']).tail(2)
new_data['Date'] = pd.date_range(start='2023-04-01', periods=3, freq='MS') 

predictions = predict_model(best_model, data=new_data)

# Actual vs predicted plot
customers = data['CustomerID'].unique()
for customer in customers:
    original_columns = dataset.columns
    data = dataset[dataset['CustomerID'] == customer_id][original_columns]
    
    cust_predictions = predictions[predictions['CustomerID'] == customer]
    
    plt.plot(cust_data['Date'], cust_data['Revenue'], marker='o', label='Actual')
    plt.plot(cust_predictions['Date'], cust_predictions['Prediction'], marker='x', label='Predicted')
    plt.title(f'Revenue Prediction for Customer {customer}')
    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Revenue')
    plt.show()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Revenue
2,Target type,Regression
3,Original data shape,"(6, 3)"
4,Transformed data shape,"(6, 5)"
5,Transformed train set shape,"(4, 5)"
6,Transformed test set shape,"(2, 5)"
7,Numeric features,1
8,Date features,1
9,Preprocess,True


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

ValueError: Length of values (3) does not match length of index (2)

In [5]:
import csv
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker to create fake data
fake = Faker()

# Open a CSV file in write mode
with open('customer_purchase_data.csv', mode='w', newline='') as file:
    # Create a CSV writer object
    writer = csv.writer(file)
    
    # Write the header row
    writer.writerow([
        'CustomerID', 'CustomerName', 'CustomerEmail',
        'ProductID', 'ProductName', 'Category',
        'PurchaseID', 'PurchaseDate', 'Quantity', 'TotalAmount'
    ])
    
    # Generate 100,000 rows of data
    for _ in range(500000):
        customer_id = fake.random_int(1000, 9999)
        customer_name = fake.name()
        customer_email = fake.email()
        product_id = fake.random_int(1, 500)
        product_name = fake.word()
        category = fake.random_element(['Electronics', 'Clothing', 'Home', 'Books', 'Beauty'])
        purchase_id = fake.uuid4()
        purchase_date = fake.date_time_between_dates(datetime(2017, 1, 1), datetime(2023, 12, 31)).strftime('%Y-%m-%d %H:%M:%S')
        quantity = fake.random_int(1, 10)
        total_amount = round(random.uniform(10, 500), 2)
        
        # Write the row of data to the CSV file
        writer.writerow([
            customer_id, customer_name, customer_email,
            product_id, product_name, category,
            purchase_id, purchase_date, quantity, total_amount
        ])
        
print("CSV file created successfully!")


CSV file created successfully!


In [6]:
import csv
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker to create fake data
fake = Faker()

# Define expanded products for each category
categories = {
    'Electronics': ['Laptop', 'Smartphone', 'Headphones', 'Tablet', 'Smartwatch', 'Camera', 'Gaming Console'],
    'Clothing': ['Shirt', 'Dress', 'Jeans', 'Sweater', 'Jacket', 'Skirt', 'Trousers'],
    'Home': ['Chair', 'Table', 'Lamp', 'Curtains', 'Rug', 'Sofa', 'Bed'],
    'Books': ['Fiction', 'Non-fiction', 'Thriller', 'Self-help', 'Biography', 'Science Fiction', 'Mystery'],
    'Beauty': ['Skincare', 'Makeup', 'Haircare', 'Fragrance', 'Tools', 'Nail Care', 'Bath & Body']
}

data = []

# Generate 100,000 rows of data
for _ in range(100000):
    customer_id = fake.random_int(1000, 9999)
    customer_name = fake.first_name() + ' ' + fake.last_name()
    customer_email = fake.email().replace('@', f'{customer_name.replace(" ", "").lower()}@')
    
    for _ in range(random.randint(1, 5)):
        category = fake.random_element(categories.keys())
        product_name = fake.random_element(categories[category])
        
        product_id = fake.random_int(1, 500)
        purchase_id = fake.uuid4()
        purchase_date = fake.date_time_between_dates(datetime(2017, 1, 1), datetime(2023, 12, 31)).strftime('%Y-%m-%d %H:%M:%S')
        quantity = fake.random_int(1, 10)
        total_amount = round(random.uniform(10, 500), 2)
        
        data.append([
            customer_id, customer_name, customer_email,
            product_id, product_name, category,
            purchase_id, purchase_date, quantity, total_amount
        ])

# Shuffle the data
random.shuffle(data)

# Open a CSV file in write mode
with open('customer_purchase_data_1.csv', mode='w', newline='') as file:
    # Create a CSV writer object
    writer = csv.writer(file)
    
    # Write the header row
    writer.writerow([
        'CustomerID', 'CustomerName', 'CustomerEmail',
        'ProductID', 'ProductName', 'Category',
        'PurchaseID', 'PurchaseDate', 'Quantity', 'TotalAmount'
    ])
    
    # Write the shuffled data to the CSV file
    for row in data:
        writer.writerow(row)

print("CSV file created successfully!")


CSV file created successfully!
