In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

num_samples = 5000000  

turnover = 5000

average_price = np.random.uniform(500, 3000, num_samples)  
advertising_expenditure = np.random.uniform(1000000000, 4000000000, num_samples)  
website_traffic = np.random.randint(2000, 10000, num_samples) 
customer_retention = np.random.uniform(0.5, 1, num_samples) 
average_order_value = np.random.uniform(1000, 6000, num_samples)  
seasonality = np.random.uniform(0.7, 0.9, num_samples) 
regular_customer_value = np.random.uniform(20000, 100000, num_samples)

months = np.arange(1, num_samples + 1)
seasonal_demand_factor = 0.5 * np.sin(2 * np.pi * months / 12) + 1

discount_offer = np.random.choice([0, 1], size=num_samples, p=[0.8, 0.2])

net_revenue = (
    (average_price * average_order_value * website_traffic * (customer_retention / 100)) -
    (advertising_expenditure * 0.2) +
    (regular_customer_value * 0.1) +
    (seasonality * 1000)
) * turnover

data = pd.DataFrame({
    'Average_Price': average_price,
    'Advertising_Expenditure': advertising_expenditure,
    'Website_Traffic': website_traffic,
    'Customer_Retention': customer_retention,
    'Average_Order_Value': average_order_value,
    'Seasonality': seasonality,
    'Regular_Customer_Value': regular_customer_value,
    'Seasonal_Demand_Factor': seasonal_demand_factor,
    'Discount_Offer': discount_offer,
    'Total_Revenue': total_revenue,
    'Net_Revenue': net_revenue  
})

data.to_csv('ecommerce_sales_data.csv', index=False)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

data = pd.read_csv('ecommerce_sales_data.csv')

features = data[['Average_Price', 'Advertising_Expenditure', 'Website_Traffic', 
                 'Customer_Retention', 'Average_Order_Value', 'Seasonality', 
                 'Regular_Customer_Value', 'Seasonal_Demand_Factor', 'Discount_Offer']]
target = data['Net_Revenue']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = make_pipeline(
    PolynomialFeatures(degree=3, interaction_only=True, include_bias=False),
    StandardScaler(),
    LinearRegression()
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

R-squared: 0.9997489294232896
Mean Absolute Error: 9785030993.04751
Root Mean Squared Error: 17384961525.844482




In [1]:
import pickle
import pandas as pd

c

input_data = pd.read_csv('input_data.csv')

predicted_net_revenue = loaded_model.predict(input_data)

input_data['Predicted_Net_Revenue'] = predicted_net_revenue

print(input_data)

NameError: name 'model' is not defined