In [1]:
import pandas as pd
# Load the raw revenue-forecasting dataset from the working directory.
df = pd.read_csv('dummy_revenue_forecasting_data.csv')

# Preview the first rows to sanity-check how the columns were parsed.
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Per-column count of missing values.
print(df.isnull().sum())


         date product_category region agent_id marketing_spend  lead_count  \
0  29-03-2025         Cosmetic  North       A4           21000       170.0   
1  29-03-2025         Logistic  South       A5           14000        60.0   
2  29-03-2025          Fintech   West       A5           31000        70.0   
3  30-03-2025         Cosmetic  North       A3           47000        70.0   
4  30-03-2025             FMCG   West       A9           19000        80.0   

  revenue  
0   75000  
1   43000  
2   89000  
3   86000  
4   74000  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94983 entries, 0 to 94982
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              94983 non-null  object 
 1   product_category  94983 non-null  object 
 2   region            94983 non-null  object 
 3   agent_id          92333 non-null  object 
 4   marketing_spend   94982 non-null  object 
 5   lead_count       

In [2]:
# Parse the day-month-year date strings (unparseable values become NaT and
# their rows are dropped), derive calendar features from the parsed date, and
# finally discard every row that still has a missing value in any column.
df = (
    df.assign(
        date=lambda frame: pd.to_datetime(
            frame['date'], format='%d-%m-%Y', errors='coerce'
        )
    )
    .dropna(subset=['date'])
    .assign(
        day=lambda frame: frame['date'].dt.day,
        month=lambda frame: frame['date'].dt.month,
        weekday=lambda frame: frame['date'].dt.weekday,
    )
    .dropna()
)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
df = pd.get_dummies(
    df,
    columns=['product_category', 'region', 'agent_id'],
    drop_first=True,
)

# Target is revenue; features are everything except the target and the raw date.
y = df['revenue']
X = df.drop(columns=['date', 'revenue'])

print("Data ready!")
print("X shape:", X.shape)
print("y shape:", y.shape)


Data ready!
X shape: (88309, 23)
y shape: (88309,)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Force features and target to numeric; values that cannot be parsed become NaN.
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Keep only rows where every feature and the target are present after coercion.
valid_indices = X.notna().all(axis=1) & y.notna()
X = X.loc[valid_indices]
y = y.loc[valid_indices]

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows; RMSE is the square root of the mean squared error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Model Training Complete!")
print("Mean Absolute Error (MAE):", round(mae, 2))
print("Root Mean Squared Error (RMSE):", round(rmse, 2))

Model Training Complete!
Mean Absolute Error (MAE): 15226.84
Root Mean Squared Error (RMSE): 17751.13


In [4]:
import joblib

# Serialize the fitted estimator to disk.
joblib.dump(model, 'model.pkl')

# Save the feature column names in training order alongside the model.
# This call stays last so its return value remains the cell's displayed output.
joblib.dump(list(X.columns), 'model_columns.pkl')


['model_columns.pkl']

In [None]:
import zipfile

# Bundle the serialized model and its column list into a single archive.
# ZipFile stores members uncompressed by default, so DEFLATE is requested
# explicitly to keep the download small.
with zipfile.ZipFile("model_files.zip", "w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write("model.pkl")
    z.write("model_columns.pkl")

# `files` is not imported in any cell visible here, so the bare
# files.download(...) call raised NameError on a fresh kernel. It is presumably
# Google Colab's browser-download helper; import it here and fall back to a
# message when the notebook runs outside Colab, where that module is absent.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; model_files.zip was written to the working directory.")
else:
    files.download("model_files.zip")
