In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [18]:
# Candidate regressors keyed by a human-readable display name.
# Both tree ensembles are built with 100 estimators and a fixed seed of 42.
# NOTE(review): the training cell further down rebinds `models`, so on a
# top-to-bottom run this definition is superseded before it is used.
models = dict([
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("Linear Regression", LinearRegression()),
])

In [2]:
import pickle
import pandas as pd

def _add_invoice_date_parts(features):
    """Return a new frame with 'InvoiceDate' replaced by year/month/day columns.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame containing an 'InvoiceDate' column (datetime-like or
        parseable by ``pd.to_datetime``).

    Returns
    -------
    pandas.DataFrame
        All columns of ``features`` except 'InvoiceDate', followed by
        'InvoiceYear', 'InvoiceMonth' and 'InvoiceDay'. The input frame is
        not modified.

    Notes
    -----
    Parsing uses ``errors='coerce'``: values that cannot be parsed become NaT,
    and the derived year/month/day are then NaN for those rows.
    """
    invoice_dates = pd.to_datetime(features['InvoiceDate'], errors='coerce')

    # `drop` returns a new frame, so the assignments below never touch the
    # caller's object and the raw date column is gone before modelling.
    expanded = features.drop(columns=['InvoiceDate'])
    expanded['InvoiceYear'] = invoice_dates.dt.year
    expanded['InvoiceMonth'] = invoice_dates.dt.month
    expanded['InvoiceDay'] = invoice_dates.dt.day
    return expanded


# Load the datasets from the saved file.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load 'train_test_data.pkl' if it was produced by a trusted step.
with open('train_test_data.pkl', 'rb') as f:
    X_train, y_train, X_test, y_test = pickle.load(f)

# Apply the identical date-feature extraction to both splits through one
# helper so the train and test columns cannot drift apart.
X_train = _add_invoice_date_parts(X_train)
X_test = _add_invoice_date_parts(X_test)

# Verify data types after conversion
print("X_train data types after conversion:")
print(X_train.dtypes)

# Check for NaN values (coerced, unparseable dates would show up here).
# NOTE(review): only the training split is inspected; X_test is not checked.
print("Missing values in features:")
print(X_train.isnull().sum())  # Per-column count of missing feature values
print("Missing values in target:")
print(y_train.isnull().sum())  # Count of missing target values

X_train data types after conversion:
CustomerID           int64
Quantity           float64
UnitPrice          float64
StockCode_10080       bool
StockCode_10120       bool
                    ...   
StockCode_PADS        bool
StockCode_POST        bool
InvoiceYear          int32
InvoiceMonth         int32
InvoiceDay           int32
Length: 3205, dtype: object
Missing values in features:
CustomerID         0
Quantity           0
UnitPrice          0
StockCode_10080    0
StockCode_10120    0
                  ..
StockCode_PADS     0
StockCode_POST     0
InvoiceYear        0
InvoiceMonth       0
InvoiceDay         0
Length: 3205, dtype: int64
Missing values in target:
0


In [3]:
# Define models.
# Both estimators are stochastic (bootstrap / feature sub-sampling), so a fixed
# random_state is required for the metrics below to be reproducible across
# kernel restarts. 42 matches the seed used by the other `models` definition
# in this notebook.
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Train and evaluate models: fit each one on the training split, predict the
# held-out test split, and report error metrics.
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5  # RMSE is the square root of MSE (same units as the target)
    r2 = r2_score(y_test, y_pred)

    # Print evaluation metrics
    print(f"{model_name} - MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R^2: {r2:.4f}\n")

Training Random Forest...
Random Forest - MAE: 437.7381, MSE: 201864.3919, RMSE: 449.2932, R^2: 0.0008

Training Gradient Boosting...
Gradient Boosting - MAE: 438.3029, MSE: 201833.3010, RMSE: 449.2586, R^2: 0.0009



In [4]:
# Print summary statistics (count, mean, std, min, quartiles, max) of the
# training target to inspect its distribution.
# NOTE(review): in the recorded run the 75th percentile equals the max (999),
# which may indicate a capped or sentinel value in the target — confirm.
print(y_train.describe())


count    105166.000000
mean        453.159253
std         449.118386
min           0.000000
25%          65.000000
50%         148.000000
75%         999.000000
max         999.000000
Name: Target, dtype: float64
