In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
# Load the preprocessed CSV tables, one DataFrame per file.
# The target names and the file paths are listed in the same order.
csv_paths = (
    'preprocessed_OrderList.csv',
    'preprocessed_FreightRates.csv',
    'preprocessed_WhCosts.csv',
    'preprocessed_WhCapacities.csv',
    'preprocessed_ProductsPerPlant.csv',
    'preprocessed_VmiCustomers.csv',
    'preprocessed_PlantPorts.csv',
)
(
    order_list,
    freight_rates,
    wh_costs,
    wh_capacities,
    products_per_plant,
    vmi_customers,
    plant_ports,
) = map(pd.read_csv, csv_paths)

In [3]:
# Pick the predictor columns (X) and the target column (Y) out of the order list.
feature_columns = ['Ship ahead day count', 'Ship Late Day count', 'Unit quantity', 'Weight']
target_columns = ['Plant Code']

indep_X = order_list[feature_columns]
dep_Y = order_list[target_columns]  # list selector, so this stays a one-column DataFrame


In [4]:
# Turn the plant-code labels into integer class ids.
# The one-column target frame is flattened to 1-D first, as LabelEncoder expects.
plant_codes = dep_Y.values.ravel()
label_encoder = LabelEncoder()
dep_Y_encoded = label_encoder.fit_transform(plant_codes)


In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    indep_X,
    dep_Y_encoded,
    test_size=0.25,
    random_state=0,
)

In [6]:
# Standardise the numeric features. The scaler learns its statistics from the
# training split only and then applies them unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def rfeFeature(X_train_scaled, y_train, n, X_eval=None, y_eval=None, feature_names=None):
    """Run recursive feature elimination (RFE) with four regressors and score each.

    For each of LinearRegression, DecisionTreeRegressor, RandomForestRegressor
    and XGBRegressor, an RFE selector is fitted on the training data, the
    training matrix is reduced to the selected columns, and the selector's
    final estimator (already fitted on exactly those columns) is scored on the
    evaluation data.

    Parameters
    ----------
    X_train_scaled : array-like with a ``shape`` attribute, (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,)
        Training target.
    n : int, float or None
        Forwarded to ``RFE(n_features_to_select=n)``.
    X_eval, y_eval : array-like, optional
        Data used to compute the score. When omitted, the notebook-level
        ``X_test_scaled`` / ``y_test`` are used, which is what this function
        always did before these parameters existed.
    feature_names : sequence of str, optional
        Names of the columns of ``X_train_scaled``, in order. When omitted,
        the notebook-level ``X_train.columns`` is used.

    Returns
    -------
    rfelist : list of ndarray
        The training matrix reduced to the selected columns, one per model.
    colnames_list : list of list of str
        Names of the selected columns, one list per model.
    r2_values : list of float
        Value of the estimator's ``score`` on the evaluation data, one per model.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of columns in
        ``X_train_scaled``.

    Notes
    -----
    NOTE(review): in this notebook ``y_train`` holds label-encoded plant
    codes, so these regressors treat the integer class ids as ordinal
    values. Confirm that regression / R2 is the intended setup.
    """
    import warnings

    # Fall back to the notebook globals so existing three-argument calls
    # behave exactly as before.
    if X_eval is None:
        X_eval = X_test_scaled
    if y_eval is None:
        y_eval = y_test
    if feature_names is None:
        feature_names = X_train.columns
    feature_names = list(feature_names)

    n_available = X_train_scaled.shape[1]
    if len(feature_names) != n_available:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but "
            f"X_train_scaled has {n_available} columns"
        )
    if n is not None and n >= n_available:
        # Nothing can be eliminated in this case, so every model ends up
        # "selecting" all columns; make that visible instead of silent.
        warnings.warn(
            f"n={n} is not smaller than the {n_available} available features; "
            "RFE will keep every feature.",
            stacklevel=2,
        )

    rfelist = []
    colnames_list = []
    r2_values = []

    models = [
        LinearRegression(),
        DecisionTreeRegressor(random_state=0),
        RandomForestRegressor(n_estimators=10, random_state=0),
        XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)
    ]

    for model in models:
        selector = RFE(estimator=model, n_features_to_select=n)
        rfelist.append(selector.fit_transform(X_train_scaled, y_train))

        # Map the boolean support mask back to column names.
        colnames_list.append(
            [name for name, keep in zip(feature_names, selector.support_) if keep]
        )

        # RFE has already fitted a clone of the estimator on the selected
        # columns; score() reduces X_eval to those columns and delegates to
        # it, so no manual refit is needed.
        r2_values.append(selector.score(X_eval, y_eval))

    return rfelist, colnames_list, r2_values

In [7]:
# Run RFE for every candidate model on the scaled training data.
# NOTE(review): only four feature columns are selected earlier in this notebook,
# so n=5 leaves RFE nothing to eliminate (every model reports all four columns
# in the output below). Choose n < 4 to actually rank the features.
rfelist, colnames_list, r2_values = rfeFeature(X_train_scaled, y_train, n=5)


In [8]:
# Report, per model, which columns RFE kept and the resulting R2 score.
# The labels follow the order in which rfeFeature builds its model list.
model_labels = ["Linear", "Decision", "Random", "XGBoost"]
report_rows = zip(model_labels, colnames_list, r2_values)

for model_name, selected_columns, r2_value in report_rows:
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

Model: Linear
Selected Columns: ['Ship ahead day count', 'Ship Late Day count', 'Unit quantity', 'Weight']
R2 Value: 0.08251729791238316

Model: Decision
Selected Columns: ['Ship ahead day count', 'Ship Late Day count', 'Unit quantity', 'Weight']
R2 Value: 0.2033757461986977

Model: Random
Selected Columns: ['Ship ahead day count', 'Ship Late Day count', 'Unit quantity', 'Weight']
R2 Value: 0.5704262041604267

Model: XGBoost
Selected Columns: ['Ship ahead day count', 'Ship Late Day count', 'Unit quantity', 'Weight']
R2 Value: 0.5586863548363743

