## **Step - 1**
# **Data Pre-Proprocessing**
---
<p>-> In the first step, we are going to prepare a dataset</p>
<p> -> After collecting a dataset we should make it usable for training</p>
<p> -> This was sub-divided into steps</p>



In [4]:
import pandas as pd

data = pd.read_csv('data.csv')

if 'SUM(SALES_VALUE_TOT)' in data.columns:
    data.drop(columns=['SUM(SALES_VALUE_TOT)'], inplace=True)
data.to_csv('modified_dataset.csv', index=False)


In [6]:
import pandas as pd
df = pd.read_csv('modified_dataset.csv')
df['YEAR_MONTH'] = pd.to_datetime(df['YEAR_MONTH'])
df['YEAR_MONTH'] = df['YEAR_MONTH'].dt.strftime('%B')
print(df)
df.to_csv('modified_dataset_month.csv', index=False)


       Unnamed: 0 YEAR_MONTH    MATERIAL_CODE  SUM(A.SALES_QTY_TOT)
0             NaN        May  451100036400700                380.65
1             NaN        May  171612565130200               3251.36
2             NaN        May  471307508130200               1151.56
3             NaN        May  171409090410100                188.10
4             NaN        May        611100716               1178.28
...           ...        ...              ...                   ...
19100         NaN    October  505006615700668                342.00
19101         NaN       June        614330200                 28.90
19102         NaN       June  171407777410300                 31.65
19103         NaN   February        614100600                105.90
19104         NaN   February  451200020800305                 58.92

[19105 rows x 4 columns]


In [7]:
import pandas as pd

df = pd.read_csv('modified_dataset_month.csv')
df = df.iloc[:, 1:]
df.to_csv('modified_dataset_shifted.csv', index=False)


In [8]:
import pandas as pd
df = pd.read_csv('modified_dataset_shifted.csv')
df['MATERIAL_CODE'] = df['MATERIAL_CODE'].sort_values()
df.reset_index(drop=True, inplace=True)
df.to_csv('sorted_dataset.csv', index=False)


## **Step - 2**
# **Training**
---

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Load your dataset
df = pd.read_csv('sorted_dataset.csv')

# Encode the categorical columns (YEAR_MONTH and MATERIAL_CODE)
label_encoder_month = LabelEncoder()
label_encoder_material = LabelEncoder()

df['YEAR_MONTH'] = label_encoder_month.fit_transform(df['YEAR_MONTH'])
df['MATERIAL_CODE'] = label_encoder_material.fit_transform(df['MATERIAL_CODE'])

# Split the dataset into training and testing sets
X = df[['YEAR_MONTH', 'MATERIAL_CODE']]
y = df['SUM(A.SALES_QTY_TOT)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train an XGBoost regressor model
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Now, you can use this trained model to make predictions for new data.
# You can also save the model for future use using joblib or pickle.


Mean Squared Error: 29163749.513326317


In [29]:
from sklearn.metrics import mean_absolute_error, r2_score

# Calculate additional evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")


Mean Squared Error (MSE): 29163749.513326317
Mean Absolute Error (MAE): 1230.8926465112654
R-squared (R2) Score: 0.6621809247453083


In [30]:
import joblib

# Save the trained model to a file
model_filename = 'xgboost_regression_model.pkl'
joblib.dump(model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to xgboost_regression_model.pkl


In [33]:
# Example code to create and save label encoders for YEAR_MONTH and MATERIAL_CODE
from sklearn.preprocessing import LabelEncoder
import joblib

# Load your dataset
df = pd.read_csv('sorted_dataset.csv')

# Create label encoders for YEAR_MONTH and MATERIAL_CODE
label_encoder_month = LabelEncoder()
label_encoder_material = LabelEncoder()

label_encoder_month.fit(df['YEAR_MONTH'])
label_encoder_material.fit(df['MATERIAL_CODE'])

# Save label encoders to files
joblib.dump(label_encoder_month, 'label_encoder_month.pkl')
joblib.dump(label_encoder_material, 'label_encoder_material.pkl')


['label_encoder_material.pkl']

In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the saved model
model_filename = 'xgboost_regression_model.pkl'
loaded_model = joblib.load(model_filename)

# Prepare your data for prediction
# Example data for prediction (you can replace this with your own data)
data_to_predict = pd.DataFrame({
    'YEAR_MONTH': ['January', 'February'],
    'MATERIAL_CODE': ['451100036400700', '171612565130200']
})

# Encode the categorical columns (YEAR_MONTH and MATERIAL_CODE)
# Use the same label encoders you used during training
data_to_predict['YEAR_MONTH'] = label_encoder_month.transform(data_to_predict['YEAR_MONTH'])
data_to_predict['MATERIAL_CODE'] = label_encoder_material.transform(data_to_predict['MATERIAL_CODE'])

# Make predictions using the loaded model
predictions = loaded_model.predict(data_to_predict)

# Print the predictions
print("Predicted SUM(A.SALES_QTY_TOT):")
for prediction in predictions:
    print(prediction)


Predicted SUM(A.SALES_QTY_TOT):
274.04794
2556.5442
