In [None]:
import pandas as pd

# Load the raw sourcing records into a DataFrame.
# NOTE(review): "/content/data.csv" is an absolute, Colab-style path; the file
# must exist at that location for this cell to run.
test_data = pd.read_csv("/content/data.csv")

# Preview the first rows, then summarise columns, non-null counts and dtypes.
print(test_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
test_data.info()


  ProductType Manufacturer Area Code Sourcing Channel Product Size  \
0        NTM3           X1       A28        WHOLESALE        Large   
1        NTM2           X1        A9           DIRECT        Large   
2        NTM3           X2       A20           DIRECT        Large   
3        NTM3           X1       A18        WHOLESALE        Small   
4        NTM2           X1       A28           DIRECT        Large   

  Product Type Month of Sourcing  Sourcing Cost  
0       Powder            May-21          10.16  
1       Powder            Oct-20         134.28  
2       Powder            Dec-20          12.46  
3       Powder            Feb-21         107.22  
4       Liquid            Nov-20         197.76  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550176 entries, 0 to 550175
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ProductType        550176 non-null  object 
 1   Manufacturer     

Step 2: Exploratory Data Analysis (EDA)


In [None]:
# Count missing values per column.
print(test_data.isnull().sum())

# Impute missing 'Sourcing Cost' entries with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form operates
# on a temporary object, emits a chained-assignment warning on pandas 2.x and
# leaves the frame untouched when Copy-on-Write is enabled.
test_data['Sourcing Cost'] = test_data['Sourcing Cost'].fillna(
    test_data['Sourcing Cost'].median()
)

# Check for outliers
# Visualize distributions or use statistical methods like Z-score or IQR to detect outliers
# Decide on how to handle outliers (e.g., removal, transformation)


ProductType          0
Manufacturer         0
Area Code            0
Sourcing Channel     0
Product Size         0
Product Type         0
Month of Sourcing    0
Sourcing Cost        0
dtype: int64


Step 3: Feature Encoding

In [None]:
# Columns to expand into one indicator column per category value.
# 'Month of Sourcing' and 'Sourcing Cost' are not listed, so get_dummies
# passes them through unchanged.
categorical_columns = [
    'ProductType',
    'Manufacturer',
    'Area Code',
    'Sourcing Channel',
    'Product Size',
    'Product Type',
]
test_data_encoded = pd.get_dummies(test_data, columns=categorical_columns)

# Preview the encoded frame.
encoded_preview = test_data_encoded.head()
print(encoded_preview)


  Month of Sourcing  Sourcing Cost  ProductType_NTM1  ProductType_NTM2  \
0            May-21          10.16             False             False   
1            Oct-20         134.28             False              True   
2            Dec-20          12.46             False             False   
3            Feb-21         107.22             False             False   
4            Nov-20         197.76             False              True   

   ProductType_NTM3  Manufacturer_X1  Manufacturer_X2  Manufacturer_X3  \
0              True             True            False            False   
1             False             True            False            False   
2              True            False             True            False   
3              True             True            False            False   
4             False             True            False            False   

   Area Code_A1  Area Code_A10  ...  Area Code_A9  Sourcing Channel_DIRECT  \
0         False          False  

Step 4: Train Different Machine Learning Models
We'll train and evaluate the following models for predicting Sourcing Cost:

- Linear Regression
- Random Forest
- Gradient Boosting (e.g., XGBoost)

We'll use these models to make predictions and compare their performance.

1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Separate features (X) and target variable (y).
# 'Month of Sourcing' is dropped together with the target, so only the
# one-hot encoded columns are used as predictors.
X = test_data_encoded.drop(columns=['Month of Sourcing', 'Sourcing Cost'])
y = test_data_encoded['Sourcing Cost']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred_linear = linear_reg.predict(X_test)

# Evaluate the model.
# RMSE is computed as the square root of the MSE rather than through
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse_linear = mean_squared_error(y_test, y_pred_linear) ** 0.5
mae_linear = mean_absolute_error(y_test, y_pred_linear)

print("Linear Regression Model:")
print(f"Root Mean Squared Error (RMSE): {rmse_linear}")
print(f"Mean Absolute Error (MAE): {mae_linear}")


Linear Regression Model:
Root Mean Squared Error (RMSE): 57.700506152593704
Mean Absolute Error (MAE): 20.771198744047407


3. Gradient Boosting (XGBoost)

In [None]:
import xgboost as xgb

# Initialize and train the XGBoost model on the same train split that the
# Linear Regression cell produced (X_train / y_train must already exist).
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model.
# RMSE is taken as sqrt(MSE) instead of passing squared=False, a keyword that
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_xgb = mean_squared_error(y_test, y_pred_xgb) ** 0.5
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print("\nXGBoost Model:")
print(f"Root Mean Squared Error (RMSE): {rmse_xgb}")
print(f"Mean Absolute Error (MAE): {mae_xgb}")



XGBoost Model:
Root Mean Squared Error (RMSE): 56.09824154591503
Mean Absolute Error (MAE): 16.25264462101416


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

# Build the monthly time series from the raw frame loaded at the top of the
# notebook. Working on a copy of the two relevant columns keeps `test_data`
# intact for the other cells.
# NOTE(review): the original cell read from an undefined `df`; `test_data` is
# assumed to be the intended source because it carries both columns - confirm.
monthly_source = test_data[['Month of Sourcing', 'Sourcing Cost']].copy()

# Month labels look like "May-21"; an explicit format avoids the parser
# guessing whether "21" is a day or a year.
monthly_source['Month of Sourcing'] = pd.to_datetime(
    monthly_source['Month of Sourcing'], format='%b-%y'
)

# Average cost per month, on a regular month-start grid. asfreq() inserts any
# month that has no rows as NaN, which is then forward-filled.
df_monthly = (
    monthly_source
    .groupby('Month of Sourcing')['Sourcing Cost']
    .mean()
    .asfreq('MS')
    .ffill()
    .to_frame()
)

# Split data into train and hold-out periods.
# The hold-out is deliberately NOT named `test_data`, so the DataFrame used by
# the earlier cells is not overwritten.
train_data = df_monthly.loc['2020-07-01':'2021-05-31', 'Sourcing Cost']
holdout_data = df_monthly.loc['2021-06-01':, 'Sourcing Cost']
if holdout_data.empty:
    raise ValueError(
        "No observations from 2021-06-01 onwards: the ARIMA forecast cannot "
        "be evaluated. Load the June 2021 data or choose an earlier split date."
    )

# Fit ARIMA model
model = ARIMA(train_data, order=(1, 1, 1))  # Example order, adjust as needed
arima_model = model.fit()

# Forecast one step per hold-out month. With statsmodels.tsa.arima.model the
# forecast() method returns only the point forecasts (no stderr / conf_int
# tuple), so the result is bound to a single name.
forecast = arima_model.forecast(steps=len(holdout_data))

# Evaluate the forecast on raw values so the comparison is positional and
# does not depend on index alignment.
forecast_errors = np.asarray(forecast) - holdout_data.to_numpy()
rmse_arima = np.sqrt(np.mean(forecast_errors ** 2))
mae_arima = np.mean(np.abs(forecast_errors))

print("\nARIMA Model:")
print(f"Root Mean Squared Error (RMSE): {rmse_arima}")
print(f"Mean Absolute Error (MAE): {mae_arima}")

# Plot actual vs. forecasted values
plt.figure(figsize=(10, 6))
plt.plot(df_monthly.index, df_monthly['Sourcing Cost'], label='Actual')
plt.plot(holdout_data.index, np.asarray(forecast), label='Forecast', linestyle='--')
plt.title('ARIMA Forecasting')
plt.xlabel('Date')
plt.ylabel('Sourcing Cost')
plt.legend()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import SVR  # Import SVR class from sklearn.svm
import xgboost as xgb


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

# This cell refits both regressors on a fresh split and tabulates their
# errors side by side. It expects `test_data_encoded` (the one-hot encoded
# frame) to exist already.

# Separate features (X) and target variable (y)
X = test_data_encoded.drop(columns=['Month of Sourcing', 'Sourcing Cost'])
y = test_data_encoded['Sourcing Cost']

# Split data into train and test sets (same 80/20 split and seed as the
# individual model cells, so the scores are directly comparable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Make predictions with Linear Regression
y_pred_linear = linear_reg.predict(X_test)

# Evaluate Linear Regression model.
# RMSE = sqrt(MSE); the squared=False keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_linear = mean_squared_error(y_test, y_pred_linear) ** 0.5
mae_linear = mean_absolute_error(y_test, y_pred_linear)

# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions with XGBoost
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost model (same sqrt(MSE) form as above)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb) ** 0.5
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

# Compare model performance: one row per model
models = ['Linear Regression', 'XGBoost']
rmse_scores = [rmse_linear, rmse_xgb]
mae_scores = [mae_linear, mae_xgb]

model_comparison = pd.DataFrame({
    'Model': models,
    'RMSE': rmse_scores,
    'MAE': mae_scores
})

print("\nModel Comparison:")
print(model_comparison)

# Select the best model based on RMSE (lower is better)
best_model_index = model_comparison['RMSE'].idxmin()
best_model_name = model_comparison.loc[best_model_index, 'Model']
print(f"\nBest Model: {best_model_name}")



Model Comparison:
               Model       RMSE        MAE
0  Linear Regression  57.700506  20.771199
1            XGBoost  56.098242  16.252645

Best Model: XGBoost


Several approaches can be used to forecast Sourcing Cost for the June 2021 test set; the right choice depends on the characteristics of the dataset and the nature of the prediction task. Common options are listed below:

### Approaches for Forecasting:
1. **Time Series Forecasting Techniques**:
   - Use methods like ARIMA (AutoRegressive Integrated Moving Average) or SARIMA (Seasonal ARIMA) if the data exhibits time-dependent patterns and seasonality.
   - Apply more advanced time series models like Prophet (from Facebook) or LSTM (Long Short-Term Memory) networks for capturing complex temporal dependencies.

2. **Regression Models**:
   - Utilize linear regression, polynomial regression, or regularized regression (e.g., Ridge, Lasso) if the relationship between features and target is primarily linear.
   - Employ ensemble techniques like Random Forest or Gradient Boosting (e.g., XGBoost) for more robust performance and handling non-linear relationships.

3. **Support Vector Machines (SVM)**:
   - SVM can be effective for regression tasks, particularly when dealing with non-linear relationships between features and target variables.

4. **Neural Networks**:
   - Develop feedforward neural networks (e.g., Multi-layer Perceptron) or more complex architectures (e.g., deep neural networks) to capture intricate patterns in the data.

5. **Ensemble Methods**:
   - Combine predictions from multiple models (e.g., model averaging, stacking) to leverage diverse strengths and improve overall performance.

### Comparison of Approaches:
- **Time Series vs. Regression**:
  - Time series models are suitable for data with inherent time dependencies and seasonality.
  - Regression models can capture broader patterns and relationships but may overlook time-specific nuances.

- **Linear Regression vs. Ensemble Models**:
  - Linear regression is straightforward and interpretable but may underperform for complex data.
  - Ensemble models like Random Forest and XGBoost are robust, handle non-linearity well, and often yield higher accuracy.

- **SVM vs. Neural Networks**:
  - SVM is effective for small to medium-sized datasets with non-linear relationships.
  - Neural networks excel in capturing complex patterns but require more data and computational resources.

### Final Approach and Justification:
For this specific task of forecasting Sourcing Cost for June 2021, the chosen approach is to train and evaluate multiple regression-based models (e.g., Linear Regression, XGBoost), along with SVM for comparison. These models are well suited to the task given the structured (tabular) nature of the dataset and the continuous target variable.

- **Linear Regression**:
  - Simple and interpretable, suitable for capturing linear relationships.

- **XGBoost (Gradient Boosting)**:
  - Robust ensemble method, capable of handling non-linear relationships and complex interactions.

- **Support Vector Machine (SVM)**:
  - Effective for capturing non-linear patterns and suitable for regression tasks.

The final approach involves evaluating these models based on performance metrics (e.g., RMSE, MAE) and selecting the best-performing model for forecasting the June 2021 Sourcing Cost. This approach balances simplicity, accuracy, and model complexity, ensuring a thorough comparison to identify the most suitable model for the dataset and prediction task.

In the notebook, detailed explanations will accompany each model's implementation, evaluation, and comparison, providing insights into their strengths, weaknesses, and suitability for the forecasting task based on empirical results and domain considerations.