# Forcasting Sales

This analysis predicts a full year worth of sales for various learning modules from different branded stores in different countries. The task is predicting sales during for year 2022.

In [43]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [44]:
from sklearn.model_selection import train_test_split # Split the dataset into training and testing sets. This is a common step in ML to assess how well the model generalizes to new data. 
from sklearn.ensemble import RandomForestRegressor   # A machine learning model for regression tasks based on an ensemble of decision trees. 
from sklearn.metrics import mean_absolute_error      # Evaluate the performance of a regression model by calculating the mean absolute difference between predicted and true values.
from sklearn.preprocessing import LabelEncoder       # Encoding categorical labels with integer values. Prepare categorical data for machine learning algorithms. 

### Load the training and test datasets

In [45]:
# Load the training data
# Specify the path to the CSV file
csv_file_path = "C:/Users/18490/Downloads/Kaggle_Forecasting Mini-CourseSales/train.csv"

# Load the dataset into a pandas DataFrame
train_data = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame
print(train_data.head())

   id        date    country         store  \
0   0  2017-01-01  Argentina  Kaggle Learn   
1   1  2017-01-01  Argentina  Kaggle Learn   
2   2  2017-01-01  Argentina  Kaggle Learn   
3   3  2017-01-01  Argentina  Kaggle Learn   
4   4  2017-01-01  Argentina  Kaggle Learn   

                                          product  num_sold  
0               Using LLMs to Improve Your Coding        63  
1                   Using LLMs to Train More LLMs        66  
2  Using LLMs to Win Friends and Influence People         9  
3      Using LLMs to Win More Kaggle Competitions        59  
4                      Using LLMs to Write Better        49  


In [46]:
# Load the test data
test_data = pd.read_csv("C:/Users/18490/Downloads/Kaggle_Forecasting Mini-CourseSales/test.csv")

print(test_data.head())

       id        date    country         store  \
0  136950  2022-01-01  Argentina  Kaggle Learn   
1  136951  2022-01-01  Argentina  Kaggle Learn   
2  136952  2022-01-01  Argentina  Kaggle Learn   
3  136953  2022-01-01  Argentina  Kaggle Learn   
4  136954  2022-01-01  Argentina  Kaggle Learn   

                                          product  
0               Using LLMs to Improve Your Coding  
1                   Using LLMs to Train More LLMs  
2  Using LLMs to Win Friends and Influence People  
3      Using LLMs to Win More Kaggle Competitions  
4                      Using LLMs to Write Better  


In [47]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        136950 non-null  int64 
 1   date      136950 non-null  object
 2   country   136950 non-null  object
 3   store     136950 non-null  object
 4   product   136950 non-null  object
 5   num_sold  136950 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.3+ MB


RangeIndex: 136950 entries, 0 to 136949

In [48]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27375 entries, 0 to 27374
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       27375 non-null  int64 
 1   date     27375 non-null  object
 2   country  27375 non-null  object
 3   store    27375 non-null  object
 4   product  27375 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


  The columns of both DataFrames are the same. Therefore, two dataframe can be combined into 1 dataframe with a new index.

In [49]:
# Combine two dataframes into one
combined = pd.concat([train_data, test_data], ignore_index=True)

print(combined.head())

   id        date    country         store  \
0   0  2017-01-01  Argentina  Kaggle Learn   
1   1  2017-01-01  Argentina  Kaggle Learn   
2   2  2017-01-01  Argentina  Kaggle Learn   
3   3  2017-01-01  Argentina  Kaggle Learn   
4   4  2017-01-01  Argentina  Kaggle Learn   

                                          product  num_sold  
0               Using LLMs to Improve Your Coding      63.0  
1                   Using LLMs to Train More LLMs      66.0  
2  Using LLMs to Win Friends and Influence People       9.0  
3      Using LLMs to Win More Kaggle Competitions      59.0  
4                      Using LLMs to Write Better      49.0  


### Preprocessing

In [50]:
# Label encoding
le = LabelEncoder() # Encode categorical labels into numerical values. 

In [51]:
# Extracting information from the Date column
# Convert the Date column to a 'datetime' fomat
combined['date'] = pd.to_datetime(combined['date'])

In [72]:
# Extract year, month and day to be seperate columns
combined['year'] = combined['date'].dt.year.astype(int)
combined['month'] = combined['date'].dt.month.astype(int)
combined['day'] = combined['date'].dt.day.astype(int)
print(combined.head())

   id       date    country         store  \
0   0 2017-01-01  Argentina  Kaggle Learn   
1   1 2017-01-01  Argentina  Kaggle Learn   
2   2 2017-01-01  Argentina  Kaggle Learn   
3   3 2017-01-01  Argentina  Kaggle Learn   
4   4 2017-01-01  Argentina  Kaggle Learn   

                                          product  num_sold  year  month  day  
0               Using LLMs to Improve Your Coding      63.0  2017      1    1  
1                   Using LLMs to Train More LLMs      66.0  2017      1    1  
2  Using LLMs to Win Friends and Influence People       9.0  2017      1    1  
3      Using LLMs to Win More Kaggle Competitions      59.0  2017      1    1  
4                      Using LLMs to Write Better      49.0  2017      1    1  


In [73]:
# Remove Date column
combined.drop(columns = ['date'],inplace=True)
print(combined.head())

   id    country         store  \
0   0  Argentina  Kaggle Learn   
1   1  Argentina  Kaggle Learn   
2   2  Argentina  Kaggle Learn   
3   3  Argentina  Kaggle Learn   
4   4  Argentina  Kaggle Learn   

                                          product  num_sold  year  month  day  
0               Using LLMs to Improve Your Coding      63.0  2017      1    1  
1                   Using LLMs to Train More LLMs      66.0  2017      1    1  
2  Using LLMs to Win Friends and Influence People       9.0  2017      1    1  
3      Using LLMs to Win More Kaggle Competitions      59.0  2017      1    1  
4                      Using LLMs to Write Better      49.0  2017      1    1  


In [74]:
# Country, store and product- label encoding - convert categorical labels into numerical values. 
combined['country'] = le.fit_transform(combined['country'])# Transform the values, assign numerical labels to the unique values.
combined['store'] = le.fit_transform(combined['store'])
combined['product'] = le.fit_transform(combined['product'])

### Machine Learning

In [75]:
# Split the combined data back into training and test sets.
train_data = combined[:len(train_data)]
print(train_data.head())

   id  country  store  product  num_sold  year  month  day
0   0        0      1        0      63.0  2017      1    1
1   1        0      1        1      66.0  2017      1    1
2   2        0      1        2       9.0  2017      1    1
3   3        0      1        3      59.0  2017      1    1
4   4        0      1        4      49.0  2017      1    1


In [76]:
test_data = combined[:len(test_data)]
print(test_data.head())

   id  country  store  product  num_sold  year  month  day
0   0        0      1        0      63.0  2017      1    1
1   1        0      1        1      66.0  2017      1    1
2   2        0      1        2       9.0  2017      1    1
3   3        0      1        3      59.0  2017      1    1
4   4        0      1        4      49.0  2017      1    1


In [77]:
# Split the training data into features ('X') and the target variable ('y')
# 'num_sold' is variable
# others in the df are features

# Create a new df X for features, dropping 'num_sold'

X = train_data.drop(columns=['num_sold'])

print(X.head())

   id  country  store  product  year  month  day
0   0        0      1        0  2017      1    1
1   1        0      1        1  2017      1    1
2   2        0      1        2  2017      1    1
3   3        0      1        3  2017      1    1
4   4        0      1        4  2017      1    1


In [78]:
# Create a new df y for the target variable, including only 'num_sold'

y = train_data['num_sold']

print(y.head())

0    63.0
1    66.0
2     9.0
3    59.0
4    49.0
Name: num_sold, dtype: float64


In [79]:
# Split the data into training and validation sets

X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.25,random_state=136950) 
# X_train: features for training
# X_val: features for validation
# y_train: target variables for training
# y_val: target variables for validation

# Display the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

X_train shape: (102712, 7)
X_val shape: (34238, 7)
y_train shape: (102712,)
y_val shape: (34238,)


In [62]:
# Train a random forest regressor model.
# Create a Random Forest Regressor model with 100 trees.

model = RandomForestRegressor(n_estimators=100, random_state=136950)

# Fit the model to the traning data.

model.fit(X_train, y_train)

# After running this code, the model variable will contain the trained Random Forest Regressor.

#### Use the trained model (model) to make predictions on a validation dataset 

In [63]:
# Use 'model.predict'(X_val) to obtain predictions for the validation set (X_val). 

y_pred = model.predict(X_val)


In [64]:
# Evaluate the model's performance using appropriate metrics, such as mean absolute error or mean squared error, on both the training and validation sets.
# Calculate mean absolute error.

mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 12.192227057655236


On average, this model's predictions have an absolute error of around 12.19 units.

In [80]:
# Display the columns in test_data
print("num_sold", test_data.columns)

num_sold Index(['id', 'country', 'store', 'product', 'num_sold', 'year', 'month',
       'day'],
      dtype='object')


In [81]:
# Use 'model.predict'to make predictions on the test set

test_predictions = model.predict(test_data.drop(columns=['num_sold']))

In [83]:
# Create a submission df
submission = pd.DataFrame({'id': test_data['id'], 'num_sold': test_predictions})

print(submission.head())

   id  num_sold
0   0     61.27
1   1     62.67
2   2      8.09
3   3     43.67
4   4     43.77


In [None]:
# Save the submission DataFrame to a CSV file
submission.to_csv('/kaggle/working/submission.csv', index=False)