In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [None]:
# Load the data
# Each CSV is read from the current working directory into its own DataFrame;
# the read order matches the order of the names on the left-hand side.
(
    data_oil,
    data_holidays,
    data_stores,
    data_train,
    data_transactions,
    data_test,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'oil.csv',
        'holidays_events.csv',
        'stores.csv',
        'train.csv',
        'transactions.csv',
        'test.csv',
    )
)

In [None]:
# Create columns for year, month, day, and day_of_week
def _add_date_parts(frame):
    """Parse `frame['date']` and add calendar columns to `frame` in place.

    Frames without a 'date' column are left untouched. The added columns are
    'year', 'month', 'day', 'day_of_week' (Monday == 0) and 'day_name'
    (full weekday name from strftime('%A')).
    """
    if 'date' not in frame.columns:
        return
    stamps = pd.to_datetime(frame['date'])
    frame['date'] = stamps
    frame['year'] = stamps.dt.year
    frame['month'] = stamps.dt.month
    frame['day'] = stamps.dt.day
    frame['day_of_week'] = stamps.dt.dayofweek
    frame['day_name'] = stamps.dt.strftime('%A')


for data in (data_oil, data_holidays, data_stores, data_train, data_transactions, data_test):
    _add_date_parts(data)


In [None]:
# Merge datasets
# Calendar columns that every dated table carries after the date-feature cell.
DATE_KEYS = ['date', 'day', 'month', 'year', 'day_of_week', 'day_name']


def merge_auxiliary_tables(sales):
    """Left-join oil prices, holidays, store metadata and transactions onto `sales`.

    Parameters
    ----------
    sales : pandas.DataFrame
        Train or test rows; must already carry 'store_nbr' and the DATE_KEYS
        columns produced by the date-feature cell.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly one output row per input row.

    Raises
    ------
    AssertionError
        If any join multiplied rows, i.e. a lookup table is not unique on its key.
    """
    # A left merge repeats a sales row once per matching right-hand row. The
    # holidays table may list several events for one date, so keep a single
    # event per date to stop sales rows from being duplicated.
    # NOTE(review): only the first listed event per date survives -- confirm
    # that is acceptable for the holiday features.
    holidays_per_date = (
        data_holidays
        .rename(columns={'type': 'holiday_type'})
        .drop_duplicates(subset='date', keep='first')
    )

    merged = (
        sales
        .merge(data_oil[['date', 'dcoilwtico']], on='date', how='left')
        .merge(holidays_per_date, on=DATE_KEYS, how='left')
        .merge(data_stores.rename(columns={'type': 'store_type'}), on='store_nbr', how='left')
        .merge(
            data_transactions,
            on=['date', 'store_nbr', 'day', 'month', 'year', 'day_of_week', 'day_name'],
            how='left',
        )
    )

    assert len(merged) == len(sales), (
        f"merge changed the row count from {len(sales)} to {len(merged)}; "
        "a lookup table has duplicate keys"
    )
    return merged


data_train = merge_auxiliary_tables(data_train)
data_test = merge_auxiliary_tables(data_test)


Model Building and Training

In [None]:
# Feature selection and encoding
features = ['store_nbr', 'family', 'onpromotion', 'year', 'month', 'day', 'day_of_week', 'day_name', 'dcoilwtico', 'holiday_type', 'locale', 'locale_name', 'description', 'transferred', 'city', 'state', 'store_type', 'cluster', 'transactions']
# One-hot encode the non-numeric features; drop_first removes one level per
# encoded column, which then acts as the implicit baseline.
data_train_encoded = pd.get_dummies(data_train[features], drop_first=True)

target = 'sales'

# Order rows oldest -> newest (stable, so same-day rows keep their relative
# order). A shuffled split would put later days in the training set and
# earlier days in the hold-out, letting the model see the future it is scored on.
chronological_order = np.argsort(data_train['date'].to_numpy(), kind='stable')
X = data_train_encoded.iloc[chronological_order]
y = data_train[target].iloc[chronological_order]

# Split the data: with shuffle=False the first 80% of rows (earliest dates)
# train the model and the final 20% (latest dates) form the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Initialize and train the XGBoost regressor
xgb_model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the held-out (most recent) rows
y_pred = xgb_model.predict(X_test)



In [None]:
# Calculate metrics
# Score the hold-out predictions; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Report every metric rounded to two decimals, one per line.
metric_report = (
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
    ("Mean Absolute Error", mae),
    ("R^2 Score", r2),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.2f}")


In [None]:
# Plot Feature Importance
# importance_type='weight' ranks features by how many times they are used to
# split across the trees; only the ten highest-ranked features are drawn.
importance_ax = xgb.plot_importance(
    xgb_model,
    importance_type='weight',
    max_num_features=10,
)
importance_ax.set_title('Top 10 Feature Importances')
plt.show()


In [None]:
# Plot Actual vs. Predicted Sales
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.3)

# Dashed red diagonal: points on this line are predicted exactly right.
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--', lw=2)

ax.set_xlabel('Actual Sales')
ax.set_ylabel('Predicted Sales')
ax.set_title('Actual vs Predicted Sales')

# Fixed axis limits focus the view on the relevant range; adjust as needed.
ax.set_xlim(-1000, 35000)
ax.set_ylim(-1000, 21000)

# Save before showing so the written file contains the rendered figure.
fig.savefig('sales_prediction_graph.png')

plt.show()

In [None]:
# Plot Error Distribution
# Residual = actual - predicted, so positive values are under-predictions.
errors = y_test - y_pred

fig, ax = plt.subplots()
sns.histplot(errors, kde=True, color='blue', ax=ax)
ax.set_title('Error Distribution (Residuals)')
ax.set_xlabel('Error')
ax.set_ylabel('Frequency')

# Fixed axis limits focus the view on the relevant range; adjust as needed.
ax.set_xlim(-15000, 20000)
ax.set_ylim(0, 400)

# Save before showing so the written file contains the rendered figure.
fig.savefig('Error_Distribution_(Residuals)_graph.png')

plt.show()


In [None]:
# Forecast the next month with a model refit on the full training history.
#
# Depends on the imports cell and on the merged `data_train` / `data_test`
# frames from the "Merge datasets" cell. The CSVs are deliberately not re-read
# here: the date parts and joined columns named in `features` are created by
# the earlier cells, so a freshly loaded train.csv cannot be indexed with them.

FORECAST_SEED = 42             # seeds every simulated draw below
FORECAST_START = '2024-12-01'  # first simulated day
FORECAST_DAYS = 30             # forecast horizon in days
rng = np.random.default_rng(FORECAST_SEED)

# Feature selection and encoding for the data
features = ['store_nbr', 'family', 'onpromotion', 'year', 'month', 'day', 'day_of_week', 'day_name',
            'dcoilwtico', 'holiday_type', 'locale', 'locale_name', 'description', 'transferred',
            'city', 'state', 'store_type', 'cluster', 'transactions']

missing_features = [col for col in features if col not in data_train.columns]
if missing_features:
    raise KeyError(
        f"data_train is missing {missing_features}; run the date-feature and merge cells first"
    )

# Process train data (all rows: no hold-out is kept for the final forecaster)
data_train_encoded = pd.get_dummies(data_train[features], drop_first=True)
target = 'sales'
X_train = data_train_encoded
y_train = data_train[target]

# Train XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Features that get_dummies expanded are exactly those that no longer appear
# under their own name in the encoded training matrix.
categorical_features = [col for col in features if col not in X_train.columns]


def _value_pool(column, distinct=True, keep_missing=True):
    """Return the training values of `column` that simulated rows are drawn from.

    distinct=True draws uniformly over the distinct values; distinct=False keeps
    every occurrence, so frequent values are drawn more often. The result is a
    plain NumPy array, which is sampled by position (sampling a pandas Series
    after dropna() would look values up by index label and can hit a removed one).
    """
    values = data_train[column] if keep_missing else data_train[column].dropna()
    pool = np.asarray(values.unique() if distinct else values)
    # Nothing observed at all: fall back to a single "missing" value.
    return pool if len(pool) else np.array([np.nan])


# Computed once rather than per simulated row.
value_pools = {
    'store_nbr': _value_pool('store_nbr'),
    'family': _value_pool('family'),
    'onpromotion': np.array([0, 1]),
    'dcoilwtico': _value_pool('dcoilwtico', distinct=False, keep_missing=False),
    'locale': _value_pool('locale'),
    'locale_name': _value_pool('locale_name'),
    'description': _value_pool('description'),
    # Drawn from the observed labels so the one-hot column names line up with
    # the ones the model was trained on.
    'transferred': _value_pool('transferred', keep_missing=False),
    'city': _value_pool('city'),
    'state': _value_pool('state'),
    'store_type': _value_pool('store_type'),
    'cluster': _value_pool('cluster'),
    'transactions': _value_pool('transactions', distinct=False, keep_missing=False),
}


def simulate_future_rows(dates, store_nbr=None, family=None):
    """Build one simulated feature row per entry of `dates`.

    Calendar columns are derived from the date; all other inputs are random
    draws from `value_pools`, and 'holiday_type' is left empty (no holiday
    effect assumed). Pass `store_nbr` and/or `family` to pin those columns
    instead of sampling them.

    NOTE(review): store attributes (city, state, store_type, cluster) are drawn
    independently of store_nbr, as in the original simulation -- confirm whether
    they should instead be looked up for the chosen store.
    """
    frame = pd.DataFrame({'date': dates})
    n_rows = len(frame)

    # Same accessors as the date-feature cell, so dtypes match training.
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month
    frame['day'] = frame['date'].dt.day
    frame['day_of_week'] = frame['date'].dt.dayofweek
    frame['day_name'] = frame['date'].dt.strftime('%A')

    for column, pool in value_pools.items():
        frame[column] = rng.choice(pool, size=n_rows)
    if store_nbr is not None:
        frame['store_nbr'] = store_nbr
    if family is not None:
        frame['family'] = family
    frame['holiday_type'] = None  # Assuming no holiday effect
    return frame


def encode_like_training(frame):
    """One-hot encode `frame` and align it to the columns the model was fit on.

    Encoding a small future frame on its own yields a different column set from
    training. Here every level is encoded, then the result is reindexed to
    `X_train.columns`: unseen-in-future levels are filled with 0 and columns the
    model never saw (including the baseline levels removed by drop_first during
    training) are discarded.
    """
    encoded = pd.get_dummies(frame[features], columns=categorical_features)
    return encoded.reindex(columns=X_train.columns, fill_value=0)


# Forecasting for the next month (next 30 days) on simulated inputs
future_dates = pd.date_range(start=FORECAST_START, periods=FORECAST_DAYS, freq='D')
future_df = simulate_future_rows(future_dates)
future_df_encoded = encode_like_training(future_df)

# Predict sales for the next month
sales_forecast = xgb_model.predict(future_df_encoded)

# Show forecast results
future_df['forecasted_sales'] = sales_forecast
print(future_df[['date', 'store_nbr', 'family', 'forecasted_sales']])

# Step 2: Select a random store and forecast the top 10 products
# Randomly select a store
random_store = rng.choice(data_test['store_nbr'].unique())

# Filter data for this store
store_data = data_train[data_train['store_nbr'] == random_store]

# Find the top 10 most frequent product families in this store
top_10_products = store_data['family'].value_counts().nlargest(10).index.tolist()

# Prepare future data for the top 10 products for the selected store:
# one block of FORECAST_DAYS rows per product, stacked in product order.
future_top_10_df = pd.concat(
    [
        simulate_future_rows(future_dates, store_nbr=random_store, family=product)
        for product in top_10_products
    ],
    ignore_index=True,
)
future_top_10_df_encoded = encode_like_training(future_top_10_df)

# Predict sales for the top 10 products for the next 30 days
top_10_sales_forecast = xgb_model.predict(future_top_10_df_encoded)

# Add the forecasted sales to the DataFrame
future_top_10_df['forecasted_sales'] = top_10_sales_forecast

# Show the forecasted sales for top 10 products
print(future_top_10_df[['date', 'store_nbr', 'family', 'forecasted_sales']])
