In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the data
store = pd.read_csv('store.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv', low_memory=False)

In [2]:
# Merge the DataFrames on the 'Store' column
trainStore = pd.merge(train, store, on='Store')
trainStore.to_csv('trainStore.csv', index=False)

In [3]:
trainStore = trainStore.dropna()

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
trainStore_encoded_label = trainStore.copy()

for column in trainStore_encoded_label.columns:
    if trainStore_encoded_label[column].dtype == 'object':
        trainStore_encoded_label[column] = label_encoder.fit_transform(trainStore_encoded_label[column])



In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd


# Split the dataframe into input features (X) and target variable (y)
X = trainStore_encoded_label.drop('Sales', axis=1)
y = trainStore_encoded_label['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the KNN model
knn = KNeighborsRegressor()

# Fit the KNN model on the training data
knn.fit(X_train, y_train)

# Predict the sales using the trained KNN model on the testing data
y_pred_knn = knn.predict(X_test)

# Calculate KNN model metrics
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Define the parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5]
}

# Create an instance of the Gradient Boosting model
gradient_boost = GradientBoostingRegressor()

# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(gradient_boost, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Use the best parameters to create an instance of the Gradient Boosting model
best_gradient_boost = GradientBoostingRegressor(**best_params)

# Fit the Gradient Boosting model on the training data
best_gradient_boost.fit(X_train, y_train)

# Predict the sales using the trained Gradient Boosting model on the testing data
y_pred_gb = best_gradient_boost.predict(X_test)

# Calculate Gradient Boosting model metrics
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Print KNN model metrics
print("KNN Model Metrics:")
print("Mean Squared Error (MSE):", mse_knn)
print("R-squared (R2):", r2_knn)

# Print Gradient Boosting model metrics
print("\nGradient Boosting Model Metrics:")
print("Mean Squared Error (MSE):", mse_gb)
print("R-squared (R2):", r2_gb)


KNN Model Metrics:
Mean Squared Error (MSE): 426792.36811765796
R-squared (R2): 0.9650369559667998

Gradient Boosting Model Metrics:
Mean Squared Error (MSE): 157082.49578983596
R-squared (R2): 0.9871317234622364


In [6]:
#apply the best params to the model and predict the sales
best_gradient_boost = GradientBoostingRegressor(**best_params)
best_gradient_boost.fit(X_train, y_train)
y_pred = best_gradient_boost.predict(X_test)
accuracy = best_gradient_boost.score(X_test, y_test)
print("Accuracy:", accuracy)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)



Accuracy: 0.9871731308771833
Mean Squared Error (MSE): 156577.03726437106
R-squared (R2): 0.9871731308771833


In [11]:
#predict the sales in the test data
testStore = pd.merge(test, store, on='Store')
testStore = testStore.dropna()
testStore_encoded_label = testStore.copy()
for column in testStore_encoded_label.columns:
    if testStore_encoded_label[column].dtype == 'object':
        testStore_encoded_label[column] = label_encoder.fit_transform(testStore_encoded_label[column])

X_test = testStore_encoded_label
y_pred = best_gradient_boost.predict(X_test)
testStore['Sales'] = y_pred
testStore[['Id', 'Sales']].to_csv('submission.csv', index=False)

# Plot the feature importance
feature_importance = best_gradient_boost.feature_importances_
feature_names = X.columns
sorted_idx = np.argsort(feature_importance)

plt.figure(figsize=(10, 8))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Gradient Boosting Feature Importance')
plt.show()

# Plot the actual vs predicted sales
plt.figure(figsize=(10, 8))
plt.scatter(y_test, y_pred)
plt.plot([0, 1], [0, 1], '--k')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs Predicted Sales')
plt.show()

    

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Id
Feature names seen at fit time, yet now missing:
- Customers
