In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the data
store = pd.read_csv('store.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv', low_memory=False)

In [2]:
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [3]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


In [4]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [5]:
# Merge the DataFrames on the 'Store' column
trainStore = pd.merge(train, store, on='Store')
trainStore.to_csv('trainStore.csv', index=False)

In [6]:
trainStore = trainStore.dropna()
trainStore.head()


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
942,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
943,2,4,2015-07-30,5567,601,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
944,2,3,2015-07-29,6402,727,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
945,2,2,2015-07-28,5671,646,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
946,2,1,2015-07-27,6627,638,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"


In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
trainStore_encoded_label = trainStore.copy()

for column in trainStore_encoded_label.columns:
    if trainStore_encoded_label[column].dtype == 'object':
        trainStore_encoded_label[column] = label_encoder.fit_transform(trainStore_encoded_label[column])

trainStore_encoded_label.head()


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
942,2,5,941,6064,625,1,1,0,1,0,0,570.0,11.0,2007.0,1,13.0,2010.0,1
943,2,4,940,5567,601,1,1,0,1,0,0,570.0,11.0,2007.0,1,13.0,2010.0,1
944,2,3,939,6402,727,1,1,0,1,0,0,570.0,11.0,2007.0,1,13.0,2010.0,1
945,2,2,938,5671,646,1,1,0,1,0,0,570.0,11.0,2007.0,1,13.0,2010.0,1
946,2,1,937,6627,638,1,1,0,1,0,0,570.0,11.0,2007.0,1,13.0,2010.0,1


In [8]:
trainStore_encoded_label.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split the dataframe into input features (X) and target variable (y)
X = trainStore_encoded_label.drop('Sales', axis=1)
y = trainStore_encoded_label['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the KNN model
knn = KNeighborsRegressor()

# Fit the model on the training data
knn.fit(X_train, y_train)

# Predict the sales using the trained model on the testing data
y_pred = knn.predict(X_test)

accuracy = knn.score(X_test, y_test)
print("Accuracy:", accuracy)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)



Accuracy: 0.9650369559667998
Mean Squared Error (MSE): 426792.36811765796


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split the dataframe into input features (X) and target variable (y)
X = trainStore_encoded_label.drop('Sales', axis=1)
y = trainStore_encoded_label['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the KNN model
knn = KNeighborsRegressor()

# Fit the KNN model on the training data
knn.fit(X_train, y_train)

# Predict the sales using the trained KNN model on the testing data
y_pred_knn = knn.predict(X_test)

# Calculate KNN model metrics
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Create an instance of the Gradient Boosting model
gradient_boost = GradientBoostingRegressor()

# Fit the Gradient Boosting model on the training data
gradient_boost.fit(X_train, y_train)

# Predict the sales using the trained Gradient Boosting model on the testing data
y_pred_gb = gradient_boost.predict(X_test)

# Calculate Gradient Boosting model metrics
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Print KNN model metrics
print("KNN Model Metrics:")
print("Mean Squared Error (MSE):", mse_knn)
print("R-squared (R2):", r2_knn)

# Print Gradient Boosting model metrics
print("\nGradient Boosting Model Metrics:")
print("Mean Squared Error (MSE):", mse_gb)
print("R-squared (R2):", r2_gb)


KNN Model Metrics:
Mean Squared Error (MSE): 426792.36811765796
R-squared (R2): 0.9650369559667998

Gradient Boosting Model Metrics:
Mean Squared Error (MSE): 668945.7941485643
R-squared (R2): 0.9451996263199518


In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd


# Split the dataframe into input features (X) and target variable (y)
X = trainStore_encoded_label.drop('Sales', axis=1)
y = trainStore_encoded_label['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the KNN model
knn = KNeighborsRegressor()

# Fit the KNN model on the training data
knn.fit(X_train, y_train)

# Predict the sales using the trained KNN model on the testing data
y_pred_knn = knn.predict(X_test)

# Calculate KNN model metrics
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Define the parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5]
}

# Create an instance of the Gradient Boosting model
gradient_boost = GradientBoostingRegressor()

# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(gradient_boost, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Use the best parameters to create an instance of the Gradient Boosting model
best_gradient_boost = GradientBoostingRegressor(**best_params)

# Fit the Gradient Boosting model on the training data
best_gradient_boost.fit(X_train, y_train)

# Predict the sales using the trained Gradient Boosting model on the testing data
y_pred_gb = best_gradient_boost.predict(X_test)

# Calculate Gradient Boosting model metrics
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Print KNN model metrics
print("KNN Model Metrics:")
print("Mean Squared Error (MSE):", mse_knn)
print("R-squared (R2):", r2_knn)

# Print Gradient Boosting model metrics
print("\nGradient Boosting Model Metrics:")
print("Mean Squared Error (MSE):", mse_gb)
print("R-squared (R2):", r2_gb)


KNN Model Metrics:
Mean Squared Error (MSE): 426792.36811765796
R-squared (R2): 0.9650369559667998

Gradient Boosting Model Metrics:
Mean Squared Error (MSE): 156661.61182238068
R-squared (R2): 0.9871662024871354


In [12]:
#apply the best params to the model and predict the sales
best_gradient_boost = GradientBoostingRegressor(**best_params)
best_gradient_boost.fit(X_train, y_train)
y_pred = best_gradient_boost.predict(X_test)
accuracy = best_gradient_boost.score(X_test, y_test)
print("Accuracy:", accuracy)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)



Accuracy: 0.9871442649520855
Mean Squared Error (MSE): 156929.4023650378
R-squared (R2): 0.9871442649520855


In [13]:
#create a random forest model and get best params using gridsearch
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Split the dataframe into input features (X) and target variable (y)
X = trainStore_encoded_label.drop('Sales', axis=1)
y = trainStore_encoded_label['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}

# Create an instance of the Random Forest model
random_forest = RandomForestRegressor()

# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(random_forest, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Use the best parameters to create an instance of the Random Forest model
best_random_forest = RandomForestRegressor(**best_params)

# Fit the Random Forest model on the training data
best_random_forest.fit(X_train, y_train)

# Predict the sales using the trained Random Forest model on the testing data
y_pred_rf = best_random_forest.predict(X_test)

# Calculate Random Forest model metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print Random Forest model metrics
print("Random Forest Model Metrics:")
print("Mean Squared Error (MSE):", mse_rf)
print("R-squared (R2):", r2_rf)




Random Forest Model Metrics:
Mean Squared Error (MSE): 1440714.3001445038
R-squared (R2): 0.881975964712361


In [14]:
#make a decision tree model and get best params using gridsearch
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Split the dataframe into input features (X) and target variable (y)
X = trainStore_encoded_label.drop('Sales', axis=1)
y = trainStore_encoded_label['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for Decision Tree
param_grid = {
    'max_depth': [3, 4, 5]
}

# Create an instance of the Decision Tree model
decision_tree = DecisionTreeRegressor()

# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Use the best parameters to create an instance of the Decision Tree model
best_decision_tree = DecisionTreeRegressor(**best_params)

# Fit the Decision Tree model on the training data
best_decision_tree.fit(X_train, y_train)

# Predict the sales using the trained Decision Tree model on the testing data
y_pred_dt = best_decision_tree.predict(X_test)

# Calculate Decision Tree model metrics
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Print Decision Tree model metrics
print("Decision Tree Model Metrics:")
print("Mean Squared Error (MSE):", mse_dt)
print("R-squared (R2):", r2_dt)


Decision Tree Model Metrics:
Mean Squared Error (MSE): 1528179.6760459435
R-squared (R2): 0.8748107574184493
