<!-- ## Ironhack Kaggle 

### Machine Learning - Predict Sales for Various Stores

### Libraries  -->

# Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Loading the dataset

<!-- ### Loading the dataset -->

In [None]:
# Training data: the sales CSV published in the bootcamp's GitHub repository.
SALES_CSV_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/sales.csv"
df = pd.read_csv(SALES_CSV_URL)
# Preview the first ten rows.
df.head(n=10)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

<!-- ### Preprocessing -->

In [None]:
# Parse the raw "Date" column once, then derive calendar features from it.
parsed_dates = pd.to_datetime(df["Date"])
df["Date"] = parsed_dates

# Insertion order below fixes the order of the new columns in the frame.
calendar_parts = {
    "Month": parsed_dates.dt.month,
    "Year": parsed_dates.dt.year,
    "Day": parsed_dates.dt.day,
    "WeekOfYear": parsed_dates.dt.isocalendar().week,  # ISO-8601 week number
    "Weekday": parsed_dates.dt.weekday,  # 0 = Monday ... 6 = Sunday
}
for column_name, column_values in calendar_parts.items():
    df[column_name] = column_values

In [None]:
# sns.pairplot(df, hue="Sales")
# plt.show()

<!-- Now I am going to use dummies -->

In [None]:
# One-hot encode the categorical holiday flag (drop_first=True omits the first
# level so the dummy columns are not redundant), then remove the raw "Date"
# column from the resulting frame.
df = pd.get_dummies(df, columns=["State_holiday"], drop_first=True).drop(columns=["Date"])

<!-- Splitting Features and Target -->

In [None]:
# "Sales" is the value to predict; every remaining column is a model input.
target = df["Sales"]
features = df.drop("Sales", axis="columns")

## Train-Test Split

<!-- ### Train-Test Split
Now we split the data into train and test sets, reserving 20% of the rows for testing. -->

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

# Model Selection
I am trying out different models to see which one predicts best.

## Normalize Features

In [None]:
# Learn each feature's min/max from the training split only, then apply that
# same mapping to both splits so the test rows never influence the scaling.
scaler = MinMaxScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Model Training & Evaluation

In [43]:
# Candidate regressors, all trained on the same min-max-scaled split so their
# scores are directly comparable. n_jobs=-1 spreads fitting/prediction over all
# CPU cores for the estimators that accept it (GradientBoostingRegressor has no
# n_jobs parameter), which keeps a full re-run of this cell feasible.
models = {
    "RandomForest": RandomForestRegressor(random_state=0, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=0),
    "KNN": KNeighborsRegressor(n_neighbors=10, n_jobs=-1),
}

# One record per model; turned into a DataFrame in a single step afterwards.
results = []

for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # same units as the target, easier to interpret than MSE
    r2 = r2_score(y_test, y_pred)
    results.append({"Model": name, "MSE": mse, "RMSE": rmse, "R2": r2})

# Lowest RMSE (best model) first; the bare expression gives the rich table display.
results_df = pd.DataFrame(results).sort_values(by="RMSE")
results_df

KeyboardInterrupt: 

### Reviewing With KNN Regressor

In [None]:
# KNN is distance-based, so features with a large numeric range would dominate
# the neighbour search if left unscaled. Bundling MinMaxScaler with the
# regressor means the scaler is fitted inside knnR.fit(...) and re-applied
# inside knnR.predict(...) / knnR.score(...), so the raw splits can be passed in.
knnR = make_pipeline(MinMaxScaler(), KNeighborsRegressor(n_neighbors=10))

In [None]:
# Train knnR on the training split.
knnR.fit(x_train, y_train)

In [None]:
# Evaluate knnR on the held-out test split (for scikit-learn regressors,
# .score returns the R^2 coefficient of determination).
knnR.score(x_test, y_test)

In [None]:
# Peek at the first rows of the training features.
x_train.head()

In [None]:
# Peek at the first rows of the training target.
y_train.head()

<!-- ### Random Forest Model -->

In [None]:
# Using Random Forest to start with a simple model, fitted on the unscaled
# training split. n_jobs=-1 builds the trees on all CPU cores; random_state
# keeps the forest reproducible.
model = RandomForestRegressor(random_state=0, n_jobs=-1)
model.fit(x_train, y_train)

In [None]:
# Score `model` on the held-out test split.
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly; this works on every version.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Comparison of the Models

In [None]:
# results = pd.DataFrame({
#     "Model": ["RandomForest", "KNN Regressor"],
#     "MSE": [rf_mse, knn_mse],
#     "RMSE": [rf_rmse, knn_rmse],
#     "R2": [rf_r2, knn_r2]
# })
# display(results.sort_values("RMSE"))

<!-- ### Testing Model -->

# Testing the Model

In [None]:
# Hold-out data for final predictions; the file name ("notarget") suggests it
# ships without the Sales column - TODO confirm against the loaded columns.
TESTING_CSV_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/ironkaggle_notarget.csv"
testing_df = pd.read_csv(TESTING_CSV_URL)

In [None]:
# Preview the first ten rows (as done for the training frame) instead of
# rendering the whole DataFrame.
testing_df.head(10)

In [None]:
# testing_df["Date"] = pd.to_datetime(testing_df["Date"])
# testing_df["Month"] = testing_df["Date"].dt.month
# testing_df = pd.get_dummies(testing_df, columns=["State_holiday"], drop_first=True)

In [None]:
# features1 = testing_df.drop(columns=["Sales", "Date"])
# target1 = testing_df["Sales"]