In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
# Read the raw sales records from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="sales.csv")

In [59]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() only adds a stray "None"
# line after the report.
data.info()

# Summary statistics for the numeric columns
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Unnamed: 0           640840 non-null  int64 
 1   store_ID             640840 non-null  int64 
 2   day_of_week          640840 non-null  int64 
 3   date                 640840 non-null  object
 4   nb_customers_on_day  640840 non-null  int64 
 5   open                 640840 non-null  int64 
 6   promotion            640840 non-null  int64 
 7   state_holiday        640840 non-null  object
 8   school_holiday       640840 non-null  int64 
 9   sales                640840 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 48.9+ MB
None
          Unnamed: 0       store_ID    day_of_week  nb_customers_on_day  \
count  640840.000000  640840.000000  640840.000000        640840.000000   
mean   355990.675084     558.211348       4.000189           633.398577   
std    2

In [60]:
# Tally the null entries of every column, then show the tally
null_counts = data.isnull().sum()
print("Missing values per column:")
print(null_counts)

Missing values per column:
Unnamed: 0             0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64


In [61]:
# Feature engineering: parse the date strings once, store the parsed values
# back in 'date', and derive a calendar-month feature from them.
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates
data['month'] = parsed_dates.dt.month

In [62]:
# Re-count missing values now that the 'month' column has been added
# (this cell only reports the counts; it does not modify the data)
null_counts_after_features = data.isnull().sum()
print("Missing values per column:")
print(null_counts_after_features)

Missing values per column:
Unnamed: 0             0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
month                  0
dtype: int64


# Convert 'state_holiday' to numeric


In [63]:
# Cast state_holiday to str, then one-hot encode it. drop_first=True omits the
# indicator column of the first category, leaving one column per remaining one.
data = data.astype({'state_holiday': str})
data = pd.get_dummies(
    data,
    columns=['state_holiday'],
    drop_first=True,
)

# Preview the encoded state_holiday columns for validation

In [64]:
# Pick every column whose name contains 'state_holiday' and show its first rows
holiday_dummies = data.filter(like='state_holiday')
print("State holiday columns after encoding:")
print(holiday_dummies.head())

State holiday columns after encoding:
   state_holiday_a  state_holiday_b  state_holiday_c
0            False            False            False
1            False            False            False
2            False            False            False
3            False            False            False
4            False            False            False


# Handle categorical variables in 'day_of_week', 'open', 'promotion', 'school_holiday'


In [65]:
# Store the day-of-week and the open/promotion/school-holiday columns as integers.
# NOTE(review): the data.info() output printed after loading lists these
# columns as int64 already, so this cast looks like a no-op — confirm whether
# a real categorical encoding was intended here.
for int_col in ('day_of_week', 'open', 'promotion', 'school_holiday'):
    data[int_col] = data[int_col].astype(int)

In [66]:
# First and third quartiles of sales, taken in a single quantile call,
# and the interquartile range between them
Q1, Q3 = data['sales'].quantile([0.25, 0.75])
IQR = Q3 - Q1

In [67]:
# Outlier fences sit 1.5 IQRs below the first and above the third quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose sales fall inside the fences (both ends inclusive)
sales_within_fences = data['sales'].between(lower_bound, upper_bound)
data = data[sales_within_fences]
print(f"Data after outlier removal: {data.shape}")

Data after outlier removal: (624019, 13)


In [68]:
# The raw 'date' column is not used for modelling, so remove it
# (the derived 'month' column stays in the frame)
data = data.drop('date', axis=1)

In [69]:
# Remove the store identifier column from the frame
data = data.drop('store_ID', axis=1)

In [70]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() only adds a stray "None"
# line after the report.
data.info()

# First rows of the cleaned frame
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 624019 entries, 0 to 640839
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   Unnamed: 0           624019 non-null  int64
 1   day_of_week          624019 non-null  int64
 2   nb_customers_on_day  624019 non-null  int64
 3   open                 624019 non-null  int64
 4   promotion            624019 non-null  int64
 5   school_holiday       624019 non-null  int64
 6   sales                624019 non-null  int64
 7   month                624019 non-null  int32
 8   state_holiday_a      624019 non-null  bool 
 9   state_holiday_b      624019 non-null  bool 
 10  state_holiday_c      624019 non-null  bool 
dtypes: bool(3), int32(1), int64(7)
memory usage: 42.3 MB
None
   Unnamed: 0  day_of_week  nb_customers_on_day  open  promotion  \
0      425390            4                  517     1          0   
1      291687            6                  694     1     

Standard Scaler

In [71]:
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the regression target.
# NOTE(review): 'Unnamed: 0' is still a column of `data` at this point and so
# becomes a model feature — confirm that is intended.
X = data.drop(columns=['sales'])  # every remaining column is used as a feature
y = data['sales']  # target variable

# Standardise the feature columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics take part in the scaling — consider fitting it on the
# training portion only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled array back into a DataFrame for readability. Passing
# index=X.index keeps the row labels identical to y's: without it X would get
# a fresh 0..n-1 index while y keeps the gapped labels left behind by the
# outlier filter, and any label-aligned pandas operation between X and y
# would silently pair up the wrong rows.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)


In [72]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition repeatable between runs
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")


Training set: (499215, 10), Testing set: (124804, 10)


Linear Regression

In [73]:
from sklearn.linear_model import LinearRegression

# Build the linear model and fit it on the training split in one step
# (fit() returns the fitted estimator itself)
model = LinearRegression().fit(X_train, y_train)

# R² of the fitted model measured on the same data it was trained on
train_score = model.score(X_train, y_train)
print(f"Training R² Score: {train_score}")


Training R² Score: 0.8489953533023333


In [74]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions of the current model for both partitions
y_pred_train, y_pred_test = (model.predict(part) for part in (X_train, X_test))

# Squared-error and absolute-error metrics, grouped by metric
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

# Coefficient of determination on the held-out partition
test_r2 = r2_score(y_test, y_pred_test)

print(f"Training MSE: {train_mse}, MAE: {train_mae}")
print(f"Testing MSE: {test_mse}, MAE: {test_mae}")
print(f"Testing R² Score: {test_r2}")


Training MSE: 1714211.432181141, MAE: 913.8206646812179
Testing MSE: 1704344.1091103111, MAE: 911.7079006864774
Testing R² Score: 0.8501012681170763


In [75]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 1000 boosting rounds at a learning rate of 0.05.
# Note that this rebinds `model`, replacing the linear regression fitted earlier.
xgb_settings = {'n_estimators': 1000, 'learning_rate': 0.05}
model = XGBRegressor(**xgb_settings)
model.fit(X_train, y_train)

# Predict on both partitions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Compute R² for each partition
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"R² (Train): {r2_train}")
print(f"R² (Test): {r2_test}")

R² (Train): 0.8900870863315279
R² (Test): 0.8860851496855391


In [76]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Create the Gradient Boosting Regressor model
gb_settings = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}
gb_regressor = GradientBoostingRegressor(**gb_settings)

# Fit the model on the training split
gb_regressor.fit(X_train, y_train)

# Predict on the held-out split and evaluate
y_pred = gb_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R2 Score: {r2}")
print(f"Mean Squared Error: {mse}")


R2 Score: 0.8841517139616388
Mean Squared Error: 1317191.555791273
