<h1 style="color: brown;">1. Importing Libraries and Configuring Display Options</h1>

In [1]:
# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
from sklearn.model_selection import cross_validate
from itertools import product

## Customizing DataFrame Output Formatting
# Render floats with three decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.3f}'.format)
# Do not cap the number of columns shown when a frame is displayed.
pd.set_option('display.max_columns', None)

<h1 style="color: brown;">2. Exploring Dataset</h1>

<h2 style="color: purple;">2.1 Loading Store Dataset</h2>

In [2]:
# Read the store dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='Store_CA.csv')

<h2 style="color: purple;">2.2 Displaying Dataset Dimensions</h2> 

In [3]:
# Report the dataset dimensions: df.shape is (row count, column count).
print(f'The data has {df.shape[0]} rows and {df.shape[1]} columns.')

The data has 1650 rows and 12 columns.


<h2 style="color: purple;">2.3 Displaying All Column Names</h2>

In [4]:
# Column labels of df, in their original order
all_column_names = df.columns

# Header line, then one "<position>. <label>" line per column (positions start at 1)
print("All Columns:")
for idx, col in zip(range(1, len(all_column_names) + 1), all_column_names):
    print(f"{idx}. {col}")


All Columns:
1. ProductVariety
2. MarketingSpend
3. CustomerFootfall
4. StoreSize
5. EmployeeEfficiency
6. StoreAge
7. CompetitorDistance
8. PromotionsCount
9. EconomicIndicator
10. StoreLocation
11. StoreCategory
12. MonthlySalesRevenue


<h2 style="color: purple;">2.4 Identify Numeric and Non-Numeric Features</h2>  

In [5]:
# Split the column labels by dtype: numeric vs. everything else
non_numeric_column_names = df.select_dtypes(exclude='number').columns
numeric_column_names = df.select_dtypes(include='number').columns

# Group the labels under a display heading
columns_dict = dict([
    ("Numeric Columns", list(numeric_column_names)),
    ("Non-Numeric Columns", list(non_numeric_column_names)),
])

# For each group, print a numbered list with the distinct-value count of every column
for category in columns_dict:
    columns = columns_dict[category]
    print(f"{category}:")
    for idx, col in zip(range(1, len(columns) + 1), columns):
        unique_values_count = df[col].nunique()  # number of distinct non-null values
        print(f"  {idx}. {col} (Unique Values: {unique_values_count})")
    print()  # blank separator line after each group


Numeric Columns:
  1. ProductVariety (Unique Values: 574)
  2. MarketingSpend (Unique Values: 46)
  3. CustomerFootfall (Unique Values: 1100)
  4. StoreSize (Unique Values: 444)
  5. EmployeeEfficiency (Unique Values: 437)
  6. StoreAge (Unique Values: 30)
  7. CompetitorDistance (Unique Values: 20)
  8. PromotionsCount (Unique Values: 6)
  9. EconomicIndicator (Unique Values: 554)
  10. MonthlySalesRevenue (Unique Values: 1589)

Non-Numeric Columns:
  1. StoreLocation (Unique Values: 4)
  2. StoreCategory (Unique Values: 3)



<h2 style="color: purple;">2.5 Displaying First 5 Rows of the Dataset</h2>  

In [6]:
# Preview the first five rows (rendered as the cell's rich output).
df.head(n=5)

Unnamed: 0,ProductVariety,MarketingSpend,CustomerFootfall,StoreSize,EmployeeEfficiency,StoreAge,CompetitorDistance,PromotionsCount,EconomicIndicator,StoreLocation,StoreCategory,MonthlySalesRevenue
0,581,29,1723,186,84.9,1,12,6,108.3,Los Angeles,Electronics,284.9
1,382,31,1218,427,75.8,18,11,6,97.8,Los Angeles,Electronics,308.21
2,449,35,2654,142,92.8,14,11,6,101.1,Los Angeles,Grocery,292.11
3,666,9,2591,159,66.3,11,11,4,115.1,Sacramento,Clothing,279.61
4,657,35,2151,275,89.1,28,12,7,93.4,Palo Alto,Electronics,359.71


<h2 style="color: purple;">2.6 Checking for Missing Data</h2>  

In [7]:
# Count missing cells: isnull() flags each cell, the first sum() totals per column,
# the second sum() adds the per-column totals into one number.
print(f'Total number of missing values: {df.isnull().sum().sum()}')

Total number of missing values: 0


<h2 style="color: purple;">2.7 Checking for Duplicate Data</h2> 

In [8]:
# duplicated() marks rows that repeat an earlier row across all columns; sum() counts them.
print(f'There are {df.duplicated().sum()} duplicate rows in the dataset.')

There are 0 duplicate rows in the dataset.


<h2 style="color: purple;">2.8 Summary Statistics for Numeric Columns</h2> 

In [9]:
# Descriptive statistics for the numeric columns.
# The three-decimal float format is already set once in the setup cell at the top
# of the notebook, so it is not configured again here.
numeric_columns = df.select_dtypes(include='number')
numeric_columns.describe()

Unnamed: 0,ProductVariety,MarketingSpend,CustomerFootfall,StoreSize,EmployeeEfficiency,StoreAge,CompetitorDistance,PromotionsCount,EconomicIndicator,MonthlySalesRevenue
count,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0
mean,500.916,27.46,2020.028,272.998,72.199,15.448,9.934,5.501,99.762,299.253
std,148.054,13.015,484.026,131.049,13.02,8.668,3.117,1.388,14.615,65.544
min,100.0,5.0,500.0,50.0,50.0,1.0,1.0,3.0,60.0,106.71
25%,396.0,17.0,1691.0,156.0,61.0,8.0,8.0,4.0,90.0,254.125
50%,500.5,27.0,2021.0,273.0,72.1,16.0,10.0,5.0,100.3,297.44
75%,602.75,38.0,2350.75,385.5,83.1,23.0,12.0,7.0,109.6,344.222
max,1092.0,50.0,3627.0,500.0,94.9,30.0,20.0,8.0,140.0,534.26


<h2 style="color: purple;">2.9 Summary Statistics for Categorical Columns</h2> 

In [10]:
# Announce the table, then summarise the non-numeric columns.
print("Non-Numeric Columns Summary:")
non_numeric_columns = df.select_dtypes(exclude='number')
non_numeric_columns.describe(include='all')

Non-Numeric Columns Summary:


Unnamed: 0,StoreLocation,StoreCategory
count,1650,1650
unique,4,3
top,Los Angeles,Grocery
freq,417,587


<h2 style="color: purple;">2.10 Displaying Frequency of Categorical Data</h2> 

In [11]:
# Frequency table for every non-numeric column
for col in non_numeric_column_names:
    # Occurrences of each distinct value in this column
    value_counts = df[col].value_counts()

    # Heading, numbered "<value>: <count>" lines, then a 50-character rule
    print(f"\nValue counts for {col}:")
    for idx, (value, count) in enumerate(value_counts.items(), 1):
        print(f"{idx}. {value}: {count}")
    print("-" * 50)



Value counts for StoreLocation:
1. Los Angeles: 417
2. San Francisco: 414
3. Sacramento: 412
4. Palo Alto: 407
--------------------------------------------------

Value counts for StoreCategory:
1. Grocery: 587
2. Electronics: 535
3. Clothing: 528
--------------------------------------------------


<h1 style="color: brown;">3. Preprocessing </h1> 

<h2 style="color: purple;">3.1 One-Hot Encoding Categorical Columns</h2> 

In [12]:
# Replace the two categorical columns with integer (0/1) indicator columns.
df_encoded = pd.get_dummies(
    data=df,
    columns=['StoreLocation', 'StoreCategory'],
    dtype=int,
)

<h1 style="color: brown;">4. Development and Assessment of Models</h1> 

<h2 style="color: purple;">4.1 Splitting Features and Target Variable</h2>

In [13]:
# Target is the monthly sales revenue; features are every other encoded column.
y = df_encoded['MonthlySalesRevenue']
X = df_encoded.drop('MonthlySalesRevenue', axis=1)

<h2 style="color: purple;">4.2 Splitting Data into Training and Test Sets</h2> 

In [14]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

<h2 style="color: purple;">4.3 Scaling Features using StandardScaler</h2>

In [15]:
# Learn the scaling statistics from the training features only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h1 style="color: brown;">5. Evaluating Models and Measuring Performance</h1>  

<h2 style="color: purple;">5.1 Finding the Best Model</h2> 

In [16]:
# Candidate regressors with default hyperparameters; the tree and ensemble
# models are seeded with random_state=42.
model_list = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    SVR(),
    GaussianProcessRegressor(),
    KNeighborsRegressor(),
]

results = []

# Fit every candidate on the scaled training data and score it on the scaled test data.
# NOTE(review): the ranking below uses the same test split that the final
# evaluation cell reports on, so the final test metrics of the selected model
# may be slightly optimistic. Consider ranking on cross-validated training
# scores instead.
for model in model_list:
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    r2 = r2_score(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mse)  # RMSE derived from the MSE computed above
    results.append((model.__class__.__name__, r2, mae, mse, rmse))

# Display results, best R2 first (pandas is already imported in the setup cell)
df_results = pd.DataFrame(results, columns=["Model", "R2", "MAE", "MSE", "RMSE"])
df_results = df_results.sort_values(by="R2", ascending=False)
print(df_results)

                       Model     R2     MAE       MSE    RMSE
0           LinearRegression  0.808  24.881   832.532  28.854
4          AdaBoostRegressor  0.785  25.752   930.746  30.508
3  GradientBoostingRegressor  0.784  25.786   935.094  30.579
2      RandomForestRegressor  0.779  25.992   958.801  30.965
1      DecisionTreeRegressor  0.550  35.515  1948.580  44.143
7        KNeighborsRegressor  0.513  36.970  2108.984  45.924
5                        SVR  0.345  42.281  2838.803  53.280
6   GaussianProcessRegressor -8.773 185.969 42354.027 205.801


In [17]:
# df_results is sorted by R2 in descending order, so its first row is the top-ranked model.
best_model_name = df_results["Model"].iloc[0]
print(f"Best model based on R2: {best_model_name}")

Best model based on R2: LinearRegression


<h2 style="color: purple;">5.2 Manual Grid Search for Best Hyperparameters</h2>  

In [18]:
# LinearRegression, r2_score, train_test_split, np and product are all imported
# in the setup cell at the top of the notebook; they are not re-imported here.

# Hold out 20% of the (already scaled) training data as a validation set for
# comparing hyperparameter combinations; the test split is not used in this cell.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False]  # Only available in newer versions of scikit-learn (>=0.24)
}

# Start below any attainable score so the first combination always becomes the incumbent.
best_score = -np.inf
best_params = None

# Evaluate every combination in the grid and keep the one with the highest validation R2.
for combo in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    model = LinearRegression(**params)
    model.fit(X_subtrain, y_subtrain)
    preds = model.predict(X_val)
    score = r2_score(y_val, preds)
    if score > best_score:
        best_score = score
        best_params = params

print("Best hyperparameters for LinearRegression:", best_params)

Best hyperparameters for LinearRegression: {'fit_intercept': True, 'positive': False}


<h2 style="color: purple;">5.3 Cross-Validation Evaluation</h2>   

In [20]:
# cross_validate, LinearRegression and np are imported in the setup cell at the
# top of the notebook; they are not re-imported here.

# Build the final model from `best_params` as selected by the manual grid search
# cell, rather than from a hand-copied dictionary that could drift out of sync
# with the search result.
final_model = LinearRegression(**best_params)

# Scorers to collect; the error metrics are exposed by scikit-learn as negated
# values ("neg_*"), so they are sign-flipped after cross-validation.
scoring = {
    'r2': 'r2',
    'neg_mae': 'neg_mean_absolute_error',
    'neg_mse': 'neg_mean_squared_error'
}

# 5-fold cross-validation on the scaled training data
cv_results = cross_validate(final_model, X_train_scaled, y_train, cv=5, scoring=scoring)

# Per-fold scores, with errors converted back to positive values
r2_scores = cv_results['test_r2']
mae_scores = -cv_results['test_neg_mae']
mse_scores = -cv_results['test_neg_mse']
rmse_scores = np.sqrt(mse_scores)  # per-fold RMSE

# Print individual and mean scores
print("CV R² scores:", r2_scores)
print("Mean CV R²:", np.mean(r2_scores))

print("CV MAE scores:", mae_scores)
print("Mean CV MAE:", np.mean(mae_scores))

print("CV MSE scores:", mse_scores)
print("Mean CV MSE:", np.mean(mse_scores))

print("CV RMSE scores:", rmse_scores)
print("Mean CV RMSE:", np.mean(rmse_scores))


CV R² scores: [0.8276147  0.80306551 0.80480039 0.7877852  0.79513434]
Mean CV R²: 0.8036800285857855
CV MAE scores: [23.83189616 24.9902534  25.7552691  26.24053829 24.0251465 ]
Mean CV MAE: 24.96862069027153
CV MSE scores: [756.03469031 830.27177327 899.24837728 913.40151163 774.95032312]
Mean CV MSE: 834.7813351203079
CV RMSE scores: [27.496085   28.81443689 29.98747034 30.22253318 27.83792958]
Mean CV RMSE: 28.871690996205366


<h2 style="color: purple;">5.4 Final Model Training and Test Set Evaluation</h2>   

In [22]:
def regression_report(y_true, y_pred):
    """Print MAE, MSE, RMSE and R² for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The four metrics are printed with four decimal places.
    """
    mse = mean_squared_error(y_true, y_pred)  # computed once; RMSE is derived from it
    print("Regression Report:")
    print(f"MAE:  {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"MSE:  {mse:.4f}")
    print(f"RMSE: {np.sqrt(mse):.4f}")
    print(f"R²:   {r2_score(y_true, y_pred):.4f}")


# Retrain on the full training data
final_model.fit(X_train_scaled, y_train)

# Predict on the test data
y_test_preds = final_model.predict(X_test_scaled)

# Report the test-set metrics
regression_report(y_test, y_test_preds)

Regression Report:
MAE:  24.8813
MSE:  832.5323
RMSE: 28.8536
R²:   0.8079
