# In [None]:
### Predicting Sales Revenue for a Retail Store

# Step 1: Data Pre-processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load dataset
data = pd.read_csv('retail_sales.csv')  # Replace with actual dataset file

# Display initial dataset information
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()

# Handle missing values
# Rows without a target cannot be used to train or score a supervised model.
# Drop them rather than filling 'sales' with the column mean, which would
# fabricate labels and let held-out rows influence the training targets.
data = data.dropna(subset=['sales'])

# Separate features and target
X = data.drop('sales', axis=1)
y = data['sales']

# Categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline
# Missing feature values are imputed inside each branch so that the fill
# statistics are learned from the training split only.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split BEFORE fitting any transformer: fitting the scaler/encoder on the
# full dataset would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Final preprocessed dataset (fit on train, apply unchanged to test)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Step 2: Feature Engineering
from sklearn.feature_selection import SelectKBest, f_regression

# Select top 10 features, or every feature when fewer than 10 columns exist
# after encoding (a k larger than the column count is rejected or warned
# about, depending on the scikit-learn version).
n_selected = min(10, X_train.shape[1])
selector = SelectKBest(score_func=f_regression, k=n_selected)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Step 3: Model Building
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

# Candidate regressors, keyed by the label shown in the comparison table.
# The stochastic estimators are all seeded with random_state=42.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('Artificial Neural Network', MLPRegressor(random_state=42, max_iter=500)),
])

# Train models
# Each candidate is fit on the training split; scikit-learn's fit() returns
# the estimator itself, so the mapping ends up holding the fitted objects.
trained_models = {
    label: estimator.fit(X_train, y_train)
    for label, estimator in models.items()
}

# Step 4: Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Column label -> scoring function, in the order the columns should appear.
metric_fns = (
    ('MSE', mean_squared_error),
    ('MAE', mean_absolute_error),
    ('R2', r2_score),
)

# Score every fitted model on the held-out split, one row per model.
results = []
for label, fitted in trained_models.items():
    predictions = fitted.predict(X_test)
    row = {'Model': label}
    row.update((metric, score(y_test, predictions)) for metric, score in metric_fns)
    results.append(row)

# Display results
results_df = pd.DataFrame(results)
ranked = results_df.sort_values(by='R2', ascending=False)
print("\nModel Comparison:")
print(ranked)

# Conclusion and Best Model
# best_model is the top-ranked row of the table (a Series), not an estimator.
best_model = ranked.iloc[0]
print(f"\nBest Model: {best_model['Model']} with R2 Score: {best_model['R2']:.2f}")

# Save the best model
# NOTE(review): only the fitted estimator is persisted; it expects inputs that
# have already gone through `preprocessor` and `selector`.
import joblib
winner = trained_models[best_model['Model']]
joblib.dump(winner, 'best_model.pkl')

# Summary and Future Steps
"""
This notebook illustrates a complete workflow for predicting retail store sales. Future improvements could include:
1. Hyperparameter tuning for optimal performance.
2. Incorporating time-series data if available.
3. Exploring additional ensemble methods for better accuracy.
"""
