In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
data = pd.read_csv('sales_data.csv')

# Inspect data. DataFrame.info() prints its report itself and returns None,
# so it is called directly instead of being wrapped in print().
data.info()

# Handle missing values by carrying the previous row's value forward.
# NOTE(review): forward-fill (and the rolling mean below) is only meaningful
# if the rows are already in chronological order -- confirm against the CSV.
data = data.ffill()

# Convert date column to datetime so the .dt accessors below are available
data['date'] = pd.to_datetime(data['date'])

# Feature extraction: calendar parts of the date
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['day'] = data['date'].dt.day

# Remove columns that are not used as features
data = data.drop(columns=['date', 'unnecessary_column'])

# Moving average of sales over a 7-row window, shifted by one row so that a
# row's feature is built from earlier sales values only, never its own target.
data['moving_avg'] = data['sales'].rolling(window=7).mean().shift(1)

# The leading rows have no complete window and are NaN; fill them with the
# column mean. The result is assigned back to the frame because an in-place
# fillna on the selected column is chained assignment and is not guaranteed
# to update `data`.
data['moving_avg'] = data['moving_avg'].fillna(data['moving_avg'].mean())

# One-hot encode the categorical columns (first level dropped as baseline)
data = pd.get_dummies(data, columns=['product_category', 'promo_event'], drop_first=True)

# Separate features and target only AFTER all feature engineering, so that X
# contains the moving average and the one-hot columns and no raw string
# categoricals (which StandardScaler cannot handle).
X = data.drop(columns='sales')
y = data['sales']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
# and then re-used unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)






In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Train Linear Regression (deterministic, no seed needed)
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Train Decision Tree. Seeded with the same value as the train/test split so
# that a fresh "Restart & Run All" reproduces the same tree and metrics.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Train Random Forest. Seeded for the same reason: bootstrap sampling and
# feature sub-sampling are otherwise different on every run.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the held-out (scaled) test features
lr_preds = lr_model.predict(X_test_scaled)
dt_preds = dt_model.predict(X_test_scaled)
rf_preds = rf_model.predict(X_test_scaled)

# Error metrics shared by all models: MAE and RMSE
def evaluate_model(y_true, y_pred):
    """Return ``(mae, rmse)`` for predictions ``y_pred`` against ``y_true``.

    MAE comes from ``mean_absolute_error``; RMSE is the square root of
    ``mean_squared_error`` for the same pair of inputs.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    squared_error = mean_squared_error(y_true, y_pred)
    return abs_error, np.sqrt(squared_error)

# Score every model on the held-out targets first ...
lr_mae, lr_rmse = evaluate_model(y_true=y_test, y_pred=lr_preds)
dt_mae, dt_rmse = evaluate_model(y_true=y_test, y_pred=dt_preds)
rf_mae, rf_rmse = evaluate_model(y_true=y_test, y_pred=rf_preds)

# ... then report them, one line per model
print(f'Linear Regression - MAE: {lr_mae}, RMSE: {lr_rmse}')
print(f'Decision Tree - MAE: {dt_mae}, RMSE: {dt_rmse}')
print(f'Random Forest - MAE: {rf_mae}, RMSE: {rf_rmse}')




# Select the model with the lowest test-set RMSE. The choice is computed from
# the metrics above rather than hard-coded, so it stays correct when the data
# or the models change. On an exact tie the earlier entry in the list wins.
_rmse_and_model = [(lr_rmse, lr_model), (dt_rmse, dt_model), (rf_rmse, rf_model)]
best_model = min(_rmse_and_model, key=lambda pair: pair[0])[1]



# Example new data (for future predictions)
new_data = pd.DataFrame({
    'month': [9],
    'year': [2024],
    'day': [15],
    'moving_avg': [400],
    'product_category_A': [1],
    'promo_event_Yes': [0]
})

# The scaler and the models expect exactly the columns they were fitted on,
# in the same order. Reject columns the training data never had (usually a
# typo or a category level that was dropped as the baseline) ...
unexpected_columns = [col for col in new_data.columns if col not in X_train.columns]
if unexpected_columns:
    raise ValueError(f'new_data has columns not seen during training: {unexpected_columns}')

# ... then align to the training layout; feature columns that are absent from
# new_data (e.g. one-hot levels that do not apply to this row) become 0.
new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

# Apply the same scaling to the new data
new_data_scaled = scaler.transform(new_data)

# Forecast sales using the selected best model
sales_forecast = best_model.predict(new_data_scaled)
print(f'Predicted Sales: {sales_forecast}')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the held-out sales and the Random Forest predictions on one Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Sales')
ax.plot(rf_preds, label='Predicted Sales (Random Forest)')
ax.set_xlabel('Time')
ax.set_ylabel('Sales')
ax.legend()
ax.set_title('Actual vs Predicted Sales')
plt.show()



# This code covers the full machine learning workflow:

# Data cleaning
# Feature engineering
# Model training and evaluation
# Sales forecasting
