In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
data = pd.read_csv("ProductDemand.csv")

# Feature Engineering (Placeholder: Replace with your actual feature engineering)
# Create relevant features
data['Feature1'] = data['Total Price'] / data['Base Price']
data['Feature2'] = data['Units Sold'] * data['Base Price']

# Time Series Decomposition
# Decompose the time series
result = seasonal_decompose(data['Units Sold'], model='additive', period=12)

# Clustering (Placeholder: Replace with your actual clustering features)
# Determine the optimal number of clusters using KMeans and silhouette score
X_cluster = data[['Feature1', 'Feature2']]  # Specify your clustering features
range_clusters = range(2, 10)  # Define the range of clusters to try
best_silhouette = -1
best_num_clusters = 2
for num_clusters in range_clusters:
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X_cluster)
    silhouette_avg = silhouette_score(X_cluster, cluster_labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_num_clusters = num_clusters

kmeans = KMeans(n_clusters=best_num_clusters, random_state=0)
data['Cluster'] = kmeans.fit_predict(X_cluster)

# Regression Model
X_reg = data[['Total Price', 'Base Price']]
y_reg = data['Units Sold']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train_reg = scaler.fit_transform(X_train_reg)
X_test_reg = scaler.transform(X_test_reg)

reg_model = RandomForestRegressor(n_estimators=100, random_state=0)
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)
reg_rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

# Classification Model
X_class = data[['Total Price', 'Base Price']]
y_class = data['Store ID']  # Replace with your actual classification target
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train_class = scaler.fit_transform(X_train_class)
X_test_class = scaler.transform(X_test_class)

class_model = RandomForestClassifier(n_estimators=100, random_state=0)
class_model.fit(X_train_class, y_train_class)
y_pred_class = class_model.predict(X_test_class)
class_report = classification_report(y_test_class, y_pred_class)

# ARIMA Time Series Forecasting
# You need to specify the ARIMA order (p, d, q) based on your data characteristics
p, d, q = 1, 1, 1  # Modify these values
model = ARIMA(data['Units Sold'], order=(p, d, q))
model_fit = model.fit()
forecast_periods = 10  # Adjust as needed
forecast = model_fit.forecast(steps=forecast_periods)

# Output results
print(f"Regression RMSE: {reg_rmse}")
print(f"Classification Report:\n{class_report}")
print(f"Time Series Forecast: {forecast}")



FileNotFoundError: ignored

In [28]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Step 1: Load and preprocess the dataset
data = pd.read_csv("/content/SAMPLE_new_file.csv")

# Preprocess data (handle missing values, feature extraction, etc.)
# Handle missing values (replace with your specific strategy)
data.fillna(0, inplace=True)  # Replace missing values with zeros

# Feature Engineering
# Add your specific feature engineering logic here
data['Feature3'] = data['Total Price'] - data['Base Price']
data['Feature4'] = data['Units Sold'] / (data['Base Price'] + 1)  # Adding 1 to avoid division by zero

# Additional Preprocessing
# Feature scaling (e.g., StandardScaler for numerical features)
numerical_features = ['Feature3', 'Feature4']
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# One-hot encoding (if you have categorical features)
categorical_features = ['Store ID']
encoder = OneHotEncoder(sparse=False)  # Use sparse=False to get a dense array
encoded_features = encoder.fit_transform(data[categorical_features])
encoded_feature_names = encoder.get_feature_names_out(input_features=categorical_features)

# Create a DataFrame from the encoded features
encoded_features_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Drop the original categorical columns and concatenate the encoded features
data = data.drop(categorical_features, axis=1)
data = pd.concat([data, encoded_features_df], axis=1)

# Time Series Decomposition
# Decompose the time series
result = seasonal_decompose(data['Units Sold'], model='additive', period=12)
if 'Store ID' in data:
    data['Store ID'] = data['Store ID'].astype('category')

# Check if 'Promotion' column exists before creating 'Promotion Flag'
if 'Promotion' in data:
    data['Promotion Flag'] = data['Promotion'].apply(lambda x: 1 if x == 'Promotion' else 0)

# Clustering
# Add your specific clustering logic here
clustering_features = data[['Total Price', 'Base Price']]
range_clusters = range(2, 11)

# Perform clustering using K-Means
X = clustering_features  # Use your selected clustering features
best_silhouette = -1
best_num_clusters = 2

for num_clusters in range_clusters:
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_num_clusters = num_clusters

kmeans = KMeans(n_clusters=best_num_clusters, random_state=0)
data['Cluster'] = kmeans.fit_predict(X)

# Step 2: Feature engineering (Add your specific feature engineering logic here)
# For example:
data['Feature5'] = data['Total Price'] / data['Units Sold']

# Step 3: Model training for Regression
X_reg = data[['Feature3', 'Feature4', 'Feature5']]  # Features for regression
y_reg = data['Demand']  # Target variable
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)

# Standardize the data
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

# Train a Random Forest Regression model
reg_model = RandomForestRegressor(n_estimators=100, random_state=0)
reg_model.fit(X_train_reg, y_train_reg)

# Predict demand using the regression model
y_pred_reg = reg_model.predict(X_test_reg)
reg_rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

# Step 4: Model training for Classification
X_class = data[['Feature3', 'Feature4', 'Feature5']]  # Features for classification
y_class = data['Category']  # Target variable
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=0)

# Standardize the data
scaler_class = StandardScaler()
X_train_class = scaler_class.fit_transform(X_train_class)
X_test_class = scaler_class.transform(X_test_class)

# Train a Random Forest Classifier model
class_model = RandomForestClassifier(n_estimators=100, random_state=0)
class_model.fit(X_train_class, y_train_class)

# Predict categories using the classification model
y_pred_class = class_model.predict(X_test_class)
class_report = classification_report(y_test_class, y_pred_class)

# Output results
print(f"Regression RMSE: {reg_rmse}")
print(f"Classification Report:\n{class_report}")




KeyboardInterrupt: ignored

In [29]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Step 1: Load and preprocess the dataset
data = pd.read_csv("/content/SAMPLE_new_file.csv")

# Preprocess data (handle missing values, feature extraction, etc.)
data.fillna(0, inplace=True)  # Replace missing values with zeros

# Feature Engineering
data['Feature3'] = data['Total Price'] - data['Base Price']
data['Feature4'] = data['Units Sold'] / (data['Base Price'] + 1)

# Clustering
clustering_features = data[['Total Price', 'Base Price']]
range_clusters = range(2, 11)

# Perform clustering using K-Means
X = clustering_features
best_silhouette = -1
best_num_clusters = 2

for num_clusters in range_clusters:
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_num_clusters = num_clusters

kmeans = KMeans(n_clusters=best_num_clusters, random_state=0)
data['Cluster'] = kmeans.fit_predict(X)

# Step 2: Feature engineering (Add your specific feature engineering logic here)
data['Feature5'] = data['Total Price'] / data['Units Sold']

# Step 3: Model training for Regression
X_reg = data[['Feature3', 'Feature4', 'Feature5']]
y_reg = data['Demand']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)

scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

reg_model = RandomForestRegressor(n_estimators=100, random_state=0)
reg_model.fit(X_train_reg, y_train_reg)

# Predict demand using the regression model
y_pred_reg = reg_model.predict(X_test_reg)
reg_rmse = mean_squared_error(y_test_reg, y_pred_reg)

# Step 4: Model training for Classification
X_class = data[['Feature3', 'Feature4', 'Feature5']]
y_class = data['Category']
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=0)

scaler_class = StandardScaler()
X_train_class = scaler_class.fit_transform(X_train_class)
X_test_class = scaler_class.transform(X_test_class)

class_model = RandomForestClassifier(n_estimators=100, random_state=0)
class_model.fit(X_train_class, y_train_class)

# Predict categories using the classification model
y_pred_class = class_model.predict(X_test_class)
class_report = classification_report(y_test_class, y_pred_class)

# Output results
print(f"Regression RMSE: {reg_rmse}")
print(f"Classification Report:\n{class_report}")




KeyboardInterrupt: ignored

In [27]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'your_file.csv'  # Replace with your file path
df = pd.read_csv("/content/PoductDemand.csv")

# Define the row number to keep
row_number_to_keep = 1652

# Delete rows above the specified row number
df = df.iloc[row_number_to_keep:]

# Save the modified DataFrame back to a new CSV file
new_file_path = 'SAMPLE_new_file.csv'  # Replace with the desired output file path
df.to_csv(new_file_path, index=False)


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.cluster import MiniBatchKMeans  # Use MiniBatchKMeans for parallel processing
from sklearn.metrics import silhouette_score

# Step 1: Load and preprocess the dataset
data = pd.read_csv("/content/SAMPLE_new_file.csv")

# Preprocess data (handle missing values, feature extraction, etc.)
data.fillna(0, inplace=True)  # Replace missing values with zeros

# Feature Engineering
data['Feature3'] = data['Total Price'] - data['Base Price']
data['Feature4'] = data['Units Sold'] / (data['Base Price'] + 1)

# Clustering
clustering_features = data[['Total Price', 'Base Price']]
range_clusters = range(2, 11)

# Perform clustering using MiniBatchKMeans for parallel processing
X = clustering_features
best_silhouette = -1
best_num_clusters = 2

for num_clusters in range_clusters:
    kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_num_clusters = num_clusters

kmeans = MiniBatchKMeans(n_clusters=best_num_clusters, random_state=0)
data['Cluster'] = kmeans.fit_predict(X)

# Step 2: Feature engineering (Add your specific feature engineering logic here)
data['Feature5'] = data['Total Price'] / data['Units Sold']

# Step 3: Model training for Regression
X_reg = data[['Feature3', 'Feature4', 'Feature5']]
y_reg = data['Demand']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)

scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

reg_model = RandomForestRegressor(n_estimators=100, random_state=0)
reg_model.fit(X_train_reg, y_train_reg)

# Predict demand using the regression model
y_pred_reg = reg_model.predict(X_test_reg)
reg_rmse = mean_squared_error(y_test_reg, y_pred_reg)

# Step 4: Model training for Classification
X_class = data[['Feature3', 'Feature4', 'Feature5']]
y_class = data['Category']
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=0)

scaler_class = StandardScaler()
X_train_class = scaler_class.fit_transform(X_train_class)
X_test_class = scaler_class.transform(X_test_class)

class_model = RandomForestClassifier(n_estimators=100, random_state=0)
class_model.fit(X_train_class, y_train_class)

# Predict categories using the classification model
y_pred_class = class_model.predict(X_test_class)
class_report = classification_report(y_test_class, y_pred_class)

# Output results
print(f"Regression RMSE: {reg_rmse}")
print(f"Classification Report:\n{class_report}")




# New Section