In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
 
# Load dataset
dataset = pd.read_csv('c:\\ML\\retail_price.csv')
 
# Handle missing values if any
dataset.dropna(inplace=True) # Drop rows with missing values
 
# Feature selection
x = dataset[['product_id', 'product_category_name', 'freight_price', 'product_weight_g', 'product_score', 'customers', 's']]
y = dataset['unit_price']
 
# Encode categorical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
 
# Identify categorical columns
categorical_columns = [0, 1] # Indices of categorical columns in X
 
# Construct ColumnTransformer
transformers = [('encoder', OneHotEncoder(), categorical_columns),
               ('scaler', StandardScaler(), [2, 3, 4, 5, 6])]  # Scale numeric columns, encode categorical columns
ct = ColumnTransformer(transformers=transformers)
x_encoded = np.array(ct.fit_transform(x).toarray())
 
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y_encoded, test_size=0.25, random_state=10)
 
# Polynomial features
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)
 
# Initialize Polynomial Regression model
regressor = LinearRegression()
 
# Train the model
regressor.fit(x_train_poly, y_train)
 
# Predict unit prices for the test data
y_pred = regressor.predict(x_test_poly)
 
# Calculate R-squared score
r_squared = r2_score(y_test, y_pred)
print("R-squared Score for Polynomial Regression Model:", r_squared)
 
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
 
# Convert the R-squared score to percentage of variance explained
percentage_of_variance_explained = r_squared * 100
print("Accuracy (Percentage of Variance Explained):", percentage_of_variance_explained)
 
# Create new data for prediction
new_data = pd.DataFrame({'product_id': ['furniture1'], 
                       'product_category_name': ['furniture_decor'], 
                       'freight_price': [12.0], 
                       'product_weight_g': [850], 
                       'product_score': [3.7], 
                       'customers': [14], 
                       's': [13.00]})
 
# Encode new data
new_data_encoded = ct.transform(new_data)
new_data_poly = poly.transform(new_data_encoded)
 
# Predict unit prices for new data
new_data_pred = regressor.predict(new_data_poly)
print("Predicted Prices for New Data:", new_data_pred)

R-squared Score for Polynomial Regression Model: 0.8938935501651222
Mean Squared Error (MSE): 605.9701790593388
Accuracy (Percentage of Variance Explained): 89.38935501651221
Predicted Prices for New Data: [19.28735352]


In [3]:
data_1 = pd.DataFrame({
    'product_id': ['garden6'],
    'product_category_name': ['garden_tools'],
    'freight_price': [16.62642857],
    'product_weight_g': [1550],
    'product_score': [4.1],
    'customers': [74],
    's': [3.978539576],
})

data_2 = pd.DataFrame({
    'product_id': ['health2'],
    'product_category_name': ['health_beauty'],
    'freight_price': [19.52363636],
    'product_weight_g': [400],
    'product_score': [4.2],
    'customers': [89],
    's': [29.72972973],
})

data_3 = pd.DataFrame({
    'product_id': ['watches4'],
    'product_category_name': ['watches_gifts'],
    'freight_price': [18.25363636],
    'product_weight_g': [1000],
    'product_score': [4.2],
    'customers': [90],
    's': [16.58291457],
})

data_4 = pd.DataFrame({
    'product_id': ['computers4'],
    'product_category_name': ['computers_accessories'],
    'freight_price': [36.46],
    'product_weight_g': [6550],
    'product_score': [4.2],
    'customers': [15],
    's': [5.68664966],
})

data_5 = pd.DataFrame({
    'product_id': ['garden10'],
    'product_category_name': ['garden_tools'],
    'freight_price': [36.442],
    'product_weight_g': [1750],
    'product_score': [4.2],
    'customers': [46],
    's': [9.071360306],
})

# List of mock data frames
mock_data_frames = [data_1, data_2, data_3, data_4, data_5]

# Initialize list to store predicted unit prices for all 5 data frames
predicted_unit_prices = []

# Loop over each mock data frame
for i, mock_data_frame in enumerate(mock_data_frames):
    # Encode new data
    new_data_encoded = ct.transform(mock_data_frame)
    new_data_encoded = poly.transform(new_data_encoded)
    
    # Predict unit prices for new data
    new_data_pred = regressor.predict(new_data_encoded)
    
    # Print predicted unit prices for each data frame
    print(f"Predicted Unit Prices for data_{i+1}:", new_data_pred)
    
    # Store predicted unit prices
    predicted_unit_prices.append(new_data_pred)

# Convert to numpy array
predicted_unit_prices = np.array(predicted_unit_prices)

Predicted Unit Prices for data_1: [49.91168213]
Predicted Unit Prices for data_2: [267.71337891]
Predicted Unit Prices for data_3: [137.44799805]
Predicted Unit Prices for data_4: [149.82189941]
Predicted Unit Prices for data_5: [79.42993164]
