In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
dataset = pd.read_csv('D:\\SLIIT\\Year 4\\Semester 1\\ML\\Assignment\\retail_price.csv')

# Handle missing values if any
dataset.dropna(inplace=True)  # Drop rows with missing values

# Feature selection
X = dataset[['product_id', 'product_category_name', 'freight_price', 'product_weight_g', 'product_score', 'customers', 's']]
y = dataset['unit_price']

# Identify categorical columns
categorical_columns = [0, 1]  # Indices of categorical columns in X

# Construct ColumnTransformer
transformers = [('encoder', OneHotEncoder(), categorical_columns), 
                ('scaler', StandardScaler(), [2, 3, 4, 5, 6])]  # Scale numeric columns, encode categorical columns
ct = ColumnTransformer(transformers=transformers)
X_encoded = ct.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=10)

# Initialize Multiple Linear Regression model
multiple_linear_regressor = LinearRegression()

# Train the model
multiple_linear_regressor.fit(X_train, y_train)

# Predict unit prices for the test data
y_pred = multiple_linear_regressor.predict(X_test)

# Calculate R-squared score
r_squared = r2_score(y_test, y_pred)
print("R2 Score for Linear Regression Model:", r_squared)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Create new data for prediction
new_data = pd.DataFrame({'product_id': ['garden5'], 
                         'product_category_name': ['garden_tools'], 
                         'freight_price': [35.0], 
                         'product_weight_g': [9000], 
                         'product_score': [4.1], 
                         'customers': [20], 
                         's': [8.00]})

# Encode new data
new_data_encoded = ct.transform(new_data)

# Predict unit prices for new data
new_data_pred = multiple_linear_regressor.predict(new_data_encoded)
print("Predicted Unit Prices for New Data:", new_data_pred)


R2 Score for Linear Regression Model: 0.9699976692613469
Mean Squared Error (MSE): 134.3048774557648
Predicted Unit Prices for New Data: [101.78921538]


In [4]:
data_1 = pd.DataFrame({
    'product_id': ['garden6'],
    'product_category_name': ['garden_tools'],
    'freight_price': [16.62642857],
    'product_weight_g': [1550],
    'product_score': [4.1],
    'customers': [74],
    's': [3.978539576],
})

data_2 = pd.DataFrame({
    'product_id': ['health2'],
    'product_category_name': ['health_beauty'],
    'freight_price': [19.52363636],
    'product_weight_g': [400],
    'product_score': [4.2],
    'customers': [89],
    's': [29.72972973],
})

data_3 = pd.DataFrame({
    'product_id': ['watches4'],
    'product_category_name': ['watches_gifts'],
    'freight_price': [18.25363636],
    'product_weight_g': [1000],
    'product_score': [4.2],
    'customers': [90],
    's': [16.58291457],
})

data_4 = pd.DataFrame({
    'product_id': ['computers4'],
    'product_category_name': ['computers_accessories'],
    'freight_price': [36.46],
    'product_weight_g': [6550],
    'product_score': [4.2],
    'customers': [15],
    's': [5.68664966],
})

data_5 = pd.DataFrame({
    'product_id': ['garden10'],
    'product_category_name': ['garden_tools'],
    'freight_price': [36.442],
    'product_weight_g': [1750],
    'product_score': [4.2],
    'customers': [46],
    's': [9.071360306],
})

# List of mock data frames
mock_data_frames = [data_1, data_2, data_3, data_4, data_5]

# Initialize list to store predicted unit prices for all 5 data frames
predicted_unit_prices = []

# Loop over each mock data frame
for i, mock_data_frame in enumerate(mock_data_frames):
    # Encode new data
    new_data_encoded = ct.transform(mock_data_frame)
    
    # Predict unit prices for new data
    new_data_pred = multiple_linear_regressor.predict(new_data_encoded)
    
    # Print predicted unit prices for each data frame
    print(f"Predicted Unit Prices for data_{i+1}:", new_data_pred)
    
    # Store predicted unit prices
    predicted_unit_prices.append(new_data_pred)

# Convert to numpy array
predicted_unit_prices = np.array(predicted_unit_prices)

Predicted Unit Prices for data_1: [55.04072969]
Predicted Unit Prices for data_2: [328.72523023]
Predicted Unit Prices for data_3: [108.77589145]
Predicted Unit Prices for data_4: [142.55834857]
Predicted Unit Prices for data_5: [60.06973532]
