In [1]:
#import the necessary libraries
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Inline sample dataset used to fit the soap price model: one header line plus
# 20 records. Each record describes a soap (size, ingredients, process, cost
# components, brand, market context) and ends with the target column `Price`.
# The ingredients field is double-quoted because it contains a comma.
csv_data = """
Size (g),Raw Material (Special Ingredients),Process,Raw Material Cost,Labor and Machinery Cost,Packaging and Shipping Cost,Brand,Market Demand,Competitor Price,Location,Market Segment,Price
120,"Lavender, Coconut Oil",Handmade,5.5,3.2,1.8,OrganicSoap,High,15,Urban,Luxury,20
80,"Aloe Vera, Olive Oil",Organic,4.2,2.8,1.5,NatureFresh,Medium,12,Suburban,Everyday,12
150,"Charcoal, Tea Tree Oil",Specialty,6.8,4.5,2.3,CharcoPure,Low,18,Rural,Spa,25
100,"Shea Butter, Almond Oil",Handmade,5,3,1.2,HandCrafted,High,14,Urban,Organic,18
130,"Green Tea Extract, Jojoba Oil",Organic,5.8,3.5,1.6,GreenEleg,Medium,13,Suburban,Everyday,15
90,"Chamomile, Sunflower Oil",Handmade,4.5,2.7,1,PureBliss,High,16,Urban,Luxury,22
110,"Rosehip Oil, Oatmeal",Specialty,6,3.8,2,RoseSilk,Medium,17,Suburban,Spa,19
75,"Cucumber Extract, Coconut Oil",Handmade,4,2.5,1.1,FreshGlow,High,15,Urban,Organic,16
140,"Peppermint, Almond Oil",Organic,6.5,4,2.2,MintyFresh,Medium,14,Suburban,Everyday,17
95,"Mango Butter, Avocado Oil",Handmade,4.8,3.1,1.3,ExoticMango,High,16,Urban,Luxury,21
120,"Lemongrass, Argan Oil",Organic,5.7,3.3,1.4,CitrusBurst,Medium,13,Suburban,Spa,14
85,"Lavender, Coconut Oil",Handmade,4.2,2.6,1,LavishScent,High,15,Urban,Organic,18
130,"Charcoal, Tea Tree Oil",Specialty,6.5,4.2,2.1,PureChar,Low,17,Rural,Spa,23
100,"Aloe Vera, Olive Oil",Organic,4.8,3,1.5,AlohaFresh,Medium,13,Suburban,Everyday,15
150,"Shea Butter, Almond Oil",Handmade,7,4.8,2.5,SilkTouch,High,18,Urban,Luxury,25
80,"Green Tea Extract, Jojoba Oil",Organic,4,2.3,1.2,GreenTease,Medium,12,Suburban,Everyday,11
110,"Chamomile, Sunflower Oil",Handmade,5.3,3.5,1.7,CalmEssence,High,16,Urban,Organic,19
120,"Rosehip Oil, Oatmeal",Specialty,6.2,4,2.2,RoseSilk,Medium,17,Suburban,Spa,20
90,"Cucumber Extract, Coconut Oil",Handmade,4.2,2.4,1.1,FreshGlow,High,15,Urban,Organic,16
140,"Peppermint, Almond Oil",Organic,6.8,4.2,2.3,MintyFresh,Medium,14,Suburban,Everyday,18
"""

In [3]:
# Parse the inline CSV text into a DataFrame. StringIO wraps the string in a
# file-like object so read_csv can consume it without touching disk. The text
# starts with an empty line, which read_csv skips by default, so the first
# non-empty line is used as the header.
df = pd.read_csv(StringIO(csv_data))

In [30]:
# Preview the first five rows to sanity-check that the CSV parsed as expected
df.head()

Unnamed: 0,Size (g),Raw Material (Special Ingredients),Process,Raw Material Cost,Labor and Machinery Cost,Packaging and Shipping Cost,Brand,Market Demand,Competitor Price,Location,Market Segment,Price
0,120,"Lavender, Coconut Oil",Handmade,5.5,3.2,1.8,OrganicSoap,High,15,Urban,Luxury,20
1,80,"Aloe Vera, Olive Oil",Organic,4.2,2.8,1.5,NatureFresh,Medium,12,Suburban,Everyday,12
2,150,"Charcoal, Tea Tree Oil",Specialty,6.8,4.5,2.3,CharcoPure,Low,18,Rural,Spa,25
3,100,"Shea Butter, Almond Oil",Handmade,5.0,3.0,1.2,HandCrafted,High,14,Urban,Organic,18
4,130,"Green Tea Extract, Jojoba Oil",Organic,5.8,3.5,1.6,GreenEleg,Medium,13,Suburban,Everyday,15


In [7]:
# Show each column's non-null count and dtype; the `object` columns are the
# categorical text features that must be encoded before fitting a regression
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Size (g)                            20 non-null     int64  
 1   Raw Material (Special Ingredients)  20 non-null     object 
 2   Process                             20 non-null     object 
 3   Raw Material Cost                   20 non-null     float64
 4   Labor and Machinery Cost            20 non-null     float64
 5   Packaging and Shipping Cost         20 non-null     float64
 6   Brand                               20 non-null     object 
 7   Market Demand                       20 non-null     object 
 8   Competitor Price                    20 non-null     int64  
 9   Location                            20 non-null     object 
 10  Market Segment                      20 non-null     object 
 11  Price                               20 non-null     int64  

In [8]:
# Separate the target from the predictors: `Price` is what the model learns
# to predict, every remaining column is an input feature.
y = df['Price']                # Target
X = df.drop(columns='Price')   # Features

In [9]:
# One-hot encode the categorical (text) features into dummy/indicator columns.
# The encoded matrix is rebuilt from `df` (minus the target) instead of from
# the current `X`, which keeps this cell idempotent: encoding `X` in place
# raises a KeyError on a second run because the listed columns have already
# been replaced by their dummy columns.
X = pd.get_dummies(
    df.drop(columns=['Price']),
    columns=[
        'Raw Material (Special Ingredients)',
        'Process',
        'Brand',
        'Market Demand',
        'Location',
        'Market Segment',
    ],
)

In [10]:
# Hold out 20% of the rows as a test set and train on the remaining 80%.
# random_state=42 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Create an unfitted linear regression estimator with scikit-learn's defaults
model = LinearRegression()

In [12]:
# Fit the regression coefficients on the training split only, so the test
# split stays unseen for the evaluation below
model.fit(X_train, y_train)

In [13]:
# Predict prices for the held-out test rows (one prediction per row of X_test)
y_pred = model.predict(X_test)

In [14]:
# Score the held-out predictions against the true prices.
# MSE is the mean of the squared errors; RMSE is its square root.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report both metrics
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')

Mean Squared Error: 1.1738053807029227
Root Mean Squared Error: 1.083422992511661


In [21]:
# Attributes of a hypothetical new soap to price. Each value is wrapped in a
# one-element list so the dict describes exactly one row, with the same raw
# (pre-encoding) column names as the training data and no `Price` entry.
new_soap_data = {
    column: [value]
    for column, value in (
        ('Size (g)', 110),
        ('Raw Material (Special Ingredients)', 'Green Tea Extract, Olive Oil'),
        ('Process', 'Organic'),
        ('Raw Material Cost', 5.2),
        ('Labor and Machinery Cost', 3.0),
        ('Packaging and Shipping Cost', 1.2),
        ('Brand', 'AlohaFresh'),
        ('Market Demand', 'Medium'),
        ('Competitor Price', 14),
        ('Location', 'Urban'),
        ('Market Segment', 'Everyday'),
    )
}

In [22]:
# Turn the dict into a single-row DataFrame (dict keys become the columns)
new_soap_df = pd.DataFrame(new_soap_data)

In [23]:
# One-hot encode the new sample's categorical fields with the same column
# list that was used for the training features.
# The frame is rebuilt from `new_soap_data` rather than from the current
# `new_soap_df`, which keeps this cell idempotent: encoding `new_soap_df` in
# place raises a KeyError on a second run because the listed columns have
# already been replaced by their dummy columns.
# A single row only yields dummy columns for the categories it contains, so
# the result still has to be aligned with the training columns before
# it can be passed to the model.
new_soap_df = pd.get_dummies(
    pd.DataFrame(new_soap_data),
    columns=[
        'Raw Material (Special Ingredients)',
        'Process',
        'Brand',
        'Market Demand',
        'Location',
        'Market Segment',
    ],
)

In [25]:
# Align the new sample with the training feature matrix in a single step:
#   * dummy columns present in X but absent here are added and filled with 0,
#   * dummy columns for categories never seen in training are dropped,
#   * the columns come out in exactly the order of X.columns.
# One reindex call replaces a Python loop that iterated over a set and
# inserted the missing columns one at a time.
new_soap_df = new_soap_df.reindex(columns=X.columns, fill_value=0)

In [28]:
# Select the training columns, in training order. Besides fixing the order,
# this discards any column of the new sample that is not in X.columns, so the
# model receives exactly the feature layout it was fitted on.
new_soap_df = new_soap_df.loc[:, X.columns]

In [29]:
# Predict the price of the new soap with the fitted model. `predict` returns
# one value per input row; the new sample has a single row, so element 0 is
# the prediction.
predicted_price = model.predict(new_soap_df)

print(f'Predicted Price: {predicted_price[0]}')

Predicted Price: 17.09829651889243
