In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import numpy as np


In [2]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
DATA_PATH = 'data/Orange Quality Data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Positional split: every column except the last is a feature,
# and the last column is the regression target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
# Select every numeric column rather than only int64/float64. A numeric column
# stored with another dtype (e.g. int32, float32, or a nullable integer/float)
# would otherwise appear in neither feature list and would not be passed to
# any transformer in the ColumnTransformer built further down.
numeric_features = X.select_dtypes(include='number').columns
numeric_features

Index(['Size (cm)', 'Weight (g)', 'Brix (Sweetness)', 'pH (Acidity)',
       'Softness (1-5)', 'HarvestTime (days)', 'Ripeness (1-5)'],
      dtype='object')

In [5]:
# Columns with object dtype are treated as the categorical features.
# NOTE(review): this relies on text columns being loaded as object dtype —
# confirm against the pandas version / string dtype settings in use.
categorical_features = X.select_dtypes(include='object').columns
categorical_features

Index(['Color', 'Variety', 'Blemishes (Y/N)'], dtype='object')

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric columns are standardized.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# transform from raising on categories that were not present during fit.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by an ordinary least-squares regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training rows only, then predict the held-out rows.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

print("Predictions:", predictions)
print("Test labels:", y_test.values)


Predictions: [4.36682253 4.09517099 1.93914779 3.55561946 3.92786032 4.36445797
 3.87469072 2.01584914 5.08790838 4.62894439 5.37433877 4.49259798
 4.19491198 4.80908302 3.70038073 3.55825226 3.7578335  5.06402447
 3.46138835 4.89937805 3.49817118 5.26763789 2.2927594  4.02712698
 4.24620824 3.14388071 2.15777107 4.21368677 4.17203946 3.58137194
 1.84441329 2.2262547  3.68754197 5.02862293 2.71474996 3.15239706
 5.35128518 4.49030272 4.36978104 4.98446739 4.76369641 4.12608729
 4.19497776 2.31674647 4.68967068 3.87394666 2.4078695  4.15317408
 2.67617484]
Test labels: [4.5 4.  2.5 4.  5.  4.  4.  3.  5.  4.  5.  5.  4.  4.5 4.  3.5 4.  5.
 4.  5.  4.  4.5 2.  4.5 4.  3.5 2.5 4.5 5.  4.  2.  4.  4.  5.  2.  3.
 5.  3.5 4.  5.  5.  3.  5.  3.5 5.  3.5 3.  4.5 3.5]


In [7]:
# Root mean squared error on the held-out rows: the square root of the mean
# squared difference between the true labels and the predictions.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 0.5784471525153437
