In [29]:
import pandas as pd
import numpy as np

In [30]:
# Load the cleaned wheat dataset from the Excel workbook in the working directory.
wheat_workbook = "Wheat_cleaned_dataset.xlsx"
df = pd.read_excel(wheat_workbook)

In [31]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(174, 5)

In [32]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,year,region,Gross_Value,Total_Cost,Profit
0,1998,Basin and Range,318.24,168.16,150.08
1,1998,Fruitful Rim,347.0,216.8,130.2
2,1998,Heartland,279.1,150.38,128.72
3,1998,Mississippi Portal,261.02,157.5,103.52
4,1998,Northern Crescent,384.56,148.92,235.64


In [33]:
# Numeric inputs; these are the columns handed to the StandardScaler.
num_features = ["year", "Gross_Value", "Total_Cost"]
# Categorical inputs; these are the columns handed to the OneHotEncoder.
onehot_columns = ["region"]


In [34]:
# Number of distinct region labels (the cardinality the one-hot encoder will see).
df['region'].nunique()


8

In [35]:
## Independent and dependent features
from sklearn.model_selection import train_test_split

# Select the model inputs explicitly rather than "everything except Profit".
# Later cells add derived columns (Predicted_Profit, Error) to `df`; with
# df.drop(['Profit']) a re-run of this cell would pull those target-derived
# columns into X. Iterating over df.columns keeps the original column order.
feature_columns = [col for col in df.columns if col in num_features + onehot_columns]
X = df[feature_columns]

# NOTE(review): in the rows shown by df.head(), Profit equals
# Gross_Value - Total_Cost exactly, so the target looks like a linear function
# of two of the features. If that holds for all rows, near-perfect scores are
# expected and say little about predictive skill. Confirm this is intended.
y = df['Profit']

In [36]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Standardise the numeric inputs and one-hot encode the categorical ones.
numeric_transformer = StandardScaler()
# drop='first' omits one indicator column per categorical feature.
oh_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    # Only the columns listed above may reach the models. With 'passthrough',
    # any extra column present in the input frame (for example a prediction or
    # error column added to the source frame later) would be forwarded to the
    # estimators unscaled and unnoticed.
    remainder='drop',
)


In [37]:
# Independent copy of the loaded frame.
# NOTE(review): in the visible cells df_ml is only displayed, never used for
# modelling (X and y are built from df). Confirm whether it is still needed.
df_ml=df.copy()

In [38]:
# Preview the copy; it should match the preview of df.
df_ml.head()

Unnamed: 0,year,region,Gross_Value,Total_Cost,Profit
0,1998,Basin and Range,318.24,168.16,150.08
1,1998,Fruitful Rim,347.0,216.8,130.2
2,1998,Heartland,279.1,150.38,128.72
3,1998,Mississippi Portal,261.02,157.5,103.52
4,1998,Northern Crescent,384.56,148.92,235.64


In [39]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((139, 4), (35, 4))

In [40]:
# Show the training features; the index holds the original row labels from df.
X_train

Unnamed: 0,year,region,Gross_Value,Total_Cost
118,2015,Northern Great Plains,429.82,228.90
142,2019,Northern Great Plains,501.88,264.52
163,2023,Fruitful Rim,700.90,389.38
38,2002,Prairie Gateway,136.54,98.78
139,2019,Fruitful Rim,796.30,415.14
...,...,...,...,...
71,2007,Prairie Gateway,302.64,172.84
106,2013,Northern Great Plains,596.14,246.74
14,1999,Prairie Gateway,167.92,92.82
92,2011,Heartland,774.24,351.28


In [41]:
# Fit the encoder and scaler on the training split only, then reuse that
# fitted state for the test split so no test-set statistics reach the transform.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)


In [42]:
# Wrap the transformed array in a DataFrame for display. Columns follow the
# transformer order in `preprocessor`: one-hot region indicators first, then
# the scaled numeric features.
pd.DataFrame(X_train_trans)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.550513,-0.514688,-0.373984
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.052293,-0.257776,-0.040320
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.554072,0.451781,1.129284
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.080269,-1.560305,-1.592860
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.052293,0.791906,1.370586
...,...,...,...,...,...,...,...,...,...,...
134,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.453045,-0.968116,-0.899116
135,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.299624,0.078285,-0.206871
136,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.456604,-1.448427,-1.648689
137,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.048734,0.713256,0.772389


## MODEL TRAINING & SELECTION


In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [44]:
## Create a function to evaluate a model
def evaluate_model(true, predicted):
    """Score a set of predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error and
        the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [45]:
# Re-fit the preprocessing on the training split so this cell does not depend
# on whatever state `preprocessor` was last left in.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)

# Candidate regressors, all with default hyper-parameters (seeded where stochastic).
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
}


def _print_split_report(title, rmse, mae, r2):
    """Print the RMSE / MAE / R2 lines for one data split under `title`."""
    print(title)
    print(f"- Root Mean Squared Error: {rmse:.4f}")
    print(f"- Mean Absolute Error: {mae:.4f}")
    print(f"- R2 Score: {r2:.4f}")


for name, model in models.items():
    # Fit and predict on the transformed feature matrices.
    model.fit(X_train_trans, y_train)
    y_train_pred = model.predict(X_train_trans)
    y_test_pred = model.predict(X_test_trans)

    # Score both splits before printing anything.
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    _print_split_report('Model performance for Training set',
                        model_train_rmse, model_train_mae, model_train_r2)

    print('----------------------------------')

    _print_split_report('Model performance for Test set',
                        model_test_rmse, model_test_mae, model_test_r2)

    print('='*35, '\n')


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000

Lasso
Model performance for Training set
- Root Mean Squared Error: 5.2612
- Mean Absolute Error: 4.0788
- R2 Score: 0.9992
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.1865
- Mean Absolute Error: 3.9837
- R2 Score: 0.9981

Ridge
Model performance for Training set
- Root Mean Squared Error: 7.4451
- Mean Absolute Error: 5.6310
- R2 Score: 0.9984
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 7.1514
- Mean Absolute Error: 5.7492
- R2 Score: 0.9964

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 10.6867
- Mean Absolute Error: 8.1757
- R2 Score: 0.9967
-----------------------

In [46]:
import numpy as np

# Candidate Lasso regularisation strengths: 20 log-spaced values from 1e-4 to 1e1.
alpha_values = np.logspace(start=-4, stop=1, num=20)


In [47]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the alpha candidates, scored by R^2 with 5-fold
# cross-validation on the training split, using all available cores.
lasso = Lasso()
param_grid = dict(alpha=alpha_values)

grid = GridSearchCV(lasso, param_grid, scoring='r2', cv=5, n_jobs=-1)

# NOTE(review): X_train_trans was produced by a preprocessor fitted on the whole
# training split, so each validation fold has already contributed to the scaling
# statistics. Consider whether the preprocessing should sit inside the CV loop.
grid.fit(X_train_trans, y_train)


In [48]:
# Read the winning regularisation strength from the finished grid search.
best_alpha = grid.best_params_['alpha']
print("Best alpha:", best_alpha)


Best alpha: 0.0069519279617756054


In [49]:
# Train a fresh Lasso at the selected alpha on the full training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
best_lasso = Lasso(alpha=best_alpha).fit(X_train_trans, y_train)
best_lasso


In [50]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Score the tuned Lasso on the held-out test split.
y_pred = best_lasso.predict(X_test_trans)

print("Tuned R2:", r2_score(y_test, y_pred))
print("Tuned MAE:", mean_absolute_error(y_test, y_pred))
# RMSE is taken as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError. sqrt(MSE) works on every version.
print("Tuned RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


Tuned R2: 0.9999999934484329
Tuned MAE: 0.00790719075681352
Tuned RMSE: 0.009627688218111568




In [51]:
# Refit the preprocessing and the tuned Lasso on every row (train and test combined).
# NOTE(review): both calls overwrite fitted state in place, so from here on
# `preprocessor` and `best_lasso` are no longer the train-only objects that
# produced the test metrics reported earlier, and any prediction made on
# X_full_trans is an in-sample prediction.
X_full_trans = preprocessor.fit_transform(X)   # transform full dataset
best_lasso.fit(X_full_trans, y)


In [52]:
# Attach the model's prediction for every row and export the augmented table.
full_predictions = best_lasso.predict(X_full_trans)
df['Predicted_Profit'] = full_predictions
# index=False leaves the row index out of the workbook.
df.to_excel("Wheat_lasso_tuned_predictions.xlsx", index=False)


In [53]:
# Compare actual and predicted profit side by side for the first ten rows.
df[['Profit', 'Predicted_Profit']].head(10)


Unnamed: 0,Profit,Predicted_Profit
0,150.08,150.074014
1,130.2,130.201942
2,128.72,128.712621
3,103.52,103.514953
4,235.64,235.626795
5,102.14,102.127107
6,116.14,116.125562
7,37.72,37.723924
8,152.84,152.832401
9,153.62,153.619429


In [54]:
# Signed residual per row: a positive value means the model over-predicted profit.
df['Error'] = df['Predicted_Profit'].sub(df['Profit'])
df.loc[:, ['Profit', 'Predicted_Profit', 'Error']].head()


Unnamed: 0,Profit,Predicted_Profit,Error
0,150.08,150.074014,-0.005986
1,130.2,130.201942,0.001942
2,128.72,128.712621,-0.007379
3,103.52,103.514953,-0.005047
4,235.64,235.626795,-0.013205
