# Importing the Necessary Packages/Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Importing the Necessary Algorithms

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression, GammaRegressor, HuberRegressor, PassiveAggressiveRegressor, PoissonRegressor
from sklearn.linear_model import QuantileRegressor, RANSACRegressor, TheilSenRegressor, TweedieRegressor
from sklearn.ensemble import  AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [3]:
le = LabelEncoder()

# Importing the Dataset for the project

In [4]:
df = pd.read_excel("potato.xlsx")

In [5]:
df.head(5)

Unnamed: 0,location,soilType,sweetPotato,varietalDescription,varietalCode,fertilizerRegime,fertilizerCode,reps,branchNumber,lengthMainStem(cm),...,weightAdventitousRoots(Kg),dryBiomass(Kg),weightSmallStorageRoots(Kg),weightMediumStorageRoots(Kg),weightBigStorageRoots(Kg),totalWeightStorageRoots(Kg),weightMarketableStorageRoots(Kg),totalNumberStorageRoots,totalYield(T/ha),marketableYield(T/ha)
0,Nkolbisson,Nitisol,WFSP,White-fleshed sweet potato,1,No fertilizer,Control,Rep1,2.0,300.06,...,0.18,1.32,1.25,0.83,0.83,2.91,1.66,31,4.85,2.77
1,Nkolbisson,Nitisol,WFSP,White-fleshed sweet potato,1,No fertilizer,Control,Rep2,2.0,304.2,...,0.18,1.29,1.33,0.9,0.74,2.97,1.64,24,4.95,2.73
2,Nkolbisson,Nitisol,WFSP,White-fleshed sweet potato,1,No fertilizer,Control,Rep3,2.5,303.04,...,0.18,1.26,1.44,0.91,0.66,3.01,1.57,30,5.02,2.62
3,Nkolbisson,Nitisol,WFSP,White-fleshed sweet potato,1,Fast compost,FC,Rep1,3.0,185.67,...,0.12,1.53,2.07,2.41,1.07,5.55,3.48,39,9.25,5.8
4,Nkolbisson,Nitisol,WFSP,White-fleshed sweet potato,1,Fast compost,FC,Rep2,3.0,187.2,...,0.12,1.55,2.17,2.22,1.12,5.51,3.34,27,9.18,5.57


# Getting rid of Noises in the dataset

In [6]:
df.drop(["sweetPotato", "varietalDescription", "fertilizerRegime", "weightSmallStorageRoots(Kg)", "weightMediumStorageRoots(Kg)", "weightBigStorageRoots(Kg)"], axis = "columns", inplace = True)

In [7]:
df.head()

Unnamed: 0,location,soilType,varietalCode,fertilizerCode,reps,branchNumber,lengthMainStem(cm),leafAreaIndex,petoileLength(cm),weightAdventitousRoots(Kg),dryBiomass(Kg),totalWeightStorageRoots(Kg),weightMarketableStorageRoots(Kg),totalNumberStorageRoots,totalYield(T/ha),marketableYield(T/ha)
0,Nkolbisson,Nitisol,1,Control,Rep1,2.0,300.06,1.99,28.04,0.18,1.32,2.91,1.66,31,4.85,2.77
1,Nkolbisson,Nitisol,1,Control,Rep2,2.0,304.2,2.0,28.02,0.18,1.29,2.97,1.64,24,4.95,2.73
2,Nkolbisson,Nitisol,1,Control,Rep3,2.5,303.04,2.0,28.01,0.18,1.26,3.01,1.57,30,5.02,2.62
3,Nkolbisson,Nitisol,1,FC,Rep1,3.0,185.67,4.45,31.91,0.12,1.53,5.55,3.48,39,9.25,5.8
4,Nkolbisson,Nitisol,1,FC,Rep2,3.0,187.2,4.29,31.88,0.12,1.55,5.51,3.34,27,9.18,5.57


In [8]:
print(f"The rows and columns in the dataset are {df.shape} respectively.")

The rows and columns in the dataset are (120, 16) respectively.


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   location                          120 non-null    object 
 1   soilType                          120 non-null    object 
 2   varietalCode                      120 non-null    int64  
 3   fertilizerCode                    120 non-null    object 
 4   reps                              120 non-null    object 
 5   branchNumber                      120 non-null    float64
 6   lengthMainStem(cm)                120 non-null    float64
 7   leafAreaIndex                     120 non-null    float64
 8   petoileLength(cm)                 120 non-null    float64
 9   weightAdventitousRoots(Kg)        120 non-null    float64
 10  dryBiomass(Kg)                    120 non-null    float64
 11  totalWeightStorageRoots(Kg)       120 non-null    float64
 12  weightMa

In [10]:
df["fertilizerCode"].unique()

array(['Control', 'FC', 'NPK20-10-10', 'PL', 'Tithonia', 'NPK6-15-28',
       'RHB', 'FC/NPK20-10-10', 'PL/NPK20-10-10', 'RHB/NPK20-10-10'],
      dtype=object)

In [11]:
df["location"] = df["location"].replace({"Nkolbisson": 1, "Njombe": 0})

In [12]:
df["soilType"] = df["soilType"].replace({"Nitisol": 1, "Andosol": 0})

In [13]:
df["varietalCode"] = df["varietalCode"].replace({2: 1, 1: 0})

In [14]:
df["fertilizerCode"] = le.fit_transform(df["fertilizerCode"])

In [15]:
df["reps"] = le.fit_transform(df["reps"])

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   location                          120 non-null    int64  
 1   soilType                          120 non-null    int64  
 2   varietalCode                      120 non-null    int64  
 3   fertilizerCode                    120 non-null    int32  
 4   reps                              120 non-null    int32  
 5   branchNumber                      120 non-null    float64
 6   lengthMainStem(cm)                120 non-null    float64
 7   leafAreaIndex                     120 non-null    float64
 8   petoileLength(cm)                 120 non-null    float64
 9   weightAdventitousRoots(Kg)        120 non-null    float64
 10  dryBiomass(Kg)                    120 non-null    float64
 11  totalWeightStorageRoots(Kg)       120 non-null    float64
 12  weightMa

In [17]:
df.head(2)

Unnamed: 0,location,soilType,varietalCode,fertilizerCode,reps,branchNumber,lengthMainStem(cm),leafAreaIndex,petoileLength(cm),weightAdventitousRoots(Kg),dryBiomass(Kg),totalWeightStorageRoots(Kg),weightMarketableStorageRoots(Kg),totalNumberStorageRoots,totalYield(T/ha),marketableYield(T/ha)
0,1,1,0,0,0,2.0,300.06,1.99,28.04,0.18,1.32,2.91,1.66,31,4.85,2.77
1,1,1,0,0,1,2.0,304.2,2.0,28.02,0.18,1.29,2.97,1.64,24,4.95,2.73


# Splitting the dataset for training and prediction

In [18]:
X = df.drop(["marketableYield(T/ha)", "totalYield(T/ha)"
], axis = "columns")

In [19]:
y = df["marketableYield(T/ha)"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)

# Fitting the data set

## LinearRegression

In [21]:
log = LinearRegression()

In [22]:
log.fit(X_train, y_train)

LinearRegression()

In [23]:
logFit = log.fit(X_train, y_train)

In [24]:
log.predict(X_test)

array([18.28257437, 21.59809084, 11.01640824,  9.00022581, 11.49924929,
        8.33449961,  8.56641175, 12.99891914,  9.83435872, 16.83629637,
        4.93110546,  4.66575177,  4.69984531,  1.86652872,  8.5003905 ,
       10.83138142,  7.6647073 ,  8.48324645, 16.83519756, 26.99786654,
       24.33390871, 21.29799634,  4.99713363,  2.73789633, 17.66779693,
       18.91616771,  5.69950256, 13.33359317,  9.34767787, 15.33259821,
       12.46681621,  9.33017964,  6.46839625, 26.49925119, 18.86538991,
        8.66678648])

In [25]:
logPred = log.predict(X_test)

### Evaluation

In [26]:
mean_absolute_error(y_test, logPred)

0.0027988434620265603

In [27]:
mean_squared_error(y_test, logPred)

1.1106285946288163e-05

## GammaRegressor

In [28]:
gamma = GammaRegressor()

In [29]:
gammaFit = gamma.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [30]:
gamma.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


GammaRegressor()

In [31]:
gammaPred = gamma.predict(X_test)

In [32]:
gamma.predict(X_test)

array([16.43041318, 22.69579082, 11.07595074,  8.89734376, 11.84708999,
        7.45134989,  7.93376985, 11.35912746,  8.76121148, 14.52535315,
        5.51254058,  5.68596688,  5.75930234,  4.41777736,  8.43456679,
       10.98548545,  7.71383512,  7.70360057, 13.90549802, 33.57628483,
       29.30026389, 21.50447153,  7.37558218,  3.99810858, 16.92489973,
       20.34641765,  6.41286392, 12.39988661,  8.81164594, 14.25121788,
       11.44421503,  8.10788341,  6.47553238, 32.19078528, 20.05937912,
        8.03071115])

### Evaluation

In [33]:
mean_absolute_error(y_test, gammaPred)

1.3819648535860174

In [34]:
mean_squared_error(y_test, gammaPred)

4.185134014132596

 ## HuberRegressor

In [35]:
huber = HuberRegressor()

In [36]:
huberFit = huber.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [37]:
huber.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


HuberRegressor()

In [38]:
huberPred = huber.predict(X_test)

In [39]:
huber.predict(X_test)

array([18.21801391, 21.50932836, 10.9448451 ,  8.96153862, 11.52428952,
        8.42581242,  8.49182239, 13.12216337,  9.80597944, 16.89447705,
        4.93450151,  4.70239275,  4.67781596,  2.09125401,  8.54540743,
       10.8246946 ,  7.70214127,  8.48629269, 16.91384118, 26.94432838,
       24.29406023, 21.27482853,  4.93460339,  2.7267311 , 17.62624354,
       18.87236293,  5.60956778, 13.32990082,  9.39238702, 15.30891964,
       12.43588933,  9.4473065 ,  6.46178349, 26.50836656, 18.78069202,
        8.56723615])

### Evaluation

In [40]:
mean_absolute_error(y_test, huberPred)

0.0533179884786705

In [41]:
mean_squared_error(y_test, huberPred)

0.00480420044611966

## PassiveAggressiveRegressor

In [42]:
passive = PassiveAggressiveRegressor()

In [43]:
passiveFit = passive.fit(X_train, y_train)

In [44]:
passive.fit(X_train, y_train)

PassiveAggressiveRegressor()

In [45]:
passivePred = passive.predict(X_test)

In [46]:
passive.predict(X_test)

array([17.44841748, 21.21902024, 11.33603242,  8.47284755, 10.86908786,
        6.99728288,  7.70898952, 12.39386353,  8.83290045, 15.24225065,
        4.1387388 ,  4.77044481,  4.58162458,  2.39162397,  8.15579061,
       10.42515232,  8.1117467 ,  7.53244042, 15.51973115, 25.66639982,
       23.49531968, 20.82265213,  5.66416394,  0.15813481, 16.8050795 ,
       19.3352007 ,  5.36662592, 13.00009759,  9.87503498, 14.84007481,
       12.49761523,  8.2106681 ,  5.80777556, 24.91934329, 19.2981758 ,
        7.96586117])

### Evaluation

In [47]:
mean_absolute_error(y_test, passivePred)

0.7337185590169034

In [48]:
mean_squared_error(y_test, passivePred)

0.7864339515756948

## PoissonRegressor

In [49]:
poison = PoissonRegressor()

In [50]:
poisonFit = poison.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [51]:
poison.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


PoissonRegressor()

In [52]:
poisonPred = poison.predict(X_test)

In [53]:
poison.predict(X_test)

array([16.69335571, 20.98942208, 11.05612724,  8.67059305, 11.46021602,
        7.36028562,  7.54217064, 12.45503   ,  8.75209426, 14.88036567,
        6.17052387,  5.825454  ,  5.73893627,  4.75984233,  7.94008206,
       11.73163591,  8.27338466,  7.22343782, 14.96005571, 32.08087812,
       26.91467052, 19.4403316 ,  7.77250917,  4.19235091, 16.93464809,
       17.26985366,  6.42683979, 12.6507909 ,  9.5183034 , 15.12728443,
       11.37023186,  8.91572991,  6.24373011, 29.63909375, 17.54346856,
        7.99153373])

### Evaluation

In [54]:
mean_absolute_error(y_test, poisonPred)

1.2359955126567184

In [55]:
mean_squared_error(y_test, passivePred)

0.7864339515756948

## QuantileRegressor

In [56]:
quant = QuantileRegressor()

In [57]:
quantFit = quant.fit(X_train, y_train)

In [58]:
quant.fit(X_train, y_train)

QuantileRegressor()

In [59]:
quantPred = quant.predict(X_test)

In [60]:
quant.predict(X_test)

array([11.57222248, 11.10171932, 11.86371825,  8.7798287 , 13.71868895,
        7.6886695 ,  7.86622262, 10.63437753,  7.3183456 , 10.913459  ,
        6.95885519,  6.33094861,  7.25089909, 10.34570621,  8.21635463,
       11.28494187,  5.89427389,  7.79418127,  9.17228303, 10.94172319,
       12.81829813, 11.34952231, 11.77355042,  2.69867638, 10.39422556,
       13.52960685,  6.38380996, 11.28102172, 10.91065564, 11.8619265 ,
       11.04637101,  8.63890738,  7.00672151, 12.16712239, 12.6317241 ,
        9.67568796])

### Evaluation

In [61]:
mean_absolute_error(y_test, quantPred)

4.088420015255931

In [62]:
mean_squared_error(y_test, poisonPred)

2.5760433763847286

## RANSACRegressor

In [63]:
ran = RANSACRegressor()

In [64]:
ranFit = ran.fit(X_train, y_train)

In [65]:
ran.fit(X_train, y_train)

RANSACRegressor()

In [66]:
ranPred = ran.predict(X_test)

In [67]:
ran.predict(X_test)

array([18.28257437, 21.59809084, 11.01640824,  9.00022581, 11.49924929,
        8.33449961,  8.56641175, 12.99891914,  9.83435872, 16.83629637,
        4.93110546,  4.66575177,  4.69984531,  1.86652872,  8.5003905 ,
       10.83138142,  7.6647073 ,  8.48324645, 16.83519756, 26.99786654,
       24.33390871, 21.29799634,  4.99713363,  2.73789633, 17.66779693,
       18.91616771,  5.69950256, 13.33359317,  9.34767787, 15.33259821,
       12.46681621,  9.33017964,  6.46839625, 26.49925119, 18.86538991,
        8.66678648])

### Evaluation

In [68]:
mean_absolute_error(y_test, ranPred)

0.0027988434620273006

In [69]:
mean_squared_error(y_test, quantPred)

34.47317020908632

## TheilSenRegressor

In [70]:
the = TheilSenRegressor()

In [71]:
theFit = the.fit(X_train, y_train)

In [72]:
the.fit(X_train, y_train)

TheilSenRegressor(max_subpopulation=10000)

In [73]:
thePred = the.predict(X_test)

In [74]:
the.predict(X_test)

array([18.28296214, 21.59843646, 11.0167776 ,  9.00027098, 11.49954009,
        8.33401914,  8.56650857, 12.99873776,  9.83454002, 16.8360189 ,
        4.93117417,  4.66540094,  4.69973132,  1.86583963,  8.50002535,
       10.83181022,  7.66443916,  8.48306192, 16.83477946, 26.99809621,
       24.33415035, 21.29815719,  4.99775362,  2.73800493, 17.66807953,
       18.91641552,  5.69975487, 13.33404222,  9.34780958, 15.33295443,
       12.46706309,  9.32984285,  6.46827579, 26.49920798, 18.8657374 ,
        8.66774275])

### Evaluation

In [75]:
mean_absolute_error(y_test, thePred)

0.0027351757137611646

In [76]:
mean_squared_error(y_test, thePred)

1.0842846948518258e-05

## TweedieRegressor

In [77]:
twee = TweedieRegressor()

In [78]:
tweeFit = twee.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [79]:
twee.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


TweedieRegressor()

In [80]:
tweePred = twee.predict(X_test)

In [81]:
twee.predict(X_test)

array([18.00935702, 21.68692843, 11.50115286,  9.10685594, 10.68987123,
        8.0997407 ,  8.94569969, 12.61401902,  9.54058872, 15.59915777,
        5.34888974,  5.75401376,  5.78619483,  3.06173701,  8.7470683 ,
       10.16528652,  8.37840043,  8.77155571, 16.00721369, 26.00217149,
       23.46089365, 21.38692604,  5.61173297,  2.25138533, 16.71201486,
       19.29246901,  6.41264296, 12.77723882,  9.94651193, 14.79782201,
       12.73050131,  9.3906688 ,  6.96902247, 25.22448487, 19.45769814,
        8.18506368])

### Evaluation

In [82]:
mean_absolute_error(y_test, tweePred)

0.575565581833135

In [83]:
mean_squared_error(y_test, tweePred)

0.4455015245041937

## AdaBoostRegressor

In [84]:
ada = AdaBoostRegressor()

In [85]:
adaFit = ada.fit(X_train, y_train)

In [86]:
ada.fit(X_train, y_train)

AdaBoostRegressor()

In [87]:
adaPred = ada.predict(X_test)

In [88]:
ada.predict(X_test)

array([17.27833333, 21.81222222, 11.640625  ,  9.10222222, 11.1225    ,
        8.63363636,  8.97545455, 12.2325    ,  9.41833333, 15.958     ,
        5.27583333,  5.21954545,  5.21954545,  2.35888889,  8.92851852,
       11.1225    ,  7.65733333,  8.92851852, 15.958     , 25.665     ,
       25.665     , 21.81222222,  5.278     ,  2.35      , 18.        ,
       18.235     ,  5.7405    , 12.2325    ,  9.34666667, 15.78833333,
       12.458     ,  9.118     ,  6.28285714, 25.57      , 18.20714286,
        8.92851852])

### Evaluation

In [89]:
mean_absolute_error(y_test, adaPred)

0.49305327581369224

In [90]:
mean_squared_error(y_test, adaPred)

0.3615978182962899

# Choosing the best model and exporting for model deployment

#### After careful consideration and analysis, we discovered that the best model worthy of deployment is our Linear Regression Model.

#### Exporting:

In [111]:
log = LinearRegression()

In [92]:
import pickle

In [112]:
filename = "sweet.sav"
pickle.dump(logFit, open(filename, "wb"))

In [105]:
df["fertilizerCode"].unique()

array([0, 1, 3, 5, 9, 4, 7, 2, 6, 8])

In [107]:
y_train = "y.sav"
pickle.dump(log, open(y_train, "wb"))

In [109]:
X_train = "X.sav"
pickle.dump(log, open(X_train, "wb"))

In [None]:
fd = pd.read_excel("potato.xlsx")

In [None]:
fd["fertilizerCode"].unique()