## Decision Tree Regression

In [1]:
import pandas as pd

In [3]:
data = pd.read_csv("Cotton.csv")

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166585 entries, 0 to 166584
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Region                  166585 non-null  object 
 1   Soil_Type               166585 non-null  object 
 2   Crop                    166585 non-null  object 
 3   Rainfall_mm             166585 non-null  float64
 4   Temperature_Celsius     166585 non-null  float64
 5   Fertilizer_Used         166585 non-null  bool   
 6   Irrigation_Used         166585 non-null  bool   
 7   Weather_Condition       166585 non-null  object 
 8   Days_to_Harvest         166585 non-null  int64  
 9   Yield_tons_per_hectare  166585 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 10.5+ MB


In [7]:
data.isnull()

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
166580,False,False,False,False,False,False,False,False,False,False
166581,False,False,False,False,False,False,False,False,False,False
166582,False,False,False,False,False,False,False,False,False,False
166583,False,False,False,False,False,False,False,False,False,False


In [9]:
data.isnull().sum()

Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64

In [11]:
data.isnull().sum().sum()

0

In [13]:
df = pd.DataFrame(data)

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
categorical_features = ["Region", "Soil_Type", "Fertilizer_Used", "Irrigation_Used", "Weather_Condition"]

In [17]:
label_encoders = {}

In [18]:
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

In [23]:
X = data.drop(columns=["Yield_tons_per_hectare","Crop"])
y = data["Yield_tons_per_hectare"]

In [25]:
X

Unnamed: 0,Region,Soil_Type,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest
0,3,4,897.077239,27.676966,0,1,0,122
1,0,4,145.300681,19.755535,1,1,0,141
2,2,3,607.150252,15.562163,1,1,2,136
3,1,1,416.898632,23.190810,1,1,2,95
4,0,1,990.267439,24.072052,0,1,2,110
...,...,...,...,...,...,...,...,...
166580,1,3,217.387025,27.306320,0,0,0,137
166581,2,2,528.002127,37.384126,0,1,1,62
166582,1,0,959.604272,32.334733,0,0,0,109
166583,2,2,315.605610,30.119779,1,1,0,118


In [27]:
y

0         6.555816
1         4.367612
2         6.525186
3         4.858924
4         6.187396
            ...   
166580    1.001849
166581    3.571633
166582    5.097432
166583    4.802915
166584    5.763182
Name: Yield_tons_per_hectare, Length: 166585, dtype: float64

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [31]:
from sklearn.tree import DecisionTreeRegressor

In [33]:
model = DecisionTreeRegressor()

In [35]:
model.fit(X_train, y_train)

In [36]:
predictions = model.predict(X_test)

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [41]:
MSE = mean_squared_error(y_test, predictions)

In [43]:
print(f"Mean Squared Error: {MSE}")

Mean Squared Error: 0.5328024149623037


In [45]:
MAE = mean_absolute_error(y_test, predictions)

In [47]:
print(f"Mean Absolute Error: {MAE}")

Mean Absolute Error: 0.5822081477258271


In [49]:
RSquared = r2_score(y_test, predictions)

In [51]:
print(f"RSquared Score: {RSquared}")

RSquared Score: 0.815454997670698


# Hyperparameter Tuning using GridSearchCV

In [57]:
from sklearn.model_selection import GridSearchCV

In [65]:
param_grid = {
    "max_depth": [3, 5, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ['auto', 'sqrt', 'log2', None]
}

In [67]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring="neg_mean_squared_error")

In [71]:
grid_search.fit(X_train, y_train)

180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\suren\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\suren\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\suren\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\suren\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

In [73]:
grid_search.best_params_

{'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10}

In [75]:
best_model = grid_search.best_estimator_

In [77]:
best_model.fit(X_train, y_train)

In [79]:
predictions = best_model.predict(X_test)

In [81]:
MSE = mean_squared_error(y_test, predictions)

In [83]:
print(f"Mean Squared Error: {MSE}")

Mean Squared Error: 0.2635642450846912


In [87]:
MAE = mean_absolute_error(y_test, predictions)

In [89]:
print(f"Mean Absolute Error: {MAE}")

Mean Absolute Error: 0.4084062272014661


In [91]:
RSquared = r2_score(y_test, predictions)

In [93]:
print(f"RSquared Score: {RSquared}")

RSquared Score: 0.9075794758028285


# Hyperparameter Tuning with Optuna

In [96]:
import optuna

In [102]:
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    #Define and Train the Model
    model = DecisionTreeRegressor(
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        max_features = max_features,
        random_state = 42
    )
    model.fit(X_train, y_train)
    #Predict and Evaluate using Mean Squared Error
    predictions = model.predict(X_test)
    MSE = mean_squared_error(y_test, predictions)
    return MSE

In [106]:
#Create an Optuna Study and Optimize it
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials = 100)

[I 2025-02-28 10:00:06,812] A new study created in memory with name: no-name-e520e255-6185-4553-84f8-e7ccd8dbfe6d
[I 2025-02-28 10:00:07,014] Trial 0 finished with value: 0.3311063549867079 and parameters: {'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.3311063549867079.
[I 2025-02-28 10:00:07,197] Trial 1 finished with value: 0.4085313820182957 and parameters: {'max_depth': 19, 'min_samples_split': 18, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.3311063549867079.
[I 2025-02-28 10:00:07,362] Trial 2 finished with value: 0.40743456512626286 and parameters: {'max_depth': 10, 'min_samples_split': 14, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.3311063549867079.
[I 2025-02-28 10:00:08,065] Trial 3 finished with value: 0.35816313205484956 and parameters: {'max_depth': 20, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': None}. Best is tri

In [108]:
study.best_params

{'max_depth': 9,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [110]:
study.best_value

0.2578706773084074

In [112]:
best_params = study.best_params

In [114]:
best_model = DecisionTreeRegressor(**best_params)

In [118]:
best_model.fit(X_train, y_train)

In [122]:
predictions = best_model.predict(X_test)

In [124]:
MSE = mean_squared_error(y_test, predictions)

In [126]:
print(f"Mean Squared Error: {MSE}")

Mean Squared Error: 0.2578706773084074


In [128]:
MAE = mean_absolute_error(y_test, predictions)

In [130]:
print(f"Mean Absolute Error: {MAE}")

Mean Absolute Error: 0.4042926718067476


In [132]:
RSquared = r2_score(y_test, predictions)

In [134]:
print(f"RSquared Score: {RSquared}")

RSquared Score: 0.9095759625351892
