In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import numpy as np
import joblib

In [None]:
# Load the pre-encoded dataset from CSV into 'train_test_df'.
# NOTE(review): the path is hard-coded and looks like a mounted Google Drive folder in Colab —
# confirm the drive is mounted before running this cell.
train_test_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/CSCT MASTER PROJECT/df_encoded.csv'
)

In [None]:
# Capture the dtype of every column in 'train_test_df' (a Series indexed by column name)
# in 'training_data_dtypes' so it can be displayed in the next cell.
training_data_dtypes = train_test_df.dtypes

In [None]:
# Show the column dtypes under a heading; sep="\n" puts the heading and the table on separate lines.
print("Training Data Column Data Types:", training_data_dtypes, sep="\n")

Training Data Column Data Types:
State                 int64
District              int64
Crop                  int64
Year                  int64
Season                int64
Area                float64
Area Units            int64
Production          float64
Production Units      int64
Yield               float64
dtype: object


In [None]:
# Build the feature matrix 'features' by dropping every column that must not be a model input:
# - 'Yield': the prediction target itself. Leaving it in leaks the answer to the models
#   (the tell-tale symptom is a linear-regression R-squared of exactly 1.0 on the test set).
# - 'Production' and 'Production Units': total production data, which is not used as a
#   feature for yield prediction.
# NOTE: outputs saved in this notebook from runs where 'Yield' was still a feature (and any
# model file written from such a run) are invalid and must be regenerated.
features = train_test_df.drop(columns=['Production', 'Production Units', 'Yield'])

In [None]:
# The target the models learn to predict: the 'Yield' column, selected as a Series.
target = train_test_df.loc[:, 'Yield']

In [None]:
# Hold out part of the data for evaluation.
# - features / target: model inputs and the value to predict (crop yield).
# - test_size=0.25: 25% of the rows go to the test set, the remaining 75% to training.
# - random_state=42: fixed seed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

In [None]:
## Linear Regression Model

In [None]:
# Wrap the Linear Regression estimator in a scikit-learn Pipeline.
# - A Pipeline is an ordered list of (name, step) pairs.
# - Only one step exists for now: the estimator, registered under the name 'model'.
# - Preprocessing or feature-engineering steps can be inserted ahead of it later.
pipeline = Pipeline(steps=[('model', LinearRegression())])

In [None]:
# Sanity check of the columns the models will be trained on.
# The target column 'Yield' must NOT appear in this list; if it does, the features leak the target.
print(X_train.dtypes)

State           int64
District        int64
Crop            int64
Year            int64
Season          int64
Area          float64
Area Units      int64
Yield         float64
dtype: object


In [None]:
# Train the pipeline (and therefore its Linear Regression step) on the training split.
# fit() returns the fitted pipeline, which the notebook displays as this cell's output.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict yield for the held-out test features with the fitted pipeline;
# the predictions are stored in 'y_pred'.
y_pred = pipeline.predict(X=X_test)

In [None]:
# Preview the linear-regression predictions (the array repr is abbreviated for long arrays).
y_pred

array([18.00002183,  1.        ,  0.90322581, ...,  0.66438425,
        0.63333333,  2.61380078])

In [None]:
# Mean Squared Error of the linear-regression predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 4.336350545299682e-25


In [None]:
# R-squared (coefficient of determination) of the linear-regression predictions.
# A score of exactly 1.0 on held-out data is a red flag for target leakage:
# verify that 'Yield' is not among the feature columns.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-squared: {r_squared}")

R-squared: 1.0


In [None]:
# Mean Absolute Error of the linear-regression predictions on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 2.6418641101789177e-13


In [None]:
## Decision Tree Regression

In [None]:
# Create the decision-tree regressor; random_state=42 fixes the seed so
# repeated runs build the same tree.
tree_regressor = DecisionTreeRegressor(random_state=42)

In [None]:
# Train the decision tree on the training split.
# fit() returns the fitted estimator, which the notebook displays as this cell's output.
tree_regressor.fit(X=X_train, y=y_train)


In [None]:
# Predict yield for the test features with the fitted decision tree.
# Note: this rebinds 'y_pred', replacing the linear-regression predictions stored earlier.
y_pred = tree_regressor.predict(X=X_test)

In [None]:
# Preview the decision-tree predictions (the array repr is abbreviated for long arrays).
y_pred

array([18.        ,  1.        ,  0.90322581, ...,  0.66438356,
        0.63333333,  2.61375661])

In [None]:
# Mean Squared Error (MSE) of the decision-tree predictions on the test split.
# Errors are squared before averaging, so a few large misses dominate this number.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 48.27348607681996


In [None]:
# Mean Absolute Error (MAE) of the decision-tree predictions on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.12371247835482


In [None]:
# R-squared (coefficient of determination) of the decision-tree predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: 0.9999406511102099


In [None]:
## Random Forest Regressor

In [None]:
# Create the random-forest regressor:
# - n_estimators=100: the forest is built from 100 trees.
# - random_state=42: fixed seed so repeated runs build the same forest.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Train the random forest on the training split.
# fit() returns the fitted estimator, which the notebook displays as this cell's output.
rf_regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict yield for the test features with the fitted random forest;
# kept in its own variable 'y_pred_rf'.
y_pred_rf = rf_regressor.predict(X=X_test)

In [None]:
# Persist the fitted random forest to a .pkl file with joblib.
# dump() returns the list of written file names, shown as this cell's output.
# NOTE(review): the destination is a hard-coded path that looks like a mounted
# Google Drive folder in Colab — confirm it exists before running.
joblib.dump(
    value=rf_regressor,
    filename='/content/drive/MyDrive/CSCT MASTER PROJECT/random_forest_model.pkl',
)

['/content/drive/MyDrive/CSCT MASTER PROJECT/random_forest_model.pkl']

In [None]:
# Preview the random-forest predictions (the array repr is abbreviated for long arrays).
y_pred_rf

array([18.00001688,  1.        ,  0.90322581, ...,  0.6643835 ,
        0.63333385,  2.61385859])

In [None]:
# Mean Absolute Error (MAE) of the random-forest predictions on the test split.
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
print("Mean Absolute Error:", mae_rf)

Mean Absolute Error: 0.07655571807898291


In [None]:
# R-squared (coefficient of determination) of the random-forest predictions.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
print("R-squared:", r2_rf)

R-squared: 0.9999806727439337


In [None]:
# Mean Squared Error (MSE) of the random-forest predictions on the test split.
# Errors are squared before averaging, so a few large misses dominate this number.
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
print("Mean Squared Error:", mse_rf)

Mean Squared Error: 15.720496708884033
