<a href="https://colab.research.google.com/github/Sneha-s-kumarr/final-project-1/blob/main/Crop_Yield_prediction_regression_task_feature_analysis_05_12_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Display all columns
pd.set_option('display.max_columns', None)

pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
# Loading the dataset
file_url= 'https://raw.githubusercontent.com/ManjuVijayakumar/ICTAK_DSA_FinalProject/refs/heads/main/crop_yield_main.csv'
yield_df = pd.read_csv(file_url, index_col= 'Unnamed: 0')

# First 5 rows
yield_df.head()

In [None]:
# Last 5 rows
yield_df.tail()

In [None]:
# Shape of the dataset
print('Dataset has {} row and {} columns'.format(yield_df.shape[0], yield_df.shape[1]))

In [None]:
# Dataset Information
yield_df.info()

In [None]:
print('No of numeric columns : {} '.format(len(yield_df.select_dtypes(include = np.number).columns)))
print('No of categorical columns : {} '.format(len(yield_df.select_dtypes(include= 'object').columns)))


In [None]:
# Missing values
print('Total no. of missing values : {}'.format(yield_df.isnull().sum().sum()))

In [None]:
# yield_df.columns

### Bookmark1-Manju

**Analyzing Features: Crop, Crop_Year, Season, State, Area, Production, Annual_Rainfall, Fertilizer, Pesticide, Yield, Country, Soil_type**

In [None]:
data= yield_df[['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production',
       'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield', 'Country',
       'Soil_type']]
data.columns

In [None]:
# Missing Values
data.isnull().sum()

In [None]:
# Duplicates
data.duplicated().sum()

In [None]:
# Statistical Analysis
data.describe().T

In [None]:
data.describe(include= 'object').T

**Insights**

* Dataset has 19689 row and 45 columns
* Categorical: 5 and Numerical: 40
* Total no. of missing values : 61732
* Statistical summary of first 12 columns shows presence of positive skewness in Area, Production, Annual_Rainfall, Fertilizer, Pesticide and Yield



#### **Univariate Analysis**
It's the study of single variable in dataset. The underlying distribution of variable can be analyzed.

In [None]:
# Categorical columns
cat_cols= data.select_dtypes(include= 'object').columns

# Numerical columns
num_cols= data.select_dtypes(include= np.number).columns

In [None]:
# Categorical column analysis
for feature in cat_cols:
  print(data[feature].value_counts(),'\n')

In [None]:
# Country has unique value. This unique feature will be irrelevant to the model's predictive performance and can be ignored/dropped
# Country

In [None]:
fig, ax= plt.subplots(3, 3, figsize= (13, 10))
ax= ax.flatten()

for i, c in enumerate(data[num_cols]):
  sns.distplot(x= data[c], ax= ax[i], axlabel= c, color= 'seagreen')
plt.delaxes(ax[7])
plt.delaxes(ax[8])
plt.suptitle('Distribution plot of Numerical variables')
plt.show()

In [None]:
for feat in data[num_cols]:
  print(feat,'',data[feat].skew())

In [None]:
fig, ax= plt.subplots(3, 3, figsize= (13, 10))
ax= ax.flatten()

for i, c in enumerate(data[num_cols]):
  if c in ['Crop_Year', 'Annual_Rainfall']:
    sns.distplot(x= data[c], ax= ax[i], axlabel= c, color= 'seagreen')
  else:
    sns.distplot(x= np.log1p(data[c]), ax= ax[i], axlabel= c, color= 'seagreen')
plt.delaxes(ax[7])
plt.delaxes(ax[8])
plt.suptitle('Distribution plot of Numerical variables')
plt.show()

In [None]:
cat_cols

In [None]:
fig, ax= plt.subplots(3, 2, figsize= (20, 15))
ax= ax.flatten()

for i, c in enumerate(data[cat_cols]):
  sns.countplot(y= data[c], ax= ax[i], color= 'seagreen')
  # ax[i].tick_params(axis='x', labelrotation=90)
plt.delaxes(ax[5])
plt.suptitle('Count plot of Categorical variables')

plt.show()

In [None]:
# Count details for each crop. Top 10

crop_counts= yield_df.Crop.value_counts().sort_index()

plt.figure(figsize= (10, 5))
plt.bar(crop_counts.index, crop_counts.values, color= 'skyblue')
plt.xlabel('Crop')
plt.ylabel('Count')
plt.title('Crop Name')
plt.xticks(rotation= 90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

#### **Insights**

*   Country has unique value. This unique feature will be irrelevant to the model's predictive performance and can be ignored/dropped
*   Distribution plot shows features Area, Production, Fertilizer, Pesticide and Yield are highly positively skewed.
*   Log transformation applied to features that are highly skewed. Log-transforming it helps stabilize variance, reduce skewness, and makes relationships with predictors more linear, which is better for visualization, correlation and modeling.





#### **Bivariate Analysis**
Its the statistical method to analyze the relationship between two features.

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
# Annual Rainfall vs Yield
sns.regplot (
    x= np.log1p(data.Annual_Rainfall),
    y= np.log1p(data.Yield),
    scatter_kws= {'alpha':0.5, 'color':'skyblue'},
    line_kws= {'color':'red'}
)
plt.title('Annual Rainfall vs Yield(Log Scale)')
plt.show()

In [None]:
# Fertilizer vs Yield
sns.regplot (
    x= np.log1p(data.Fertilizer),
    y= np.log1p(data.Yield),
    scatter_kws= {'alpha':0.5, 'color':'skyblue'},
    line_kws= {'color':'red'}
)
plt.title('Log Transformed Relationship between Fertilizer Usage vs Yield')
plt.show()

In [None]:
sns.regplot (
    x= np.log1p(data.Pesticide),
    y= np.log1p(data.Yield),
    scatter_kws= {'alpha':0.5, 'color':'skyblue'},
    line_kws= {'color':'red'}
)
plt.title('Pesticide Usage (Log Scale) vs Yield(Log Scale)')
plt.show()

In [None]:
sns.regplot (
    x= np.log1p(data.Production),
    y= np.log1p(data.Yield),
    scatter_kws= {'alpha':0.5, 'color':'skyblue'},
    line_kws= {'color':'red'}
)
plt.title('Production (Log Scale) vs Yield(Log Scale)')
plt.show()

In [None]:
sns.regplot (
    x= np.log1p(data.Area),
    y= np.log1p(data.Yield),
    scatter_kws= {'alpha':0.5, 'color':'skyblue'},
    line_kws= {'color':'red'}
)
plt.title('Area (Log Scale) vs Yield(Log Scale)')
plt.show()

In [None]:
# Yield trend over the year

df_year= data.groupby('Crop_Year', as_index= False)['Yield'].sum()

plt.plot(df_year['Crop_Year'].astype(str),df_year['Yield'],color='blue', linestyle='dashed',
         marker='o',markersize=10, markerfacecolor='yellow')
plt.xlabel('Year')
plt.ylabel('Yield (Log Scale)')
plt.title('Measure of Yield over the year')
plt.xticks(rotation= 90)
plt.grid()
plt.show()

In [None]:
df_state= yield_df.groupby('State', as_index= False)['Yield'].sum().sort_values(by= 'Yield', ascending= False)

sns.barplot(x= df_state.State, y= np.log1p(df_state.Yield), palette= 'Set2')
plt.xticks(rotation = 90)
plt.ylabel('Yield (Log Scale)')
plt.title('Crop Yield by State')
plt.show()

In [None]:
df_crop= data.groupby('Crop', as_index= False)['Yield'].sum().sort_values(by= 'Yield', ascending= False).head(20)

# plt.figure(figsize= (15, 10))
sns.barplot(x= df_crop.Crop, y= np.log1p(df_crop.Yield), palette= 'Set2')
plt.xticks(rotation = 90)
plt.ylabel('Yield (Log Scale)')
plt.title('Yield (Log Scale) distribution across Crops (Top 20)')
plt.show()

In [None]:
df_season= data.groupby('Season', as_index= False)['Yield'].sum().sort_values(by= 'Yield', ascending= False).head(20)

# plt.figure(figsize= (15, 10))
sns.barplot(x= df_season.Season, y= np.log1p(df_season.Yield), palette= 'Set2')
plt.xticks(rotation = 90)
plt.ylabel('Yield (Log Scale)')
plt.title('Yield (Log Scale) distribution across Season')
plt.show()

In [None]:
df_soil= data.groupby('Soil_type', as_index= False)['Yield'].sum().sort_values(by= 'Yield', ascending= False).head(20)

# plt.figure(figsize= (15, 10))
sns.barplot(x= df_soil.Soil_type, y= np.log1p(df_soil.Yield), palette= 'Set2')
plt.xticks(rotation = 90)
plt.ylabel('Yield (Log Scale)')
plt.title('Yield (Log Scale) distribution across Soil Type')
plt.show()

**Insights**

1. The regplot between Annual Rainfall and Yield gives a nearly horizontal line which means there is no meaninful linear relationship between the two variables. The scatter points are widely dispersed, suggesting the features does not explain variations in yield.

2. There is a slight positive slope when analyzing features like Pesticide, Area and Fertilizer. This indicates as Variale X increases, Variable Y also tends to increase.

3. The regplot between Production and Yield shows a moderatately strong positive correlation (r = 0.57)

4. Year 2014 shows highest yield.

5. West Bengal records the highest overall yield among all States

6. Coconut has the highest yield among all crops in the dataset followed by Sugarcane.


In [None]:
data[num_cols].corr()

In [None]:
sns.heatmap(data[num_cols].corr(), annot= True, cmap= 'coolwarm', fmt= '.3f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
cat_cols
# Label encoding - ordinal
# One hot encoding - less cardinality - dummy variable
print(data['Crop'].nunique())
print(data['Season'].nunique())
print(data['Soil_type'].nunique())

print(data['State'].nunique())

#### **Insights**

1. The regplot between Annual Rainfall and Yield gives a nearly horizontal line which means there is no meaninful linear relationship between the two variables. The scatter points are widely dispersed, suggesting the features does not explain variations in yield.

2. There is a slight positive slope when analyzing features like Pesticide, Area and Fertilizer. This indicates as Variable X increases, Variable Y also tends to increase.

3. The regplot between Production and Yield shows a moderatately strong positive correlation (r = 0.57)

4. Year 2014 shows highest yield.

5. West Bengal records the highest overall yield among all States

6. Coconut has the highest yield among all crops in the dataset followed by Sugarcane.


---



*  The heatmap of numerical variables shows a **strong positive correlation between features Pesticide and Fertilizer, Area and Pesticide, Area and Fertilizer. Can try dropping one of them while modelling** to see if there is any performance improvement.
*  The slight positive slope displayed by features Area, Pesticide and Fertilizer was just noise (r ≈ 0.002). The correlation values indicates no meaningful linear relationship.


---



Crop_Year showed almost no linear relationship with Yield (correlation = 0.003). Since the project is not time-series based and year does not causally influence yield, the feature was excluded from modeling to avoid noise and overfitting.

Country has unique value. This unique feature will be irrelevant to the model's predictive performance and can be ignored/dropped.

Feature	  Keep/Drop	Reason	Recommended Encoding <br>
Crop (55)	Target / Frequency Encoding <br>
State (30)	One-hot (tree models) or Target (linear) <br>
Season (6)		One-hot <br>
Soil Type (7)	One-hot

Target Encoding replaces each category with the mean of the target-variable (Yield) of that category <br>
**category_encoders** library


---
**Features that can be dropped:** <br>
Country<br>
Crop_Year<br>
The heatmap of numerical variables shows a **strong positive correlation between features Pesticide and Fertilizer, Area and Pesticide, Area and Fertilizer. Can try dropping one of them while modelling** to see if there is any performance improvement.


### Bookmark2-Darsana

In [None]:
temp_cols = [
    'Annual_mean_temp','Apr_temp','Aug_temp','Dec_temp','Feb_temp',
    'Jan_temp','Jul_temp','Jun_temp','Mar_temp','May_temp','Monsoon_temp'
]

temp_df = yield_df[temp_cols]

In [None]:
temp_df.describe()

#### **Univariate Analysis**

In [None]:
temp_df.hist(figsize=(14,10))
plt.suptitle("Histograms for Temperature Variables", fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(14,6))
sns.boxplot(data=temp_df)
plt.title("Boxplot of Temperature Variables", fontsize=16)
plt.xticks(rotation=45)
plt.show()

#### **Insights**

Summary of Temperature Distributions:<br>
* Most temperature variables are unimodal and show clear seasonal patterns.<br>
* Summer and monsoon months (May, Jun, Jul, Aug, Monsoon, Annual_mean_temp) are left-skewed, with higher temperatures (20–30°C) dominating. <br>
* Winter months (Dec, Jan, Feb) are right-skewed, with most temperatures in the lower ranges (5–10°C) and a tail extending to higher values.<br>
* March is nearly symmetric around 10–15°C, and April is multimodal, indicating two different temperature peaks (10–15°C and 20–25°C).

Boxplot Summary
* Compared 11 temperature variables.<br>
* Feb_temp, Dec_temp, and May_temp show noticeable outliers.<br>
* Median differs across months (Annual_mean_temp ≈ 25°C, Apr_temp ≈ 30°C).<br>
* Monsoon_temp and Jul_temp have the smallest IQR → least temperature variation.

#### **Bivariate Analysis**

In [None]:
corr_matrix = temp_df.corr()
corr_matrix

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap of Temperature Variables", fontsize=16)
plt.show()

In [None]:

temp_vars = [
    'Annual_mean_temp', 'Apr_temp', 'Aug_temp', 'Dec_temp', 'Feb_temp',
    'Jan_temp', 'Jul_temp', 'Jun_temp', 'Mar_temp', 'May_temp', 'Monsoon_temp'
]

# Loop for bivariate analysis between each temp variable and Yield
for var in temp_vars:
    print(f"\n===== BIVARIATE ANALYSIS: {var} vs Yield =====")

    # Correlation
    corr = yield_df[var].corr(yield_df['Yield'])
    print(f"Correlation ({var} vs Yield): {corr}")

    # Scatter plot
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=yield_df[var], y=yield_df['Yield'])
    plt.title(f"{var} vs Yield")
    plt.xlabel(var)
    plt.ylabel("Yield")
    plt.show()


#### **Insights**

Correlation Summary

* Annual_mean_temp strongly correlates with all months → yearly mean temp follows monthly patterns.
* Mar–Apr–May are highly linked → smooth rise into summer.
* Jun–Jul–Aug show very high correlation → stable summer temperatures.
* Dec–Jan–Feb move almost identically → consistent winter pattern.
* Monsoon_temp strongly ties with summer → influenced by pre-monsoon heat.
* Lowest correlations occur between opposite seasons (summer vs winter).
* **Annual mean temp" shows a strong positive correlation (values near 0.9 or 1) with almost all monthly temperatures**
* "Dec temp" and "Jan temp" are also very strongly correlated with each other (0.99).
* The correlation between "Aug temp" and "Jan temp" is 0.71, which is lower than many other pairings.
* This suggests that while all temperatures are related, some monthly temperatures are more closely linked than others.



* The correlation between Annual_mean_temp and crop yield is 0.049. This is a very weak positive correlation, very close to zero.<br>
**In simple words: Annual average temperature does NOT influence crop yield in this dataset.**
* The correlation between Apr_temp and crop yield is 0.0347. This is also an extremely weak positive correlation, almost zero.
* The correlation between Aug_temp and crop yield is 0.0348.<br>
**Simple conclusion: April and August mean temperature does NOT influence crop yield in this dataset.**
* The correlation between Dec_temp and crop yield is 0.0695.
* The correlation between Feb_temp and crop yield is 0.0632. This is a very weak positive correlation, close to zero.
* The correlation between Jan_temp and crop yield is 0.0702.
**Simple conclusion: December, February and January mean temperature features does NOT influence crop yield in this dataset.**

### Bookmark3-Sneha

**Analyzing Features** -

Nov_temp, Oct_temp, Post_Monsoon_temp, Sep_temp,
              Summer_temp, Winter_temp, JAN_rain, FEB_rain,
              MAR_rain, APR_rain, MAY_rain

In [None]:
feature_columns = ['Sep_temp', 'Oct_temp', 'Nov_temp',  'Post_Monsoon_temp',
              'Summer_temp', 'Winter_temp', 'JAN_rain', 'FEB_rain',
              'MAR_rain', 'APR_rain', 'MAY_rain']

Target = 'Yield'

In [None]:
feature_df = yield_df[feature_columns]
feature_df.head()

In [None]:
feature_df.isnull().sum()

In [None]:
feature_df.describe().T

#### **Univariate Analysis**

In [None]:
n_cols = 3

n_rows = (len(feature_df.columns) + n_cols - 1) // n_cols

plt.figure(figsize=(15, 5*n_rows))

for i, col in enumerate(feature_df.columns,1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(feature_df[col], kde=True)
    plt.title(f"Distribution of {col}")
    plt.tight_layout()

plt.show()

In [None]:
for feat in feature_df:
    print(feat, feature_df[feat].skew())

In [None]:
def detect_outliers(col):
    Q1 = feature_df[col].quantile(0.25)
    Q3 = feature_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = feature_df[(feature_df[col] < lower) | (feature_df[col] > upper)]
    print(f"{col}: {len(outliers)} outliers")

num_cols = feature_df.columns

for col in num_cols:
    detect_outliers(col)

#### **Insights**

***Temperature Columns (Sep, Oct, Nov, Summer, Winter, Post-Monsoon Temps)***

All temperature features are negatively skewed, meaning most temperature values are on the higher side.
This matches typical Indian climate ranges (around 24°C to 32°C).
Only a small number of records show cooler temperatures — likely from hilly or North-East regions.
So overall, temperatures are generally warm with few unusually low values.
Rainfall Columns (JAN–MAY Rainfall)

***Rainfall Columns***

Rainfall features show positive (right) skewness, meaning rainfall is usually low, but there are some months with extremely high rainfall.
Most locations receive very little rainfall in these months.
A few entries represent heavy rainfall events, causing the long tail on the right.
This is normal because rainfall is seasonal and uneven, unlike temperature.

#### **Bivariate Analysis**

In [None]:
temp_cols = ['Sep_temp','Oct_temp','Nov_temp','Post_Monsoon_temp',
             'Summer_temp','Winter_temp']
state_temp_mean = yield_df.groupby('State')[temp_cols].mean()

state_temp_mean.sort_values('Winter_temp').head(10)

In [None]:
feature_df.corr()

In [None]:
feature_df.loc[:, 'Yield'] = yield_df['Yield']
# feature_df.head()
plt.figure(figsize=(10, 6))
sns.heatmap(feature_df.corr(), annot= True, cmap= 'coolwarm', fmt= '.1f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
feature_df['Yield'] = yield_df['Yield']
feature_df.head()
for col in ["Sep_temp", "Oct_temp", "Nov_temp", "Post_Monsoon_temp", "Summer_temp", "Winter_temp"]:
    print("here")
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=feature_df[col], y=feature_df['Yield'])
    plt.title(f"{col} vs Yield")
    plt.xlabel(col)
    plt.ylabel("Yield")
    plt.show()

In [None]:
correlations = feature_df.corrwith(yield_df['Yield']).sort_values(ascending=True)
correlations

In [None]:
cols = [
    'Post_Monsoon_temp','Summer_temp',
    'Winter_temp'
]

# df[cols].isna().sum()

yield_df[cols].isna().mean() * 100

In [None]:
feature_df[yield_df[['Post_Monsoon_temp','Summer_temp','Winter_temp']].isna().all(axis=1)]

#### **Insights**

**Temperatures (Sep, Oct, Nov, Summer, Winter, Post-Monsoon Temps)**

All temperature features are highly correlated with each other → they move together and capture similar climate information.
They show a small positive correlation with Yield, meaning:
**Slightly warmer post-monsoon temperatures can be linked to slightly higher yields.
But this relationship is very weak and not reliable on its own.**


**Rainfall (JAN–MAY)**
Rainfall features show near-zero correlation with Yield.
This suggests:
Rainfall in these months does not influence crop productivity in the dataset.
These months likely fall outside the main crop-growing season, so their impact on yield is minimal.


**Temperature vs Rainfall**<br>
These two groups show negative correlation:
Hot regions tend to receive less rainfall.
Cooler regions tend to receive more rainfall.
This reflects normal seasonal and regional climate patterns.


**Yield** <br>
Yield shows almost no strong correlation with any of the 11 features.
This indicates:
**No single weather variable can predict yield accurately.**

If we try to use monthly temperatures, we would ideally need all 12 months for proper coverage — but adding so many features increases complexity and redundancy.

**Seasonal aggregated temperatures (Summer, Winter, Post-Monsoon) also show strong multicollinearity, so they are not ideal individually.
Temperature may still have a small logical influence on yield, and you can use a combined measure like the mean seasonal temperature if needed.**
Overall, yield is driven more by other agricultural factors

### Bookmark4-Ashir

In [None]:
#List rainfall columns
rain_cols = [
    'JUN_rain','JUL_rain','AUG_rain','SEP_rain','OCT_rain','NOV_rain','DEC_rain',
    'Jan-Feb_rain','Mar-May_rain','Jun-Sep_rain','Oct-Dec_rain'
]

target="Yield"

df_selected = yield_df[rain_cols + [target]]
df_selected.head()

#### **Univariate Analysis**

In [None]:
#histograms
df_selected.hist(figsize=(15, 12))
plt.suptitle("Univariate Distribution (Rainfall + Yield)", fontsize=18)
plt.show()

In [None]:
#boxplots
plt.figure(figsize=(15, 10))
df_selected.boxplot()
plt.title("Boxplot of Rainfall Variables and Yield")
plt.xticks(rotation=45)
plt.show()

#### **Insights**

Dataset Structure
The dataset contains rainfall columns + the target column Yield.<br>

Rainfall values are a combination of:<br>
.Monthly rainfall<br>
.Seasonal rainfall groups<br>

This automatically creates high correlation between many columns because:<br>
Seasonal rainfall = sum of multiple months.<br>
Example: Jun-Sep_rain ≈ JUN + JUL + AUG + SEP.<br>

Including both monthly and seasonal rainfall will cause multicollinearity


---


Rainfall Features:
* Most rainfall features show right-skewed distributions.

Yield:

* Highly positively skewed.
* Only a few records have very high yields.

This suggests:
* Perform log transform for modeling.


---


* Yield also shows outliers, representing high productivity regions.
* Rainfall columns contain many outliers, common in climate data.
* Boxplots confirm high variability across States and seasons.

#### **Bivariate Analysis**

In [None]:
#Scatter Plots
for col in rain_cols:
    plt.figure(figsize=(6,4))
    sns.scatterplot(data=df_selected, x=col, y=target)
    plt.title(f"{col} vs Yield")
    plt.show()

In [None]:
#Correlation Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df_selected.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap: Rainfall (Jan–Dec) vs Yield")
plt.show()

In [None]:
#High correlation with yield
correlations = df_selected.corr()[target].sort_values(ascending=False)
correlations

#### **Insights**

* Monthly rainfall shows very weak correlation with Yield.
* Most scatterplots look random, indicating:<br>
    Rain alone does not determine yield.
* Yield depends on: <br>
    temperature,irrigation,soil quality,fertilizers
* Jun-Sep_rain shows slightly more influence, because monsoon season matters most

---

Correlation

A. Rainfall vs Yield

* Jun-Sep_rain has the highest correlation with Yield (still weak).

* Monthly rainfall columns have very low correlation (0.0–0.15).

* Seasonal rainfall performs better than individual months.

B. Rainfall vs Rainfall (Multicollinearity)

You will see many correlations > 0.80 because:

* 1. Jun-Sep_rain is strongly correlated with JUN_rain,JUL_rain,AUG_rain,SEP_rain
* 2. Oct-Dec_rain highly correlates with OCT_rain,NOV_rain,DEC_rain
* 3. Monthly rainfall columns are also correlated with each other

---

Why this is a problem?

**This causes multicollinearity, which:
inflates model variance reduces model stability
makes feature importance unreliable.**


**Conclusion**

* Only 1 rainfall feature has any meaningful relationship with Yield.
* All others should be dropped because their correlation values with crop yield are almost zero.
* OCT-DEC Rainfall shows high relation.

# Phase 2

#### 1) **Missing Value Identification**

In [None]:
import missingno

missingno.bar(yield_df)

In [None]:
# Calculate percentage of missing values per columns
missing_percent= (yield_df.isnull().sum() / len(yield_df)) * 100

# Combine with Counts
missing_data= pd.DataFrame(
    {
        'Missing Values' : yield_df.isnull().sum(),
        'Percentage (%)' : missing_percent
    }
).sort_values(by= 'Percentage (%)', ascending= False)

missing_data= missing_data[missing_data['Missing Values'] > 0]

print(missing_data)

In [None]:
# missing_data['Percentage (%)'].plot(kind='bar', figsize=(10,5))
# plt.title('Percentage of Missing Values per Column')
# plt.ylabel('Percentage (%)')
# plt.show()

**Analyzing Temperature columns**

In [None]:
tempe_cols =[col for col in yield_df.columns if 'temp' in col.lower()]
tempe_cols

# rows where all temperature columns are null
mask_all_temp_null = yield_df[tempe_cols].isnull().all(axis=1)
num_rows_all_temp_null = mask_all_temp_null.sum()
pct_all_temp_null = num_rows_all_temp_null / len(data) * 100

# Inspect distribution by State and Year
print(yield_df[mask_all_temp_null].groupby('State').size().sort_values(ascending=False).head(20))
print(yield_df[mask_all_temp_null].groupby('Crop_Year').size().sort_values())

In [None]:
# Jammu and Kashmir, Teleanga - all temperatue column missing
# this suggest structural missingness (NMAR)
# States that are climatologicaly similar are Himachal Pradesh and Andhra Pradesh
# If missingness concentrated in a small number of states (e.g., J&K): consider dropping those state rows OR imputing from climatologically similar states (document justification).

In [None]:
total_rows= yield_df.shape[0]

jk_rows= yield_df[yield_df.State == 'Jammu and Kashmir'].shape[0]
tl_rows= yield_df[yield_df.State == 'Telangana'].shape[0]
jk_percent= (jk_rows / total_rows) * 100
tl_percent= (tl_rows / total_rows) * 100

print('Jammu and Kashmir : Missing Percent - {}'.format(round(jk_percent, 3)))
print('Telangana : Missing Percent - {}'.format(round(tl_percent, 3)))

**Findings**

*   Jammu and Kashmir, Telangana - all temperatue column missing
*   This suggests structural missingness (NMAR) -- occurs when the probability of a value being missing is dependent on the value itself
* If missingness concentrated in a small number of states (e.g., J&K): **consider dropping those state rows** OR **imputing from climatologically similar states** (document justification).
* **Climatologically similar state for Jammu and Kashmir is Himachal Pradesh and for Telangana is Andhra Pradesh**

***1. Drop J&K and Telangana***

In [None]:
yield_df_1= yield_df.query("State not in ['Jammu and Kashmir', 'Telangana']")
yield_df_1.shape

***2. Imputing from climatologically similar states***

Instead of dropping ~5% data, climatologically similar states (Himachal Pradesh for J&K and Andhra Pradesh for Telangana) were used as reference for month-wise median imputation.

In [None]:
state_month_medians = yield_df.groupby('State')[tempe_cols].median()

In [None]:
yield_df_2= yield_df.copy()

# JK with HP
for col in tempe_cols:
    yield_df_2.loc[yield_df_2['State']=='Jammu and Kashmir', col] = \
        yield_df_2.loc[yield_df_2['State']=='Jammu and Kashmir', col].fillna(
            state_month_medians.loc['Himachal Pradesh', col]
        )

# Telangana with AP
for col in tempe_cols:
    yield_df_2.loc[yield_df_2['State']=='Telangana', col] = \
        yield_df_2.loc[yield_df_2['State']=='Telangana', col].fillna(
            state_month_medians.loc['Andhra Pradesh', col]
        )

In [None]:
# rows where all temperature columns are null
mask_all_temp_null = yield_df_2[tempe_cols].isnull().all(axis=1)
num_rows_all_temp_null = mask_all_temp_null.sum()
pct_all_temp_null = num_rows_all_temp_null / len(data) * 100

# Inspect distribution by State and Year
print(yield_df_2[mask_all_temp_null].groupby('State').size().sort_values(ascending=False).head(20))

**Analyzing Rainfall Features**

In [None]:
rain_cols = ['JAN_rain', 'FEB_rain', 'MAR_rain','APR_rain', 'MAY_rain', 'JUN_rain',
             'JUL_rain', 'AUG_rain', 'SEP_rain','OCT_rain', 'NOV_rain', 'DEC_rain',
             'Jan-Feb_rain', 'Mar-May_rain','Jun-Sep_rain', 'Oct-Dec_rain']


all_null_rain = yield_df_2[rain_cols].isnull().all(axis= 1).sum()
print('No of rows with no rainfall details available:', all_null_rain)
# Annual_Rainfall

In [None]:
null_by_year = (
            yield_df_2.groupby('Crop_Year')[rain_cols]
            .apply(lambda x: x.isnull().all().all())  # True if *all* rain cols null for the entire year
              )

# Filter years where condition is True
years_with_all_null_rain = null_by_year[null_by_year].index.tolist()

years_with_all_null_rain

In [None]:
null_rain_rows = yield_df_2[yield_df_2[rain_cols].isnull().any(axis=1)]
filtered_rows = null_rain_rows[~null_rain_rows['Crop_Year'].isin([2018, 2019, 2020])]

filtered_rows.describe(include= 'object').T

In [None]:
missing_data= pd.DataFrame(
    {
        'Missing Values' : yield_df_2.isnull().sum(),
        'Percentage (%)' : missing_percent
    }
).sort_values(by= 'Percentage (%)', ascending= False)

missing_data= missing_data[missing_data['Percentage (%)'] > 10]

print(missing_data)

**Findings**

*   Rainfall data for the year 2018, 2019 and 2020 are not available for any of the States.
* Rainfall data for the State Puducherry is not available for any of the years
*   Imputation technique? - *Annual Rainfall data available. Monthly/period rainfall can be reconstructed by allocating the known annual total into months using state-level historical month/annual fractions.*
*   This method requires imputation across 3 full years and 30 States

**Conclusion**
*   **Annual Rainfall feature shows very low correlation with Yield** (max r = 0.027)
* **Incase of Puducherry, imputing a fake rainfall data for 20 years will make rainfall predictions unrealistic (587 rows)**
*   Giving the high missingness concentrated in 2018 - 2020 and the low predictive value, **rainfall features can be dropped from further analysis**
*   This prevents imputation noise and retains a clean dataset of ~ 16900 rows




In [None]:
yield_df_2= yield_df_2.dropna(subset= rain_cols, how= 'all')

missing_data= pd.DataFrame(
    {
        'Missing Values' : yield_df_2.isnull().sum(),
        'Percentage (%)' : missing_percent
    }
).sort_values(by= 'Percentage (%)', ascending= False)

missing_data= missing_data[missing_data['Missing Values'] > 0]

print(missing_data)

In [None]:
print('No of rows after missing value removal : {}'.format(yield_df_2.shape[0]))

#### 2) **Outlier Detection**

In [None]:
def outlier_percent(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    return round(len(outliers) / len(df) * 100, 2)

numeric_cols = yield_df_2.select_dtypes(include='number').columns

outlier_table = pd.DataFrame({
    'Feature': numeric_cols,
    'Outlier_%': [outlier_percent(yield_df_2, col) for col in numeric_cols]
})

outlier_table.sort_values('Outlier_%', ascending=False).T

**Findings**

*   Univarient analysis of each feature shows the presence of outliers in almost all features.
*   The above table gives an overview of 45 features and respective outlier %
*   Rainfall, Temperature vary naturally
*   Extreme weather = extreme values
*   Crop Yield also varies highly across States and Years

**Decision**

* Data comes from government records which are actual measured values, not errors
* Removing them may distort real-world patterns
* **Removing or capping the outliers could distort actual patterns and negatively affect model performance. Therefore outliers were analyzed but not removed.**

*(Tree based models handle outliers well)*


#### 3) **Encoding Categorical Features**

In [None]:
cat_cols

**Findings**

As per the earlier detailed analysis done:

* Crop --> 55 unique crops
* Season --> 7
* State --> 30
* Country --> 1
* Soil_type --> 7

---
**Decision**

*   **Country** --> drop
*   **Mutual Information method** --> This method measures how much information a categorical feature gives about Yield. <br> MI close to 0 - feature not helpful <br> MI high - feature contains predictive power


---
It calculates a smoothed target mean:

Encoded value = weighted average of:

The global mean of the target

The mean target for that specific category (using previous rows only)




As per the detailed analysis done below are the <br>
Feature Keep/Drop Reason Recommended Encoding <br>
Crop (55) Target / Frequency Encoding <br>
State (29) One-hot (tree models) or Target (linear) <br>
Season (6) One-hot <br>
Soil Type (7) One-hot

In [None]:
# Mutual Information
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder

X= yield_df_2[['Crop', 'Season', 'State', 'Soil_type']]
y= yield_df_2.Yield

# Encode temporarily
encoded= X.apply(LabelEncoder().fit_transform)

mi= mutual_info_regression(encoded, y)
mi_table= pd.Series(mi, index= X.columns).sort_values(ascending= False)
print(mi_table)

In [None]:
# Encoding Crop & State variable using CatBoost Encoder
print('Unique count of Crop :{}'.format(yield_df_2.Crop.nunique()))
print(yield_df_2.Crop.unique())
print('Unique count of State :{}'.format(yield_df_2.State.nunique()))

In [None]:
yield_df_2.select_dtypes(include= 'object').columns

In [None]:
!pip install category_encoders

In [None]:
# from category_encoders import CatBoostEncoder

# enc= CatBoostEncoder(cols=['Crop'])
# df_enc= yield_df_2[['Crop','Yield']]
# df_enc['cropenc']= enc.fit_transform(yield_df_2['Crop'], yield_df_2['Yield'])

In [None]:
from category_encoders import CatBoostEncoder

encoder= CatBoostEncoder(cols= ['State', 'Crop'])
df_encoded= yield_df_2.copy()
df_encoded[['Crop', 'State']]= encoder.fit_transform(yield_df_2[['Crop', 'State']], yield_df_2['Yield'])

df_encoded.head()

In [None]:
# OHE for Season
# df_encoded.Season= df_encoded.Season.str.strip()

# season_dummies= pd.get_dummies(df_encoded.Season, prefix='Season', drop_first=False, dtype= 'int64')
# df_encoded= pd.concat([df_encoded, season_dummies], axis= 1)
# df_encoded.drop('Season', axis=1, inplace= True)

# use sklearn library

In [None]:
# drop first category to avoid dummy trap
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False, drop='first')

# Fit and transform
df_encoded.Season= df_encoded.Season.str.strip()
season_encoded= ohe.fit_transform(df_encoded[['Season']])
season_cols = [f"Season_{cat}" for cat in ohe.categories_[0][1:]]  # skip first category
season_cols
season_df = pd.DataFrame(season_encoded, columns=season_cols, index=df_encoded.index)
df_encoded = pd.concat([df_encoded.drop(columns=['Season']), season_df], axis=1)

In [None]:
df_encoded.sample(3)

**Encoding Decisions**

* **CatBoostEncoder** used to encode categorical variables with high cardinality - **State and Crop**
* CatBoost Encoder performs "ordered target encoding" (no leakage).
* **One Hot encoding** for **Season**
* **Soil_type** has very l**ess mutual information (0.09769). Can be dropped**

* Features to be dropped - Crop_Year, Country, Soil_type

---
dummy variable trap - high multicollinearity if all dummy variables are kept (Linear, ridge, lasso) <br>
keep all dummy columns - tree based models, not affected, sometimes trees learn better split


In [None]:
df_encoded.sample(3)

#### 4) **Feature Reduction**

In [None]:
df_encoded.sample(3)

**Findings**

As per the detailed analysis done we have found that below features can be dropped.

*   **Crop_Year** - shows no linear relationship with Yield (0.003)
*   **Country** - unique value
* **Soil_type** - mutual information (MI) with target variable is almost 0
* Monthwise temp features (**Jan_temp .. Dec_temp**) - high correlation with subsequent season/period mean temperature
* Monthwise rainfall features (**JAN_rain .. DEC_rain**) - high correlation with subsequent season/period rainfall detail

In [None]:
df_encoded.columns

In [None]:
df_reduced= df_encoded.drop(columns= ['Crop_Year', 'Country', 'Soil_type', 'Jan_temp', 'Feb_temp',
                          'Mar_temp', 'Apr_temp', 'May_temp', 'Jun_temp', 'Jul_temp',
                          'Aug_temp', 'Sep_temp', 'Oct_temp', 'Nov_temp', 'Dec_temp',
                          'JAN_rain', 'FEB_rain', 'MAR_rain', 'APR_rain', 'MAY_rain',
                          'JUN_rain', 'JUL_rain', 'AUG_rain', 'SEP_rain', 'OCT_rain',
                          'NOV_rain', 'DEC_rain'
                        ])
df_reduced.rename(columns= {'Season_Whole Year': 'Season_Whole_Year'}, inplace= True)

df_reduced.columns

In [None]:
# Dataset information
df_reduced.info()

In [None]:
plt.figure(figsize= (15,12))
sns.heatmap(df_reduced.corr(), annot= True, cmap= 'coolwarm', fmt= '.2f')
plt.title('Correlation Heatmap')
plt.show()

**Findings**

Dataset has only numeric features <br>

After feature reduction, there exists seasonal temperatures and annual mean temperature, monsoon rainfall and annual rainfall, pesticide and area, fertilizer and area with correlation values greater than 0.9

**Decision**

The dataset to be handled at modeling level

* Keep all the features for tree based models - not affected by multicollinearity
* Reduce features for linear-based models - drop highly correlated siblings


#### 5) **Standardization**

In [None]:
df_reduced.sample(3)

In [None]:
df_reduced.describe().T

**Conclusion**

*   Standardization to be done after splitting the dataset into X_train & X_test
*   Reason - Features have different scales (mm, °C, etc)
*  Standarization handles outliers better than normalization
*  Values are not bounded and can take both positive/negative ranges

---

Linear models requires scaled data <br>
Tree-based models can be applied on unscaled data





In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X= df_reduced.drop(['Yield'], axis= 1)
# y= df_reduced.Yield

# Split
# X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.20, random_state= 42)

# Initialize
scaler= StandardScaler()

# Transform
# X_train_scaled= scaler.fit_transform(X_train)
# X_test_scaled= scaler.fit_transform(X_test)

#### 6) **Feature Engineering**

In [None]:
df_reduced.sample(3)

In [None]:
df_reduced.columns

**Input based features**

1. Fertilizer per area
2. Pesticide per area

These are meaningful because input intensity matters more than absolute quantity

In [None]:
# fertilzer_per_area
df_reduced['Fertilizer_per_area']= df_reduced.Fertilizer / df_reduced.Area

# pesticide per area
df_reduced['Pesticide_per_area']= df_reduced.Pesticide / df_reduced.Area

In [None]:
# Correlation with Yield
df_reduced[['Fertilizer_per_area', 'Pesticide_per_area', 'Yield']].corr()

In [None]:
X= df_reduced[['Fertilizer_per_area', 'Pesticide_per_area']]
y= df_reduced.Yield

mi= mutual_info_regression(X, y)
mi

# Model Building

In [None]:
df_reduced.sample(2)

In [None]:
df_reduced.columns

A pipeline bundles steps like preprocessing, feature transformations and model building into one step.<br>

It removes messiness, protects you from leakage, and makes model comparison simple and elegant.

Metrics
RMSE - A higher root mean square error indicates a greater error in prediction.

A higher R-squared (\(R^{2}\)) score indicates a lower error in prediction
A lower R-squared score indicates a higher error in prediction.

In [None]:
X= df_reduced.drop(columns= ['Yield'], axis= 1)
y= df_reduced.Yield

# Split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.20, random_state= 29)

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Pipelines for models that need scaline
scaled_models= {
    'LinearRegression': LinearRegression(),
    'Ridge' : Ridge(alpha= 1.0),
    'Lasso' : Lasso(alpha= 0.01)
}

# Create the Pipeline
pipelines = {
    name: Pipeline ([
    ('scaler', StandardScaler()),
    ('model' , model)
    ])
    for name, model in scaled_models.items()
    }

results = {}

for name, pipe in pipelines.items():
    # Fit the pipeline on training data
    pipe.fit(X_train, y_train)
    # Predict on test data
    preds= pipe.predict(X_test)

    # Evaluate the model
    train_score= round(pipe.score(X_train, y_train), 4)
    test_score= round(pipe.score(X_test, y_test), 4)

    mse= mean_squared_error(y_test, preds)
    rmse= round(np.sqrt(mse), 4)
    r2= round(r2_score(y_test, preds), 4)

    results[name] = {'Train score':train_score, 'Test score': test_score, 'RMSE': rmse, 'R2': r2}


results_df= pd.DataFrame(results).T
print(results_df)

In [None]:
df_reduced.columns

In [None]:
# Area, Monsoon_temp, 'Post_Monsoon_temp', 'Summer_temp', 'Winter_temp', 'Jan-Feb_rain',
      #  'Mar-May_rain', 'Jun-Sep_rain', 'Oct-Dec_rain',
      # 'Fertilizer_per_area', 'Pesticide_per_area'

df_temp= df_reduced[['Crop', 'Area', 'State', 'Production', 'Annual_Rainfall', 'Fertilizer',
       'Pesticide', 'Yield', 'Annual_mean_temp', 'Season_Kharif',
       'Season_Rabi', 'Season_Summer', 'Season_Whole_Year', 'Season_Winter'
       ]]

df_temp.columns

In [None]:
# df_temp.to_csv('yield_file_for_model.csv')

In [None]:
# df_temp - Dropped hightly correlated features

X_temp= df_temp.drop(columns= ['Yield'], axis= 1)
y_temp= df_temp.Yield

# Split
X_train_te, X_test_te, y_train_te, y_test_te= train_test_split(X_temp, y_temp, test_size= 0.20, \
                                                               random_state= 29)

# results_te = {}

# for name, pipe in pipelines.items():
#     # Fit the pipeline on training data
#     pipe.fit(X_train_te, y_train_te)
#     # Predict on test data
#     preds= pipe.predict(X_test_te)

#     # Evaluate the model
#     train_score= round(pipe.score(X_train_te, y_train_te), 4)
#     test_score= round(pipe.score(X_test_te, y_test_te), 4)

#     mse= mean_squared_error(y_test_te, preds)
#     rmse= round(np.sqrt(mse), 4)
#     r2= round(r2_score(y_test_te, preds), 4)

#     results_te[name] = {'Train score':train_score, 'Test score': test_score, 'RMSE': rmse, 'R2': r2}


# results_df_te= pd.DataFrame(results_te).T
# print(results_df_te)


In [115]:
# Fine tuning Ridge
from sklearn.model_selection import GridSearchCV, KFold

ridge_pipe= Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# hyper-parameter grid
param_grid = {
    'ridge__alpha': np.logspace(-5, 5, 30)
  }

# cross validation
cv= KFold(n_splits= 5, shuffle= True, random_state= 29)

# Grid Search
ridge_gs= GridSearchCV( estimator= ridge_pipe, param_grid= param_grid, cv= cv,
                       scoring= 'r2', n_jobs= -1, verbose= 1)

ridge_gs.fit(X_train, y_train)

print('Best R2 score:', ridge_gs.best_score_)
print("Best Alpha:", ridge_gs.best_params_['ridge__alpha'])

best_ridge_model= ridge_gs.best_estimator_

Best R2 score: 0.8121240495188168
Best Alpha: 174.33288221999908


In [116]:
ridge_gs.best_estimator_

The optimal Ridge regularization strength was α = 174.332, indicating that the dataset required substantial regularization due to multicollinearity and variability across features. Ridge helped stabilize the model and improved generalization compared to ordinary linear regression.



In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Decision Tree
pipe_dt = Pipeline([
    ('model', DecisionTreeRegressor())
])

# Random Forest
pipe_rf = Pipeline([
    ('model', RandomForestRegressor())
])

# Gradient Boosting
pipe_gb = Pipeline([
    ('model', GradientBoostingRegressor())
])

param_dt = {
    'model__max_depth': [None, 5, 10, 20],
    'model__min_samples_split': [2, 5, 10]
}

param_rf = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
}

param_gb = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1, 0.2],
    'model__max_depth': [2, 3, 4]
}

gs_dt = GridSearchCV(pipe_dt, param_dt, cv=5, scoring='r2', n_jobs=-1)
gs_rf = GridSearchCV(pipe_rf, param_rf, cv=5, scoring='r2', n_jobs=-1)
gs_gb = GridSearchCV(pipe_gb, param_gb, cv=5, scoring='r2', n_jobs=-1)

# gs_dt.fit(X_train, y_train)
gs_rf.fit(X_train, y_train)
# gs_gb.fit(X_train, y_train)

# -------------------------------------------
# Evaluate each model
# -------------------------------------------
models = {
    # "Decision Tree": gs_dt,
    "Random Forest": gs_rf
    # ,
    # "Gradient Boosting": gs_gb
}

for name, gs in models.items():
    y_pred = gs.predict(X_test)
    print(f"\n{name}")
    print("Best Params:", gs.best_params_)
    print("Train R2:", gs.score(X_train, y_train))
    print("Test R2:", r2_score(y_test, y_pred))

Random Forest
Best Params: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Train R2: 0.9914328378355629
Test R2: 0.9734970092604418

Tree-based models can overfit, but when the test score stays close like this, it suggests:

Your features are very predictive

Relationships in data are strong

Noise is limited

The tree model wasn’t forced into chaotic depth

Your hyperparameters are working for you

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Random Forest
pipe_rf = Pipeline([
    ('model', RandomForestRegressor())
])

param_rf = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
}

gs_rf = GridSearchCV(pipe_rf, param_rf, cv=5, scoring='r2', n_jobs=-1)

gs_rf.fit(X_train_te, y_train_te)

y_pred_te= gs_rf.predict(X_test_te)
print("Best Params:", gs_rf.best_params_)
print("Train R2:", gs_rf.score(X_train_te, y_train_te))
print("Test R2:", r2_score(y_test_te, y_pred_te))

Best Params: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Train R2: 0.9927504975957453
Test R2: 0.983405186164715

As far as model behaviour goes: Yes.
It’s perfectly reasonable and statistically healthy.

Your model is:

Learning real structure

Not memorising noise

Benefiting from reduced multicollinearity

Simpler, cleaner, and more stable

In [None]:
# best_rf= gs_rf.best_estimator_
# rf_model= best_rf.named_steps['model']

# importances= rf_model.feature_importances_
# features= X_train_te.columns

# feat_imp_df= pd.DataFrame(
#     {
#         'Feature': features,
#         'Importance' : importances
#     }
# ).sort_values(by= 'Importance', ascending= False)

# print(feat_imp_df)



# Model Building for UI

In [None]:
df_reduced.columns

In [None]:
df_temp1= df_reduced[['Crop','Area','Production','Annual_Rainfall', 'Fertilizer',
                   'Pesticide','Yield','Annual_mean_temp']]
df_temp1.columns

In [None]:
df_temp1.sample(3)

In [None]:
X_temp1= df_temp1.drop(columns= ['Yield'], axis= 1)
y_temp1= df_temp1.Yield

# Split
X_train_te1, X_test_te1, y_train_te1, y_test_te1= train_test_split(X_temp1, y_temp1, test_size= 0.20, \
                                                               random_state= 29)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Random Forest
pipe_rf = Pipeline([
    ('model', RandomForestRegressor())
])

param_rf = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
}

gs_rf1 = GridSearchCV(pipe_rf, param_rf, cv=5, scoring='r2', n_jobs=-1)

gs_rf1.fit(X_train_te1, y_train_te1)

y_pred_te1= gs_rf1.predict(X_test_te1)
print("Best Params:", gs_rf1.best_params_)
print("Train R2:", gs_rf1.score(X_train_te1, y_train_te1))
print("Test R2:", r2_score(y_test_te1, y_pred_te1))

In [None]:
gs_rf1.predict([[1.69316,3487000.00000,1357500,5857.90000,331857790.00000,1080970,26.55]])

# 1.69316	3487000.00000	1357500	5857.90000	331857790.00000	1080970.00000	0.44208	26.55000

In [None]:
df_temp1.to_csv('yield_file_for_model1.csv')