In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [2]:
# Load the preprocessed crop dataset and separate the features from the target.
TARGET_COLUMN = 'Yield/unitArea'  # regression target; every other column is a feature

Crop_ds = pd.read_csv("crop_preprocess_ds.csv")
indep_X = Crop_ds.drop(columns=TARGET_COLUMN)
dep_Y = Crop_ds[TARGET_COLUMN]

In [None]:
# Frequency encoding for 'Crop' column (disabled)
#crop_counts = Crop_ds['Crop'].value_counts()
#Crop_ds['crop_freq_encode'] = Crop_ds['Crop'].map(crop_counts)

In [None]:
# Frequency encoding for 'State' column
#state_counts = Crop_ds['State'].value_counts()
#Crop_ds['state_freq_encode'] = Crop_ds['State'].map(state_counts)

In [None]:
#season_counts=Crop_ds['Season'].value_counts()
#Crop_ds['season_freq_encode']=Crop_ds['Season'].map(season_counts)

In [None]:
#Crop_ds.drop(['Crop', 'State','Season'], axis=1, inplace=True)

In [3]:
# Preview the loaded frame; the notebook renders a truncated view of its rows and columns.
Crop_ds

Unnamed: 0,Crop_Year,Area_Hectares,Production_Tonnes,Rainfall_mm,Fertilizer_Kgms,Pesticide_Kgms,Crop_Arhar/Tur,Crop_Bajra,Crop_Banana,Crop_Barley,...,State_Puducherry,State_Punjab,State_Sikkim,State_Tamil Nadu,State_Telangana,State_Tripura,State_Uttar Pradesh,State_Uttarakhand,State_West Bengal,Yield/unitArea
0,1997,73814.0,56708,2051.4,7024878.38,22882.34,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.796087
1,1997,6637.0,4685,2051.4,631643.29,2057.47,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.710435
2,1997,796.0,22,2051.4,75755.32,246.76,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.238333
3,1997,19656.0,309194,2051.4,1870661.52,6093.36,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.118226
4,1997,1739.0,794,2051.4,165500.63,539.09,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.420909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19568,1998,4000.0,2000,1498.0,395200.00,1160.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.500000
19569,1998,1000.0,3000,1498.0,98800.00,290.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.000000
19570,1997,187823.5,309194,1356.2,25054187.10,49993.35,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.285000
19571,1997,187823.5,5488,1356.2,25054187.10,49993.35,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.016667


In [None]:
# Feature matrix: every column of Crop_ds except the 'Yield/unitArea' target.
indep_X

In [None]:
# Target series: the 'Yield/unitArea' column of Crop_ds.
dep_Y

In [4]:
# Hold out 25% of the rows for testing (seeded so the split is repeatable), then
# standardize with statistics learned from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    indep_X,
    dep_Y,
    test_size=0.25,
    random_state=0,
)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# PCA: project the standardized features onto 12 principal components.
# The projection is fitted on the training split only and then reused, unchanged,
# for the test split.
# random_state pins the randomized SVD solver that scikit-learn's "auto" policy
# may pick for this problem size, so the components are reproducible across runs.
pca = PCA(n_components=12, random_state=0)
X_train_PCA = pca.fit_transform(X_train_scaled)
X_test_PCA = pca.transform(X_test_scaled)

In [6]:
# Model Evaluation Functions
def r2_prediction(regressor, X_test, y_test):
    """Return the R^2 score of a fitted regressor on the given evaluation data.

    Parameters
    ----------
    regressor : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True target values matching ``X_test``.

    Returns
    -------
    float
        ``r2_score(y_test, regressor.predict(X_test))``.
    """
    return r2_score(y_test, regressor.predict(X_test))

# Model Training Functions
def Decision(X_train, y_train, X_test, y_test):
    """Fit a seeded decision-tree regressor and return its R^2 on the test data.

    The tree is trained on ``(X_train, y_train)`` with ``random_state=0`` and
    scored on ``(X_test, y_test)`` through ``r2_prediction``.
    """
    tree = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
    return r2_prediction(tree, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    """Fit a seeded 10-tree random-forest regressor and return its R^2 on the test data.

    The forest is trained on ``(X_train, y_train)`` with ``n_estimators=10`` and
    ``random_state=0`` and scored on ``(X_test, y_test)`` through ``r2_prediction``.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
    return r2_prediction(forest, X_test, y_test)

def gradient(X_train, y_train, X_test, y_test):
    """Fit a seeded gradient-boosting regressor and return its R^2 on the test data.

    The model is trained on ``(X_train, y_train)`` with ``random_state=0`` and
    scored on ``(X_test, y_test)`` through ``r2_prediction``.
    """
    booster = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
    return r2_prediction(booster, X_test, y_test)



In [7]:
# Score each model family on the same PCA-projected train/test split.
# NOTE: despite the "acc_" prefix these hold R^2 scores, not accuracies.
pca_split = (X_train_PCA, y_train, X_test_PCA, y_test)
acc_decision = Decision(*pca_split)
acc_random = random(*pca_split)
acc_gradient = gradient(*pca_split)

In [8]:
# Display results
# Build the one-row summary in a single constructor call so the columns get a
# numeric dtype; filling an empty frame cell by cell leaves them as object dtype.
result = pd.DataFrame(
    {
        'DecisionTree': acc_decision,
        'RandomForest': acc_random,
        'GradientBoosting': acc_gradient,
    },
    index=['n=12'],
)


In [9]:
# Values are R^2 scores on the held-out test split, using the 12-component PCA features.
print(result)

     DecisionTree RandomForest GradientBoosting
n=12     0.707988     0.842753         0.626578


In [10]:
# Sanity check: the projected training matrix should have one column per principal component.
X_train_PCA.shape

(14679, 12)

In [11]:
# Sanity check: the target length should equal the row count of X_train_PCA.
y_train.shape

(14679,)

In [12]:
# Loadings of the fitted PCA: one row per principal component.
components = pca.components_

In [13]:
# Label each loading with the original feature (training column) it belongs to.
components_df = pd.DataFrame(components, columns=X_train.columns)

In [14]:
# Inspect the loadings table (rows: principal components, columns: original features).
components_df

Unnamed: 0,Crop_Year,Area_Hectares,Production_Tonnes,Rainfall_mm,Fertilizer_Kgms,Pesticide_Kgms,Crop_Arhar/Tur,Crop_Bajra,Crop_Banana,Crop_Barley,...,State_Odisha,State_Puducherry,State_Punjab,State_Sikkim,State_Tamil Nadu,State_Telangana,State_Tripura,State_Uttar Pradesh,State_Uttarakhand,State_West Bengal
0,-0.031664,0.463473,0.418637,-0.184927,0.463146,0.46153,0.011886,0.032211,-0.003794,-0.016537,...,-0.00619,-0.067031,0.026364,-0.042428,0.032088,-0.009163,-0.05079,0.074967,-0.044748,-0.011519
1,-0.094436,0.040786,0.15737,0.322851,0.037857,0.03798,-0.06405,-0.069223,0.152777,-0.067626,...,0.011332,0.001013,-0.100411,0.001816,0.067819,-0.077733,-0.017854,-0.075655,-0.067138,0.027652
2,0.025847,0.010621,0.015693,-0.021908,0.013883,0.010889,-0.082626,-0.09621,-0.016072,0.108699,...,-0.025737,-0.043154,0.012924,-0.021409,-0.024266,0.011738,-0.00249,0.019472,0.010201,0.048808
3,0.021625,0.060143,0.013127,0.489721,0.060955,0.062998,0.058259,-0.080398,-0.061226,-0.039354,...,0.148858,-0.047789,-0.079379,0.11956,-0.198583,-0.028503,0.188038,-0.076634,0.001846,0.107112
4,0.071083,-0.052009,-0.032832,-0.11936,-0.049235,-0.050061,-0.117548,0.006165,0.012604,-0.059989,...,0.485994,0.130332,-0.023204,-0.096095,-0.037822,0.033356,-0.07141,0.045454,-0.002867,0.062381
5,0.308929,-0.002663,-0.001093,0.063993,0.014004,0.00965,0.018433,-0.004366,0.036064,-0.121622,...,-0.218336,0.187108,-0.048658,-0.040117,0.001745,0.239719,0.039351,0.09572,0.112027,-0.168297
6,0.055941,-0.006479,0.025675,0.028917,-0.001826,-0.003502,-0.070797,0.039883,-0.011911,0.092315,...,-0.016718,-0.220734,-0.116565,0.11529,-0.047131,-0.026191,-0.092723,0.204269,0.25856,0.014185
7,0.090751,0.001629,-0.016939,0.021828,0.007396,0.005056,-0.012372,-0.072323,-0.148974,0.00245,...,-0.137753,0.009487,-0.040843,-0.18143,-0.047374,0.083915,0.094421,0.116025,-0.162186,0.227328
8,-0.225629,-0.03606,0.059603,-0.006169,-0.048398,-0.045652,0.029549,0.129254,0.15762,-0.090371,...,-0.090348,0.02293,0.102918,-0.105523,-0.204277,-0.22632,-0.059965,0.238751,0.063219,-0.024668
9,0.068406,-0.040505,0.064956,-0.03382,-0.037937,-0.037665,0.077102,0.070792,-0.018498,0.032138,...,-0.045782,0.013142,-0.065508,-0.03628,0.049998,0.113759,0.019435,-0.059025,-0.220446,-0.106323


In [15]:
# Number of highest-loading original features to keep per principal component.
N_TOP_FEATURES = 2

# Rank features by the magnitude of their loading: the sign of a PCA loading is
# arbitrary, so a large negative weight contributes as much as a large positive one.
# Each entry lists feature column positions, strongest contributor first.
top_features_indices = [
    np.argsort(np.abs(component))[::-1][:N_TOP_FEATURES]
    for component in components
]

In [16]:
# Each array holds the feature column positions picked for one principal component.
top_features_indices

[array([1, 4], dtype=int64),
 array([63,  3], dtype=int64),
 array([61, 58], dtype=int64),
 array([ 3, 81], dtype=int64),
 array([84, 62], dtype=int64),
 array([ 0, 62], dtype=int64),
 array([65, 35], dtype=int64),
 array([65, 35], dtype=int64),
 array([66, 91], dtype=int64),
 array([69, 25], dtype=int64),
 array([86, 22], dtype=int64),
 array([71, 35], dtype=int64)]

In [17]:
# Translate the per-component column positions into feature names,
# keeping one inner list per principal component.
selected_input_variables = [
    [X_train.columns[position] for position in component_positions]
    for component_positions in top_features_indices
]

In [18]:
# Feature names picked per principal component, in component order.
print(selected_input_variables)


[['Area_Hectares', 'Fertilizer_Kgms'], ['Season_Whole Year ', 'Rainfall_mm'], ['Season_Rabi       ', 'Crop_Wheat'], ['Rainfall_mm', 'State_Meghalaya'], ['State_Odisha', 'Season_Summer     '], ['Crop_Year', 'Season_Summer     '], ['State_Arunachal Pradesh', 'Crop_Oilseeds total'], ['State_Arunachal Pradesh', 'Crop_Oilseeds total'], ['State_Assam', 'State_Uttar Pradesh'], ['State_Delhi', 'Crop_Jowar'], ['State_Punjab', 'Crop_Groundnut'], ['State_Gujarat', 'Crop_Oilseeds total']]


In [19]:
from collections import Counter

# Pool the per-component picks into one flat list and count how often each
# feature was chosen across all principal components.
flattened_features = []
for component_features in selected_input_variables:
    flattened_features.extend(component_features)
feature_counts = Counter(flattened_features)

# Keep the 5 most frequently chosen features (change 5 to the desired number).
top_features = feature_counts.most_common(5)
top_feature_names = [name for name, _count in top_features]
print(top_feature_names)


['Crop_Oilseeds total', 'Rainfall_mm', 'Season_Summer     ', 'State_Arunachal Pradesh', 'Area_Hectares']


In [20]:
# Restrict the (unscaled) training features to the shortlisted columns.
X_train_selected = X_train.loc[:, top_feature_names]


In [21]:
# Training rows restricted to the selected feature columns.
X_train_selected

Unnamed: 0,Crop_Oilseeds total,Rainfall_mm,Season_Summer,State_Arunachal Pradesh,Area_Hectares
4696,0,1201.9,0,0,187823.5
4676,0,1201.9,0,0,19796.0
14251,0,1223.2,0,0,8326.0
13356,0,1348.7,0,0,49638.0
17037,0,1031.7,0,0,63.0
...,...,...,...,...,...
9225,0,907.8,1,0,163458.0
13123,0,1216.9,0,0,3408.0
9845,0,1362.5,0,0,1000.0
10799,0,891.6,0,0,187823.5


In [22]:
# Sanity check: expect one column per entry in top_feature_names.
X_train_selected.shape

(14679, 5)

In [23]:
# Apply the same column shortlist to the (unscaled) test features.
X_test_selected = X_test.loc[:, top_feature_names]


In [24]:
# Test rows restricted to the selected feature columns.
X_test_selected

Unnamed: 0,Crop_Oilseeds total,Rainfall_mm,Season_Summer,State_Arunachal Pradesh,Area_Hectares
19137,0,1503.4,0,0,5786.0
5027,0,912.4,0,0,76420.0
11685,0,1771.8,0,0,27050.0
12493,0,1366.8,0,0,101457.0
763,0,928.4,0,0,104244.0
...,...,...,...,...,...
15305,0,1112.0,0,0,8587.0
15132,0,965.3,0,0,187823.5
9355,0,516.9,0,0,127.0
6876,0,1484.3,0,0,187823.5


In [25]:
# Sanity check: the target length should equal the row count of X_train_selected.
y_train.shape

(14679,)

In [28]:
from sklearn.model_selection import GridSearchCV

In [26]:

def grid_search_models(X_train_selected, y_train, random_state=0):
    """Grid-search a random forest and a decision tree on the given training data.

    Each model's parameter grid is evaluated with ``GridSearchCV`` using its
    default cross-validation and the estimator's default scorer; the best
    candidate is refitted on the full training data (``refit=True``).

    Parameters
    ----------
    X_train_selected : array-like
        Training features.
    y_train : array-like
        Training targets matching ``X_train_selected``.
    random_state : int or None, default=0
        Seed given to every estimator so that repeated searches return the
        same scores and the same winning parameters. Pass ``None`` to fall
        back to unseeded estimators.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys ``"Best parameters"``
        (``best_params_``) and ``"Best score"`` (``best_score_``).
    """
    models = {
        "RandomForestRegressor": (RandomForestRegressor(random_state=random_state), {
            'criterion': ['squared_error', 'absolute_error'],
            'max_features': ['sqrt', 'log2'],
            'n_estimators': [10, 100]
        }),
        "DecisionTreeRegressor": (DecisionTreeRegressor(random_state=random_state), {
            'criterion': ['squared_error', 'absolute_error'],
            'splitter': ['best', 'random'],
            'max_features': ['sqrt', 'log2']
        })
        # Gradient boosting grid is currently disabled:
        #"GradientBoostingRegressor":(GradientBoostingRegressor(), {
          #'criterion': ['friedman_mse', 'squared_error'],
            #'n_estimators':[10,100]
        #})
    }

    results = {}

    for model_name, (model, param_grid) in models.items():
        # n_jobs=-1 runs the candidate fits in parallel on all available cores.
        grid_search = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)
        grid_search.fit(X_train_selected, y_train)

        results[model_name] = {
            "Best parameters": grid_search.best_params_,
            "Best score": grid_search.best_score_
        }

    return results


In [29]:
# Run the grid search on the selected features and show the best setting per model.
# "Best score" is GridSearchCV.best_score_, i.e. the best mean cross-validated score.
results = grid_search_models(X_train_selected,y_train)
print(results)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'RandomForestRegressor': {'Best parameters': {'criterion': 'absolute_error', 'max_features': 'sqrt', 'n_estimators': 100}, 'Best score': -0.05687288988146495}, 'DecisionTreeRegressor': {'Best parameters': {'criterion': 'absolute_error', 'max_features': 'log2', 'splitter': 'random'}, 'Best score': -0.6956751862174293}}


In [None]:
# Pick the model with the highest cross-validated score. Taking max() over
# (name, score) tuples would rank by model name first and only use the score
# to break ties, so select explicitly on the score instead.
best_model_name = max(results, key=lambda name: results[name]['Best score'])
best_score = results[best_model_name]['Best score']

In [None]:
# Show the selected model name together with its cross-validated score.
best_model_name, best_score

In [None]:
# Tuned hyperparameters recorded by grid_search_models for the selected model.
best_params = results[best_model_name]['Best parameters']

In [None]:
# Inspect the hyperparameters that will be used to rebuild the final model.
best_params

In [None]:
# Pre-fit sanity check: row count here must match y_train.
X_train_selected.shape

In [None]:
# Pre-fit sanity check: length here must match the rows of X_train_selected.
y_train.shape

In [None]:
# Pre-evaluation sanity check: row count here must match y_test.
X_test_selected.shape

In [None]:
# Pre-evaluation sanity check: length here must match the rows of X_test_selected.
y_test.shape

In [None]:
# Split the PCA training data again (presumably to obtain a validation subset)
# under NEW names. Re-binding X_train / X_test / y_train / y_test here would
# discard the original held-out test targets and leave y_train shorter than
# X_train_selected, which breaks any later fit on the selected features; it
# would also shrink the data further every time the cell is re-run.
X_fit_PCA, X_val_PCA, y_fit, y_val = train_test_split(
    X_train_PCA, y_train, test_size=0.25, random_state=42
)

In [None]:
# Rebuild the winning estimator from the grid-search result. The class is looked
# up by name so the tuned parameters are never applied to the wrong estimator type
# (e.g. decision-tree-only options passed to a random forest).
estimator_classes = {
    'RandomForestRegressor': RandomForestRegressor,
    'DecisionTreeRegressor': DecisionTreeRegressor,
    'GradientBoostingRegressor': GradientBoostingRegressor,
}
# Seed the final model for reproducibility; a random_state inside best_params wins.
best_model = estimator_classes[best_model_name](**{'random_state': 0, **best_params})

# Take the targets by row label from the full target series so they always line
# up with X_train_selected, even if y_train has been re-bound by another split.
y_train_selected = dep_Y.loc[X_train_selected.index]
best_model.fit(X_train_selected, y_train_selected)