In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
import xgboost

In [2]:
#!pip install xgboost

In [3]:
dataset = pd.read_csv("dataset_preprocess_ds.csv", index_col=None)
df2 = dataset
df2 = pd.get_dummies(df2, drop_first=True)

indep_X = df2.drop(['Crop/Year'], axis=1)
dep_Y = df2['Production_Tonnes']

In [4]:
dataset.columns

Index(['Crop/Year', 'Area_Hectares', 'Production_Tonnes', 'District_Name_BEED',
       'District_Name_HINGOLI', 'District_Name_JALNA', 'District_Name_LATUR',
       'District_Name_NANDED', 'District_Name_OSMANABAD',
       'Winter/Summer_Rabi       ', 'Winter/Summer_Summer     ',
       'Winter/Summer_Whole Year ', 'Rice_Bajra', 'Rice_Castor seed',
       'Rice_Cotton(lint)', 'Rice_Gram', 'Rice_Groundnut', 'Rice_Jowar',
       'Rice_Linseed', 'Rice_Maize', 'Rice_Moong(Green Gram)',
       'Rice_Niger seed', 'Rice_Other  Rabi pulses',
       'Rice_Other Kharif pulses', 'Rice_Rice', 'Rice_Safflower',
       'Rice_Sesamum', 'Rice_Soyabean', 'Rice_Sugarcane', 'Rice_Sunflower',
       'Rice_Wheat'],
      dtype='object')

In [5]:
indep_X = dataset.drop(['Crop/Year', 'Area_Hectares', 'Production_Tonnes'], axis=1)
dep_Y = dataset['Rice_Sesamum']

In [6]:
#Feature Selection

In [7]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test

In [8]:
def r2_prediction(regressor,X_test,y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2=r2_score(y_test,y_pred)
    return r2

In [9]:
def Linear(X_train,y_train,X_test):       
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2 

In [10]:
def Decision(X_train,y_train,X_test):
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2 

In [11]:
def random(X_train,y_train,X_test):       
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [12]:
def xgboost(X_train,y_train,X_test):       
        from xgboost import XGBRegressor
        regressor = XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return r2

In [13]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    colnames_list = []  
    # List to store column names for each model
    r2_values = []  
    # List to store R2 values for each model

    from sklearn.linear_model import LinearRegression
    lin = LinearRegression()

    from sklearn.tree import DecisionTreeRegressor
    dec = DecisionTreeRegressor(random_state=0)

    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=10, random_state=0)

    from xgboost import XGBRegressor
    xgb = XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)

    rfemodellist = [lin, dec, rf, xgb]

    for model in rfemodellist:
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        rfelist.append(log_rfe_feature)

        # Get the column names selected by RFE
        selected_columns = [col for col, selected in zip(indep_X.columns, log_rfe.support_) if selected]
        colnames_list.append(selected_columns)

        # Fit the model and calculate and store the R2 value
        X_train, X_test, y_train, y_test = split_scalar(pd.DataFrame(log_rfe_feature), dep_Y)
        model.fit(X_train, y_train)  # Fit the model
        r2 = r2_prediction(model, X_test, y_test)
        r2_values.append(r2)

    return rfelist, colnames_list, r2_values

# Call the function with your data
rfelist, colnames_list, r2_values = rfeFeature(indep_X, dep_Y, 5)

# Print the selected column names and R2 values for each model
for model_name, selected_columns, r2_value in zip(["Linear", "Decision", "Random", "XGBoost"], colnames_list, r2_values):
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

Model: Linear
Selected Columns: ['Rice_Castor seed', 'Rice_Cotton(lint)', 'Rice_Maize', 'Rice_Safflower', 'Rice_Sesamum']
R2 Value: 0.0

Model: Decision
Selected Columns: ['Rice_Sesamum', 'Rice_Soyabean', 'Rice_Sugarcane', 'Rice_Sunflower', 'Rice_Wheat']
R2 Value: 1.0

Model: Random
Selected Columns: ['Rice_Sesamum', 'Rice_Soyabean', 'Rice_Sugarcane', 'Rice_Sunflower', 'Rice_Wheat']
R2 Value: 1.0

Model: XGBoost
Selected Columns: ['Rice_Sesamum', 'Rice_Soyabean', 'Rice_Sugarcane', 'Rice_Sunflower', 'Rice_Wheat']
R2 Value: 0.0

