In [717]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [718]:
df = pd.read_csv('Crop_Yield_District_wise_Dataset.csv')

In [719]:
df.head()

Unnamed: 0,N,P,K,pH,Humidity,Temperature,Rainfall,CropYield,District,SoilType,CropName
0,55,25,40,6.3,80,24,800,920,Dakshin Dinajpur,Alluvial Soil,Jute
1,52,22,38,6.2,79,23,780,900,Dakshin Dinajpur,Alluvial Soil,Jute
2,56,26,42,6.4,81,25,820,940,Dakshin Dinajpur,Alluvial Soil,Jute
3,53,23,39,6.1,78,26,790,910,Dakshin Dinajpur,Alluvial Soil,Jute
4,54,24,41,6.5,82,24,810,930,Dakshin Dinajpur,Alluvial Soil,Jute


In [720]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024 entries, 0 to 1023
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            1024 non-null   int64  
 1   P            1024 non-null   int64  
 2   K            1024 non-null   int64  
 3   pH           1024 non-null   float64
 4   Humidity     1024 non-null   int64  
 5   Temperature  1024 non-null   int64  
 6   Rainfall     1024 non-null   int64  
 7   CropYield    1024 non-null   int64  
 8   District     1024 non-null   object 
 9   SoilType     1024 non-null   object 
 10  CropName     1024 non-null   object 
dtypes: float64(1), int64(7), object(3)
memory usage: 88.1+ KB


In [721]:
df.shape

(1024, 11)

In [722]:
df.isnull()

Unnamed: 0,N,P,K,pH,Humidity,Temperature,Rainfall,CropYield,District,SoilType,CropName
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1019,False,False,False,False,False,False,False,False,False,False,False
1020,False,False,False,False,False,False,False,False,False,False,False
1021,False,False,False,False,False,False,False,False,False,False,False
1022,False,False,False,False,False,False,False,False,False,False,False


In [723]:
df.duplicated().sum()

20

In [724]:
df=df.drop_duplicates()

In [725]:
df.duplicated().sum()

0

In [726]:
df.head()


Unnamed: 0,N,P,K,pH,Humidity,Temperature,Rainfall,CropYield,District,SoilType,CropName
0,55,25,40,6.3,80,24,800,920,Dakshin Dinajpur,Alluvial Soil,Jute
1,52,22,38,6.2,79,23,780,900,Dakshin Dinajpur,Alluvial Soil,Jute
2,56,26,42,6.4,81,25,820,940,Dakshin Dinajpur,Alluvial Soil,Jute
3,53,23,39,6.1,78,26,790,910,Dakshin Dinajpur,Alluvial Soil,Jute
4,54,24,41,6.5,82,24,810,930,Dakshin Dinajpur,Alluvial Soil,Jute


In [727]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1004 entries, 0 to 1023
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            1004 non-null   int64  
 1   P            1004 non-null   int64  
 2   K            1004 non-null   int64  
 3   pH           1004 non-null   float64
 4   Humidity     1004 non-null   int64  
 5   Temperature  1004 non-null   int64  
 6   Rainfall     1004 non-null   int64  
 7   CropYield    1004 non-null   int64  
 8   District     1004 non-null   object 
 9   SoilType     1004 non-null   object 
 10  CropName     1004 non-null   object 
dtypes: float64(1), int64(7), object(3)
memory usage: 94.1+ KB


In [728]:
df = pd.read_csv('Crop_Yield_District_wise_Dataset.csv')

In [729]:
# # Define features and target variables for crop prediction
# features_for_crop = ['N', 'P', 'K', 'pH', 'Humidity', 'Temperature', 'Rainfall', 'District', 'SoilType']
# X_crop = df[features_for_crop]
# y_crop = df['CropName']
# Define features and target variables for crop prediction
# features_for_crop = ['N', 'P', 'K', 'pH', 'Humidity', 'Temperature', 'Rainfall', 'District', 'SoilType']
# X_crop = df[features_for_crop]
# y_crop = df['CropName']

# # Define features and target variables for yield prediction, now including 'CropName'
# features_for_yield = features_for_crop + ['CropName']
# X_yield = df[features_for_yield]
# y_yield = df['CropYield']

# Define features and target variables
features = ['N', 'P', 'K', 'pH', 'Humidity', 'Temperature', 'Rainfall', 'District', 'SoilType']
X = df[features]
y_crop = df['CropName']
y_yield = df['CropYield']

In [730]:
# Preprocessing pipeline for numeric and categorical features
numeric_features = ['N', 'P', 'K', 'pH', 'Humidity', 'Temperature', 'Rainfall']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_features = ['District', 'SoilType', 'CropName']  # Include 'CropName' for yield model
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_for_crop = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, ['District', 'SoilType'])])  # Exclude 'CropName' for crop model

preprocessor_for_yield = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])  # Include 'CropName' for yield model


In [731]:
# Crop prediction model
model_crop = Pipeline([
    ('preprocessor', preprocessor_for_crop),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [732]:
# Yield prediction model including 'CropName'
model_yield = Pipeline([
    ('preprocessor', preprocessor_for_yield),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [733]:
# # Splitting the dataset into training and testing sets for both models
# X_train_crop, X_test_crop, y_train_crop, y_test_crop = train_test_split(X_crop, y_crop, test_size=0.2, random_state=42)
# X_train_yield, X_test_yield, y_train_yield, y_test_yield = train_test_split(X_yield, y_yield, test_size=0.2, random_state=42)

# Splitting the dataset into training and testing sets for both models
X_train_crop, X_test_crop, y_train_crop, y_test_crop = train_test_split(X_crop, y_crop, test_size=0.2, random_state=42)
X_train_yield, X_test_yield, y_train_yield, y_test_yield = train_test_split(X_yield, y_yield, test_size=0.2, random_state=42)

In [734]:
# Training the models
model_crop.fit(X_train_crop, y_train_crop)
model_yield.fit(X_train_yield, y_train_yield)

In [735]:
def predict_with_missing_data(input_data, top_n=2):
    sample_input_crop = pd.DataFrame([input_data])
    # Predict the probability of each crop
    crop_probabilities = model_crop.predict_proba(sample_input_crop)[0]
    top_crop_indices = np.argsort(crop_probabilities)[::-1][:top_n]
    top_crops = np.array(model_crop.classes_)[top_crop_indices]
    
    top_crops_with_yield = []
    for crop in top_crops:
        # Adjust input data to include the predicted 'CropName' for yield prediction
        input_data_with_crop = input_data.copy()
        input_data_with_crop['CropName'] = crop
        sample_input_yield = pd.DataFrame([input_data_with_crop])
        
        yield_prediction = model_yield.predict(sample_input_yield)[0]
        top_crops_with_yield.append((crop, yield_prediction))
    
    return top_crops_with_yield

In [736]:
#Example input with some missing values (ensure to fill all required fields)
input_data = {
    'N': 55, 'P': 25, 'K': 40, 'pH': 6.3, 'Humidity': 80, 'Temperature': 24, 'Rainfall': 800,
    'District': 'Dakshin Dinajpur', 'SoilType': 'Alluvial Soil'
}


In [737]:
# Making predictions with the function
top_crops_with_yield = predict_with_missing_data(input_data, top_n=2)

In [738]:
# # Displaying the predictions
# for crop, yield_prediction in top_crops_with_yield:
#     print(f"Crop: {crop}, Expected Yield: {yield_prediction}")

# # Making predictions with the function
# top_crops_with_yield = predict_with_missing_data(input_data, top_n=2)

# Displaying the predictions with explicit mention of the best fit and alternatives
print(f"Best Fit Crop: {top_crops_with_yield[0][0]}, Expected Yield: {top_crops_with_yield[0][1]}")
for i, (crop, yield_prediction) in enumerate(top_crops_with_yield[1:], start=1):
    print(f"Alternative Crop {i}: {crop}, Expected Yield: {yield_prediction}")


Best Fit Crop: Jute, Expected Yield: 915.2
Alternative Crop 1: Cucumbers, Expected Yield: 904.9
