In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("C:\\Users\\rushi\\Desktop\\FertilizeWise\\FertWiseMergedMain.csv")
# First, let's analyze the class distribution
print("Class distribution in target variable:")
print(df['Fertilizer'].value_counts())
print("\nPercentage distribution:")
print(df['Fertilizer'].value_counts(normalize=True) * 100)

# First, identify categorical and numerical columns
categorical_columns = ['Soil_Type', 'Crop_Type']
numerical_columns = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns)
    ])

# Prepare X and y
le = LabelEncoder()
y = le.fit_transform(df['Fertilizer'])
X = df[numerical_columns + categorical_columns]

# Split the data using stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define models with class weight consideration
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, scale_pos_weight=1),
    'SVC': SVC(kernel='rbf', class_weight='balanced')
}

# Define custom cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train and evaluate each model
results = {}

for name, model in models.items():
    try:
        print(f"\nTraining {name}...")
        
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)
        
        # Calculate metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        # Calculate precision, recall, and F1 score for each class
        precision, recall, f1, support = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')
        
        # Perform cross-validation
        cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
        
        results[name] = {
            'Train Accuracy': train_accuracy,
            'Test Accuracy': test_accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'CV Accuracy Mean': cv_scores.mean(),
            'CV Accuracy Std': cv_scores.std()
        }
        
        print(f"\nDetailed Classification Report for {name}:")
        print(classification_report(y_test, y_test_pred, 
                                 target_names=le.classes_,
                                 zero_division=0))
        
    except Exception as e:
        print(f"Error training {name}: {str(e)}")
        continue

# Print results
if results:
    print("\nModel Comparison Results:")
    print("-" * 140)
    print(f"{'Model':<20} {'Train Acc':>10} {'Test Acc':>10} {'Precision':>10} {'Recall':>10} "
          f"{'F1 Score':>10} {'CV Mean':>10} {'CV Std':>10}")
    print("-" * 140)

    for model_name, metrics in results.items():
        print(f"{model_name:<20} {metrics['Train Accuracy']:>10.4f} {metrics['Test Accuracy']:>10.4f} "
              f"{metrics['Precision']:>10.4f} {metrics['Recall']:>10.4f} {metrics['F1 Score']:>10.4f} "
              f"{metrics['CV Accuracy Mean']:>10.4f} {metrics['CV Accuracy Std']:>10.4f}")

    # Find and use the best model (using F1 score instead of just accuracy)
    best_model = max(results.items(), key=lambda x: x[1]['F1 Score'])
    print(f"\nBest performing model: {best_model[0]}")
    print(f"F1 Score: {best_model[1]['F1 Score']:.4f}")
    print(f"Test Accuracy: {best_model[1]['Test Accuracy']:.4f}")

    # Save predictions for the best model
    best_model_name = best_model[0]
    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', models[best_model_name])
    ])
    best_pipeline.fit(X_train, y_train)
    final_predictions = best_pipeline.predict(X_test)

    # Convert numerical predictions back to fertilizer names
    final_predictions_labels = le.inverse_transform(final_predictions)
    y_test_labels = le.inverse_transform(y_test)

    comparison_df = pd.DataFrame({
        'Actual': y_test_labels,
        'Predicted': final_predictions_labels,
        'Correct': y_test_labels == final_predictions_labels
    })
    
    print("\nSample of Actual vs Predicted values:")
    print(comparison_df.head(10))
    
    # Print confusion matrix
    from sklearn.metrics import confusion_matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test_labels, final_predictions_labels)
    print(pd.DataFrame(cm, index=le.classes_, columns=le.classes_))

    comparison_df.to_csv('fertilizer_predictions.csv', index=False)
else:
    print("\nNo models were successfully trained! Check the error messages above.")

Class distribution in target variable:
Fertilizer
Urea                  130
DAP                   122
28-28                  85
20-20                  70
14-35-14               70
17-17-17               35
10/26/2026             35
TSP                    28
14-14-14               16
15-15-15               16
10/10/2010             16
Superphosphate         12
Potassium sulfate.     12
Potassium chloride      4
Name: count, dtype: int64

Percentage distribution:
Fertilizer
Urea                  19.969278
DAP                   18.740399
28-28                 13.056836
20-20                 10.752688
14-35-14              10.752688
17-17-17               5.376344
10/26/2026             5.376344
TSP                    4.301075
14-14-14               2.457757
15-15-15               2.457757
10/10/2010             2.457757
Superphosphate         1.843318
Potassium sulfate.     1.843318
Potassium chloride     0.614439
Name: proportion, dtype: float64

Training Logistic Regression...

Detailed

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
df = pd.read_csv("C:\\Users\\rushi\\Desktop\\FertilizeWise\\FertWiseMergedMain.csv") 
# First, identify categorical and numerical columns
categorical_columns = ['Soil_Type', 'Crop_Type']
numerical_columns = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns)
    ])

# Prepare X and y
# Convert target variable to numerical using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['Fertilizer'])
X = df[numerical_columns + categorical_columns]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classification models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
    'SVC': SVC(kernel='rbf')
}

# Train and evaluate each model
results = {}

for name, model in models.items():
    try:
        print(f"\nTraining {name}...")
        
        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        
        # Fit the pipeline
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)
        
        # Calculate metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        # Perform cross-validation
        cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
        
        results[name] = {
            'Train Accuracy': train_accuracy,
            'Test Accuracy': test_accuracy,
            'CV Accuracy Mean': cv_scores.mean(),
            'CV Accuracy Std': cv_scores.std()
        }
        
        print(f"Classification Report for {name}:")
        print(classification_report(y_test, y_test_pred, 
                                 target_names=le.classes_))
        
    except Exception as e:
        print(f"Error training {name}: {str(e)}")
        continue

# Print results
if results:
    print("\nModel Comparison Results:")
    print("-" * 100)
    print(f"{'Model':<20} {'Train Accuracy':>15} {'Test Accuracy':>15} "
          f"{'CV Accuracy Mean':>15} {'CV Accuracy Std':>15}")
    print("-" * 100)

    for model_name, metrics in results.items():
        print(f"{model_name:<20} {metrics['Train Accuracy']:>15.4f} "
              f"{metrics['Test Accuracy']:>15.4f} {metrics['CV Accuracy Mean']:>15.4f} "
              f"{metrics['CV Accuracy Std']:>15.4f}")

    # Find and use the best model
    best_model = max(results.items(), key=lambda x: x[1]['Test Accuracy'])
    print(f"\nBest performing model: {best_model[0]}")
    print(f"Test Accuracy: {best_model[1]['Test Accuracy']:.4f}")

    # Save predictions for the best model
    best_model_name = best_model[0]
    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', models[best_model_name])
    ])
    best_pipeline.fit(X_train, y_train)
    final_predictions = best_pipeline.predict(X_test)

    # Convert numerical predictions back to fertilizer names
    final_predictions_labels = le.inverse_transform(final_predictions)
    y_test_labels = le.inverse_transform(y_test)

    comparison_df = pd.DataFrame({
        'Actual': y_test_labels,
        'Predicted': final_predictions_labels,
        'Correct': y_test_labels == final_predictions_labels
    })
    
    print("\nSample of Actual vs Predicted values:")
    print(comparison_df.head(10))

    comparison_df.to_csv('fertilizer_predictions.csv', index=False)
else:
    print("\nNo models were successfully trained! Check the error messages above.")


Training Logistic Regression...
Classification Report for Logistic Regression:
                    precision    recall  f1-score   support

        10/10/2010       1.00      1.00      1.00         2
        10/26/2026       1.00      0.70      0.82        10
          14-14-14       1.00      1.00      1.00         4
          14-35-14       0.79      1.00      0.88        11
          15-15-15       1.00      1.00      1.00         2
          17-17-17       1.00      1.00      1.00        11
             20-20       1.00      1.00      1.00        11
             28-28       1.00      1.00      1.00        14
               DAP       0.82      1.00      0.90        28
Potassium chloride       1.00      1.00      1.00         1
Potassium sulfate.       0.00      0.00      0.00         3
    Superphosphate       0.00      0.00      0.00         1
               TSP       0.67      0.50      0.57         8
              Urea       1.00      0.96      0.98        25

          accuracy