In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load your dataset (replace with your actual dataset)
# For demonstration, we'll create a synthetic dataset
# Example features: ['Temperature', 'Humidity', 'Soil pH', 'Nutrients']
data = {
    'Temperature': [30, 28, 25, 32, 22, 29, 27, 31, 26, 24],
    'Humidity': [70, 65, 60, 80, 50, 75, 68, 72, 64, 58],
    'Soil pH': [6.5, 6.2, 5.8, 6.8, 5.5, 6.1, 6.4, 6.3, 5.9, 5.7],
    'Nutrients': ['High', 'Medium', 'Low', 'High', 'Low', 'Medium', 'High', 'High', 'Medium', 'Low'],
    'Disease': ['Healthy', 'Diseased', 'Diseased', 'Healthy', 'Diseased', 'Healthy', 'Healthy', 'Diseased', 'Healthy', 'Diseased']
}
df = pd.DataFrame(data)

# Define features and target variable
X = df.drop('Disease', axis=1)
y = df['Disease']

# Preprocessing: Define column types
numeric_features = ['Temperature', 'Humidity', 'Soil pH']
categorical_features = ['Nutrients']

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest model
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

rf_pipeline.fit(X_train, y_train)
rf_pred = rf_pipeline.predict(X_test)

# Train Logistic Regression model
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

lr_pipeline.fit(X_train, y_train)
lr_pred = lr_pipeline.predict(X_test)

# Evaluate both models
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    print(f'{model_name} Accuracy: {accuracy:.2f}')
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{model_name} Confusion Matrix')
    plt.show()
    return accuracy

rf_accuracy = evaluate_model(y_test, rf_pred, "Random Forest")
lr_accuracy = evaluate_model(y_test, lr_pred, "Logistic Regression")

# Select the best model based on accuracy
if rf_accuracy > lr_accuracy:
    best_model = rf_pipeline
    print("Random Forest is the best model.")
else:
    best_model = lr_pipeline
    print("Logistic Regression is the best model.")

# Function to predict new values
def predict_new_values(model, new_data):
    new_df = pd.DataFrame(new_data)
    prediction = model.predict(new_df)
    return prediction

# Example usage of the prediction function
new_values = {
    'Temperature': [27],
    'Humidity': [65],
    'Soil pH': [6.2],
    'Nutrients': ['Medium']
}

predicted_disease = predict_new_values(best_model, new_values)
print(f'The predicted disease condition is: {predicted_disease[0]}')

ValueError: Found unknown categories ['Medium'] in column 0 during transform