In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Train and evaluate a random-forest classifier on the soil dataset.
# The data is split BEFORE any preprocessing is fitted, so the imputation
# means and the scaling statistics are learned from the training rows only
# and the held-out accuracy is not influenced by test-set information.

data = pd.read_csv('../data/soil_data.csv')

# Show the available columns as a quick sanity check of the CSV schema.
print(data.columns)

features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
X = data[features]
# Target labels. NOTE(review): the recorded output of this cell lists crop
# names (e.g. 'rice', 'maize') rather than soil types -- confirm the naming.
y = data['label']

# Split first: 80% train / 20% test, fixed seed for reproducibility.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values: column means come from the training split only and
# are then re-applied unchanged to the test split.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Standardize features: mean/std are likewise estimated on training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Display how often each class was predicted on the test split
print("Predicted Soil Types:")
print(pd.Series(y_pred).value_counts())

Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')
Accuracy: 0.9931818181818182
Predicted Soil Types:
coconut        27
chickpea       26
jute           25
papaya         23
apple          23
pigeonpeas     23
mothbeans      23
pomegranate    23
maize          21
banana         21
blackgram      20
kidneybeans    20
watermelon     19
mungbean       19
mango          19
coffee         17
rice           17
cotton         17
muskmelon      17
orange         14
grapes         14
lentil         12
Name: count, dtype: int64


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Train, evaluate and persist a random-forest classifier for the soil dataset.
# Preprocessing is fitted on the training split only (no test-set leakage),
# and the fitted imputer/scaler are saved next to the model because the model
# is trained on imputed, standardized features: raw inputs must go through
# the same fitted transformers before `model.predict` is called.

# Load the data
data = pd.read_csv('../data/soil_data.csv')

# Print columns to check
print(data.columns)

# Define features and target
features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
X = data[features]
# NOTE(review): the recorded output of this cell lists crop names
# (e.g. 'rice', 'maize') rather than soil types -- confirm the naming.
y = data['label']

# Split data into training and testing sets before fitting any transformer
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values: means are learned from the training rows only
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Standardize features: statistics are learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Display how often each class was predicted on the test split
print("Predicted Soil Types:")
print(pd.Series(y_pred).value_counts())

# Save the fitted preprocessing objects so inference code can apply
# imputer.transform -> scaler.transform to raw feature rows before predicting.
# Assigned to `_` so this call does not become the cell's displayed value.
_ = joblib.dump({'imputer': imputer, 'scaler': scaler}, 'soil_preprocessor.pkl')

# Save the model to a .pkl file (kept as the last expression of the cell)
joblib.dump(model, 'soil_classifier_model.pkl')


Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')
Accuracy: 0.9931818181818182
Predicted Soil Types:
coconut        27
chickpea       26
jute           25
papaya         23
apple          23
pigeonpeas     23
mothbeans      23
pomegranate    23
maize          21
banana         21
blackgram      20
kidneybeans    20
watermelon     19
mungbean       19
mango          19
coffee         17
rice           17
cotton         17
muskmelon      17
orange         14
grapes         14
lentil         12
Name: count, dtype: int64


['soil_classifier_model.pkl']