In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib

# Load the crop price records from CSV
df = pd.read_csv('../data/crop_price_data.csv')

# Preview the first rows to sanity-check the load
print(df.head())

# Drop every row that has a missing value in any column
df = df.dropna()

# Show the column dtypes (printed before 'date' is parsed below)
print(df.dtypes)

# The year/month features are derived from 'date', so fail fast without it
if 'date' not in df.columns:
    raise KeyError("'date' column is missing from the data")

# Parse 'date'; errors='coerce' turns unparseable values into NaT instead of raising
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Discard the rows whose date could not be parsed
if df['date'].isnull().any():
    print("Some dates could not be converted. These rows will be dropped.")
    df = df.dropna(subset=['date'])

# Calendar features derived from the parsed date
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Feature matrix and regression target
X = df[['year', 'month', 'min_price', 'max_price']]
y = df['modal_price']

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the gradient-boosted regressor on the scaled training features
model = XGBRegressor(objective='reg:squarederror')
model.fit(X_train_scaled, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate. RMSE is derived from the MSE instead of passing `squared=False`:
# that keyword was deprecated and later removed from scikit-learn's
# mean_squared_error, so the square root works on old and new releases alike.
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", mse ** 0.5)

# Function for Price Prediction
def predict_price(year, month, min_price, max_price):
    """Predict the modal price for a single observation.

    Builds a one-row frame with the columns 'year', 'month', 'min_price'
    and 'max_price', scales it with the notebook-level ``scaler`` and feeds
    the result to the notebook-level ``model``.

    Note: ``scaler`` and ``model`` are looked up when the function is
    called, not when it is defined, so rebinding either name elsewhere in
    the notebook changes what this function uses.

    Args:
        year: Value for the 'year' feature.
        month: Value for the 'month' feature.
        min_price: Value for the 'min_price' feature.
        max_price: Value for the 'max_price' feature.

    Returns:
        The first (and only) element of the model's prediction.
    """
    feature_names = ['year', 'month', 'min_price', 'max_price']
    single_row = pd.DataFrame([[year, month, min_price, max_price]],
                              columns=feature_names)
    prediction = model.predict(scaler.transform(single_row))
    return prediction[0]

# Persist the fitted regressor and its scaler as separate .pkl files
for artifact, filename in ((model, 'xgb_regressor_model.pkl'),
                           (scaler, 'standard_scaler.pkl')):
    joblib.dump(artifact, filename)

# Example: run one prediction through predict_price and print it
example_price = predict_price(2024, 2, 1100, 1400)
print("Predicted Price:", example_price)


  commodity_name    state    district    market  min_price  max_price  \
0          Ajwan  Gujarat      Amreli    Amreli      63.25      79.55   
1          Ajwan  Gujarat  Banaskanth   Dhanera      80.05      80.05   
2          Ajwan  Gujarat    Jamnagar  Jamnagar      90.00     155.00   
3          Ajwan  Gujarat     Mehsana      Kadi      94.50     112.50   
4          Ajwan  Gujarat     Mehsana     Unjha      21.25     148.50   

   modal_price        date  
0        79.55  2019-05-22  
1        80.05  2019-05-22  
2       122.50  2019-05-22  
3       107.00  2019-05-22  
4        99.75  2019-05-22  
commodity_name     object
state              object
district           object
market             object
min_price         float64
max_price         float64
modal_price       float64
date               object
dtype: object
Some dates could not be converted. These rows will be dropped.
Mean Absolute Error: 1.37893174806677
Mean Squared Error: 494.8394067854872
Root Mean Squared Error: 2



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Load the data
data = pd.read_csv('../data/soil_data.csv')

# Print columns to check that the expected feature and label columns are present
print(data.columns)

# Define features and target
features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
X = data[features]
y = data['label']  # Class names to predict are read from the 'label' column

# Split BEFORE any preprocessing so that the imputation means and the scaling
# statistics are learned from the training rows only. Fitting them on the full
# dataset would leak information about the test rows into the model inputs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: column means come from the training split
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardize features: mean/variance come from the training split
# NOTE(review): this cell rebinds the notebook-level names `scaler` and `model`,
# which `predict_price` in the crop-price cell reads at call time — re-run that
# cell before calling predict_price again.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Display how often each class was predicted
print("Predicted Soil Types:")
print(pd.Series(y_pred).value_counts())

# Save the fitted preprocessing objects next to the classifier: the model was
# trained on imputed + scaled features, so inference needs the same transforms.
joblib.dump(imputer, 'soil_imputer.pkl')
joblib.dump(scaler, 'soil_scaler.pkl')

# Save the model to a .pkl file (kept last so the cell's displayed result is unchanged)
joblib.dump(model, 'soil_classifier_model.pkl')


Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')
Accuracy: 0.9931818181818182
Predicted Soil Types:
coconut        27
chickpea       26
jute           25
papaya         23
apple          23
pigeonpeas     23
mothbeans      23
pomegranate    23
maize          21
banana         21
blackgram      20
kidneybeans    20
watermelon     19
mungbean       19
mango          19
coffee         17
rice           17
cotton         17
muskmelon      17
orange         14
grapes         14
lentil         12
Name: count, dtype: int64


['soil_classifier_model.pkl']