In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import os

In [23]:
df = pd.read_csv('/content/Data_change.csv')
df

Unnamed: 0.1,Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_x0020_Price,Max_x0020_Price,Modal_x0020_Price
0,0,Bihar,Araria,Forbesganj,Potato,Jyoti,FAQ,01-09-2024,2200,2700,2500.0
1,1,Bihar,Kishanganj,Bahadurganj,Potato,Jyoti,FAQ,01-09-2024,2500,2700,2600.0
2,2,Bihar,Madhubani,Jainagar,Onion,Medium,FAQ,01-09-2024,4200,4400,4300.0
3,3,Bihar,Rohtas,Natwar,Potato,Jyoti,FAQ,01-09-2024,2400,2800,2600.0
4,4,Chandigarh,Chandigarh,Chandigarh(Grain/Fruit),Ginger(Green),FAQ,FAQ,01-09-2024,3000,5000,4000.0
...,...,...,...,...,...,...,...,...,...,...,...
7934,7934,West Bengal,Purba Bardhaman,Kalna,Green Chilli,FAQ,FAQ,01-09-2024,7800,8200,8000.0
7935,7935,West Bengal,Purba Bardhaman,Kalna,Sweet Pumpkin,FAQ,FAQ,01-09-2024,2300,2500,2400.0
7936,7936,West Bengal,Purba Bardhaman,Memari,Brinjal,Brinjal,FAQ,01-09-2024,4700,5200,5000.0
7937,7937,West Bengal,Purba Bardhaman,Memari,Sweet Pumpkin,Sweet Pumpkin,FAQ,01-09-2024,2000,2200,2000.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# New Section

In [24]:
df['State'].value_counts().keys()

Index(['Tamil Nadu', 'Uttar Pradesh', 'Maharashtra', 'West Bengal',
       'Madhya Pradesh', 'Haryana', 'Punjab', 'Kerala', 'Telangana', 'Tripura',
       'Gujarat', 'Odisha', 'Himachal Pradesh', 'Rajasthan', 'Bihar',
       'Chandigarh', 'Uttrakhand', 'Karnataka', 'Chattisgarh',
       'Andhra Pradesh'],
      dtype='object', name='State')

In [25]:
list(df[df['State'] == 'Andhra Pradesh'].value_counts('Commodity').keys())

['Tomato']

In [26]:
df['Grade'] = df['Grade'].replace('FAQ', 'Other')
df.head(40)

Unnamed: 0.1,Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_x0020_Price,Max_x0020_Price,Modal_x0020_Price
0,0,Bihar,Araria,Forbesganj,Potato,Jyoti,Other,01-09-2024,2200,2700,2500.0
1,1,Bihar,Kishanganj,Bahadurganj,Potato,Jyoti,Other,01-09-2024,2500,2700,2600.0
2,2,Bihar,Madhubani,Jainagar,Onion,Medium,Other,01-09-2024,4200,4400,4300.0
3,3,Bihar,Rohtas,Natwar,Potato,Jyoti,Other,01-09-2024,2400,2800,2600.0
4,4,Chandigarh,Chandigarh,Chandigarh(Grain/Fruit),Ginger(Green),FAQ,Other,01-09-2024,3000,5000,4000.0
5,5,Gujarat,Amreli,Damnagar,Coriander(Leaves),Coriander,Other,01-09-2024,4200,5050,4750.0
6,6,Gujarat,Chhota Udaipur,Bodeliu,Cotton,Shanker 6 (B) 30mm FIne,Other,01-09-2024,6800,7000,6900.0
7,7,Gujarat,Rajkot,Gondal(Veg.market Gondal),Apple,Apple,Medium,01-09-2024,6000,13000,9500.0
8,8,Gujarat,Rajkot,Gondal(Veg.market Gondal),Cucumbar(Kheera),Cucumbar,Other,01-09-2024,2000,3500,2750.0
9,9,Gujarat,Rajkot,Gondal(Veg.market Gondal),Green Chilli,Green Chilly,Other,01-09-2024,2000,5000,3500.0


In [27]:
features = ['State', 'District', 'Market', 'Commodity', 'Variety', 'Grade']
target_min = 'Min_x0020_Price'
target_max = 'Max_x0020_Price'

In [28]:
X = df[features]
y_min = df[target_min]
y_max = df[target_max]

In [29]:
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

In [30]:
X_train, X_test, y_min_train, y_min_test, y_max_train, y_max_test = train_test_split(X_encoded, y_min, y_max, test_size=0.2, random_state=42)

In [31]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=12)
}


for name, model in models.items():
    print(f"Training and evaluating {name}...")

    # Train the model
    model.fit(X_train, y_min_train)

    print("Training Score : ",model.score(X_train,y_min_train)*100)
    print("Testing Score : ",model.score(X_test,y_min_test)*100)

    print("Training Score : ",model.score(X_train,y_max_train)*100)
    print("Testing Score : ",model.score(X_test,y_max_test)*100)
    print()

Training and evaluating Linear Regression...
Training Score :  85.25133297834623
Testing Score :  74.0222856775041
Training Score :  82.28318963620484
Testing Score :  73.72600244954101

Training and evaluating Decision Tree...
Training Score :  100.0
Testing Score :  69.48286512494732
Training Score :  94.00070377751321
Testing Score :  70.97385136002524

Training and evaluating Random Forest...
Training Score :  96.99691074166655
Testing Score :  74.61964137175583
Training Score :  91.74079961035082
Testing Score :  75.30314812687102



In [32]:
best_model_name = 'Random Forest'
best_model = models[best_model_name]


new_data = pd.DataFrame({
    'State': ['West Bengal'],
    'District': ['Purba Bardhaman'],
    'Market': ['Memari'],
    'Commodity': ['Sweet Pumpkin'],
    'Variety': ['Sweet Pumpkin'],
    'Grade': ['Other'],
})

encoded_data = encoder.transform(new_data)

predicted_min_price = best_model.predict(encoded_data)
predicted_max_price = best_model.predict(encoded_data)

print(f'Predicted Min Price: {predicted_min_price[0]}')
print(f'Predicted Max Price: {predicted_max_price[0]}')

Predicted Min Price: 2156.0
Predicted Max Price: 2156.0


In [33]:
os.makedirs('models',exist_ok=True)
joblib.dump(model,'./models/RandomForest.lb')
joblib.dump(encoder,'./models/Encoder.lb')

['./models/Encoder.lb']

In [34]:
df.to_csv('Data_Change.csv')