## Ener Grow AI MVP 

In [1]:
import pandas as pd
import numpy as np
import sklearn as sl
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv(r"C:\Users\USER\Downloads\EnerGrow_Nigeria_MVP_Dataset.csv")

In [3]:
print(df.shape)
print(df.columns)
print(df.head)

(100, 14)
Index(['Location', 'Avg Temp (°C)', 'Crop Type', 'Harvest Quantity (kg)',
       'Current Fuel', 'Fuel Consumption (kg)', 'Health Impact Score',
       'Post-Harvest Loss (%)', 'Income Level', 'Access to Financing',
       'Youth Involvement', 'Technology Adopted', 'Cost Saving (₦)',
       'CO2 Emission Reduction (kg)'],
      dtype='object')
<bound method NDFrame.head of    Location  Avg Temp (°C) Crop Type  Harvest Quantity (kg) Current Fuel  \
0      Ogun             29      Rice                   1630          LPG   
1      Kano             30   Cassava                   3985     Firewood   
2      Ogun             29      Rice                   2306     Charcoal   
3      Ogun             29      Rice                   2561          LPG   
4     Lagos             31     Maize                   3959     Firewood   
..      ...            ...       ...                    ...          ...   
95     Kano             30      Rice                   1694     Firewood   
96    

In [4]:


def augment_data(df, target_rows=1000):
    current_len = len(df)
    additional_rows_needed = target_rows - current_len
    
    new_rows = []
    for _ in range(additional_rows_needed):
        row = df.sample(n=1).iloc[0].copy()
        
        # Add noise/randomness to numeric columns for realism
        row['Harvest Quantity (kg)'] = max(500, int(row['Harvest Quantity (kg)'] * np.random.uniform(0.8, 1.2)))
        row['Fuel Consumption (kg)'] = max(0, int(row['Fuel Consumption (kg)'] * np.random.uniform(0.7, 1.3)))
        row['Health Impact Score'] = max(1, min(10, int(row['Health Impact Score'] + np.random.randint(-2, 3))))
        row['Post-Harvest Loss (%)'] = max(5, min(20, row['Post-Harvest Loss (%)'] * np.random.uniform(0.8, 1.2)))
        row['Cost Saving (₦)'] = max(0, int(row['Cost Saving (₦)'] * np.random.uniform(0.7, 1.3)))
        row['CO2 Emission Reduction (kg)'] = max(0, row['CO2 Emission Reduction (kg)'] * np.random.uniform(0.7, 1.3))
        
        new_rows.append(row)
    
    new_df = pd.DataFrame(new_rows)
    full_df = pd.concat([df, new_df], ignore_index=True)
    return full_df

augmented_df = augment_data(df, 1000)


In [5]:
augmented_df.tail()

Unnamed: 0,Location,Avg Temp (°C),Crop Type,Harvest Quantity (kg),Current Fuel,Fuel Consumption (kg),Health Impact Score,Post-Harvest Loss (%),Income Level,Access to Financing,Youth Involvement,Technology Adopted,Cost Saving (₦),CO2 Emission Reduction (kg)
995,Kano,30,Maize,1011,LPG,24,3,8.957685,Low,Yes,Yes,Efficient Cookstove,7119,6.730902
996,Enugu,27,Cassava,2560,Firewood,32,2,12.234556,High,No,No,Solar Dryer,8368,17.98444
997,Kano,30,Maize,1613,Charcoal,39,4,14.831075,Low,No,Yes,Efficient Cookstove,5995,8.03828
998,Enugu,27,Maize,2036,Firewood,36,3,16.477326,Low,No,No,Efficient Cookstove,1949,12.022819
999,Ogun,29,Maize,3905,Firewood,38,8,6.346523,Low,No,No,Solar Refrigeration,3578,18.329341


In [6]:


pip install scikit-learn==1.7.2

Note: you may need to restart the kernel to use updated packages.


In [7]:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


# 2. Select ONLY the columns we need
features = [
    "Current Fuel",
    "Income Level",
    "Access to Financing",
    "Post-Harvest Loss (%)"
]

target = "Technology Adopted"

# 3. Remove rows with missing values
augmented_df= augmented_df.dropna(subset=features + [target])

X = augmented_df[features]
y = augmented_df[target]

# 4. Convert text columns to numbers automatically
categorical_features = [
    "Current Fuel",
    "Income Level",
    "Access to Financing"
]

numeric_features = ["Post-Harvest Loss (%)"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# 5. Very simple model
model = DecisionTreeClassifier(max_depth=4, random_state=42)

# 6. Combine preprocessing + model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 7. Train model
pipeline.fit(X, y)

# 8. Save trained model
joblib.dump(pipeline, "clean_energy_recommender_model.joblib")

print("✅ Model trained and saved successfully!")


✅ Model trained and saved successfully!
