In [1]:
import pandas as pd
import numpy as np
from collections import Counter

# Read the preprocessed records from disk into a DataFrame.
data = pd.read_csv("D:/recomendation/preprocessed2.csv")

# Remove any trailing whitespace from the Season values.
data['Season'] = data['Season'].str.rstrip()

# Discard the 'Unnamed: 0' column when the file contains one; errors='ignore'
# makes this a no-op when the column is absent.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# The hand-written tree works on plain Python rows (a list of lists).
training_data = data.values.tolist()


In [3]:
# Column names in row order. Question.__repr__ looks up header[column] to
# label a split, so the positions here must line up with the row columns.
header = ['State_Name', 'District_Name', 'Season', 'Crop']

def class_counts(Data):
    """Tally how often each label occurs in ``Data``.

    The label of a row is its last element. Returns a plain dict mapping
    label -> number of rows, with keys in first-seen order.
    """
    tally = {}
    for record in Data:
        outcome = record[-1]
        if outcome in tally:
            tally[outcome] += 1
        else:
            tally[outcome] = 1
    return tally

class Question:
    """An equality test against one column of a row.

    ``column`` is the index into a row and ``value`` is what a row must hold
    at that index for the question to match.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Return True when ``example`` holds this question's value in its column."""
        observed = example[self.column]
        return observed == self.value

    def __repr__(self):
        # Depends on the module-level ``header`` list for the column's name.
        return f"Is {header[self.column]} == {self.value}?"

def unique_vals(Data, col):
    """Return the set of distinct values found at index ``col`` across ``Data``."""
    return {record[col] for record in Data}

def partition(Data, question):
    """Split ``Data`` into rows that satisfy ``question`` and rows that do not.

    Returns ``(true_rows, false_rows)``; both lists keep the original row
    order and hold the same row objects that were passed in.
    """
    matched, unmatched = [], []
    for record in Data:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

def gini(Data):
    """Gini impurity of ``Data``: 1 minus the sum of squared label proportions.

    Label proportions come from ``class_counts``. When ``Data`` has no rows
    the loop does not run and the initial value 1 is returned.
    """
    impurity = 1
    # Subtract one squared proportion at a time, in label order, so the
    # floating-point result does not depend on a different summation order.
    for hits in class_counts(Data).values():
        impurity -= (hits / float(len(Data))) ** 2
    return impurity

def info_gain(left, right, current_uncertainty):
    """Impurity reduction achieved by splitting a node into ``left`` and ``right``.

    Computed as ``current_uncertainty`` minus the Gini impurity of each child
    weighted by that child's share of the rows.
    """
    left_size = len(left)
    left_share = float(left_size) / (left_size + len(right))
    left_term = left_share * gini(left)
    right_term = (1 - left_share) * gini(right)
    return current_uncertainty - left_term - right_term

def find_best_split(Data):
    """Search every (column, value) equality question for the highest gain.

    Every column except the last is treated as a feature. Returns
    ``(best_gain, best_question)``; when no question yields a gain above 0
    the result is ``(0, None)``.
    """
    top_gain, top_question = 0, None
    baseline = gini(Data)
    feature_count = len(Data[0]) - 1  # the last column is excluded

    for column in range(feature_count):
        for candidate_value in unique_vals(Data, column):
            candidate = Question(column, candidate_value)
            matched, unmatched = partition(Data, candidate)
            if matched and unmatched:
                # Only questions that put rows on both sides are scored.
                candidate_gain = info_gain(matched, unmatched, baseline)
                # Strict comparison: on a tie the earlier question is kept.
                if candidate_gain > top_gain:
                    top_gain, top_question = candidate_gain, candidate
    return top_gain, top_question

class Leaf:
    """Terminal tree node.

    ``predictions`` holds the label -> row-count dict produced by
    ``class_counts`` for the rows that reached this node.
    """

    def __init__(self, Data):
        label_tally = class_counts(Data)
        self.predictions = label_tally

class Decision_Node:
    """Internal tree node: a question plus the subtree for each answer.

    ``true_branch`` is followed when the question matches a row and
    ``false_branch`` otherwise.
    """

    def __init__(self, question, true_branch, false_branch):
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

def build_tree(Data):
    """Recursively grow a decision tree from ``Data`` and return its root.

    A ``Leaf`` is returned when ``find_best_split`` reports a gain of 0;
    otherwise the rows are partitioned on the best question and a
    ``Decision_Node`` is built over the two subtrees.
    """
    gain, question = find_best_split(Data)
    if gain == 0:
        # No question improves on the current impurity: stop here.
        return Leaf(Data)

    matched, unmatched = partition(Data, question)
    # Arguments are evaluated left to right, so the true side is built first.
    return Decision_Node(question, build_tree(matched), build_tree(unmatched))

def classify(row, node):
    """Route ``row`` down the tree rooted at ``node`` and return the leaf's counts.

    At each ``Decision_Node`` the node's question is asked of ``row``; the
    true branch is followed on a match and the false branch otherwise. The
    return value is the ``predictions`` dict of the ``Leaf`` that is reached.

    The walk is iterative rather than recursive, so its depth is not bounded
    by the interpreter's recursion limit. Equality splits peel off one value
    at a time, which can produce long chains of false branches.
    """
    while not isinstance(node, Leaf):
        if node.question.match(row):
            node = node.true_branch
        else:
            node = node.false_branch
    return node.predictions

def print_leaf(counts):
    """Convert a label -> count dict into label -> whole-percentage strings.

    Each percentage is the count's share of the total, rounded down and
    suffixed with "%". Floor division on ``count * 100`` is used instead of
    ``int(count / total * 100)`` because the float form can land just below
    an exact integer (29 of 100 gave "28%").

    An empty dict yields an empty dict. If every count is zero, each label
    is reported as "0%" instead of raising ZeroDivisionError.
    """
    total = sum(counts.values())
    if not total:
        return {lbl: "0%" for lbl in counts}
    return {
        lbl: str(int(count * 100 // total)) + "%"
        for lbl, count in counts.items()
    }


In [5]:
# Fit the tree on all training rows. build_tree recurses until no question
# yields a positive gain and returns the root node.
dt_model_final = build_tree(training_data)


In [7]:
# Example input
state = "Bihar"
district = "Patna"
season = "Kharif"

# Each query row lists the feature values in header order, without a label.
testing_data = [[state, district, season]]

for row in testing_data:
    # Walk the tree to a leaf and turn its label counts into percentages.
    Predict_dict = print_leaf(classify(row, dt_model_final))

    # Display predictions inside the loop so every row is reported. Printing
    # after the loop showed only the last row and raised NameError when
    # testing_data was empty.
    for key, value in Predict_dict.items():
        print(key, ":", value)


Arhar/Tur : 7%
Jute : 9%
Maize : 4%
Mesta : 9%
Other Kharif pulses : 5%
Sesamum : 8%
Small millets : 5%
Blackgram : 0%
Niger seed : 0%
Ragi : 8%
Sunflower : 4%
Dry ginger : 2%
Horse-gram : 6%
Jowar : 1%
Moong(Green Gram) : 8%
Rice : 2%
Urad : 8%
Groundnut : 3%
Sannhamp : 2%
Potato : 0%
Sweet potato : 0%


In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle # <-- 1. Import the pickle library
import os

# --- Setup ---
# Create the models directory; exist_ok=True makes this a no-op when the
# directory is already there and avoids a separate check-then-create step.
os.makedirs('models', exist_ok=True)

# --- Data Loading ---
# Single source of truth for the input location, reused in the error message.
DATA_PATH = "D:/recomendation/preprocessed2.csv"
try:
    data = pd.read_csv(DATA_PATH)
    print("Successfully loaded preprocessed2.csv")
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found. Please check that the file exists at this path.")
    # Stop with a non-zero status. The bare exit() helper comes from the
    # `site` module and reported success (status 0) on this failure path.
    raise SystemExit(1)

# --- Data Preparation ---
# Strip trailing whitespace in the Season column
data['Season'] = data['Season'].str.rstrip()

# Drop the 'Unnamed: 0' column when the file contains one
if 'Unnamed: 0' in data.columns:
    del data['Unnamed: 0']

# Convert the DataFrame to a list of lists for the custom tree
training_data = data.values.tolist()
# Column names in row order; Question.__repr__ indexes this list by column.
header = ['State_Name', 'District_Name', 'Season', 'Crop']


# --- Decision Tree Classes and Functions (Your Code) ---

def class_counts(Data):
    """Tally how often each label occurs in ``Data``.

    The label of a row is its last element. Returns a plain dict mapping
    label -> number of rows, with keys in first-seen order.
    """
    tally = {}
    for record in Data:
        outcome = record[-1]
        if outcome in tally:
            tally[outcome] += 1
        else:
            tally[outcome] = 1
    return tally

class Question:
    """An equality test against one column of a row.

    ``column`` is the index into a row and ``value`` is what a row must hold
    at that index for the question to match.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Return True when ``example`` holds this question's value in its column."""
        observed = example[self.column]
        return observed == self.value

    def __repr__(self):
        # Depends on the module-level ``header`` list for the column's name.
        return f"Is {header[self.column]} == {self.value}?"

def unique_vals(Data, col):
    """Return the set of distinct values found at index ``col`` across ``Data``."""
    return {record[col] for record in Data}

def partition(Data, question):
    """Split ``Data`` into rows that satisfy ``question`` and rows that do not.

    Returns ``(true_rows, false_rows)``; both lists keep the original row
    order and hold the same row objects that were passed in.
    """
    matched, unmatched = [], []
    for record in Data:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

def gini(Data):
    """Gini impurity of ``Data``: 1 minus the sum of squared label proportions.

    Label proportions come from ``class_counts``. When ``Data`` has no rows
    the loop does not run and the initial value 1 is returned.
    """
    impurity = 1
    # Subtract one squared proportion at a time, in label order, so the
    # floating-point result does not depend on a different summation order.
    for hits in class_counts(Data).values():
        impurity -= (hits / float(len(Data))) ** 2
    return impurity

def info_gain(left, right, current_uncertainty):
    """Impurity reduction achieved by splitting a node into ``left`` and ``right``.

    Computed as ``current_uncertainty`` minus the Gini impurity of each child
    weighted by that child's share of the rows.
    """
    left_size = len(left)
    left_share = float(left_size) / (left_size + len(right))
    left_term = left_share * gini(left)
    right_term = (1 - left_share) * gini(right)
    return current_uncertainty - left_term - right_term

def find_best_split(Data):
    """Search every (column, value) equality question for the highest gain.

    Every column except the last is treated as a feature. Returns
    ``(best_gain, best_question)``; when no question yields a gain above 0
    the result is ``(0, None)``.
    """
    top_gain, top_question = 0, None
    baseline = gini(Data)
    feature_count = len(Data[0]) - 1  # the last column is excluded

    for column in range(feature_count):
        for candidate_value in unique_vals(Data, column):
            candidate = Question(column, candidate_value)
            matched, unmatched = partition(Data, candidate)
            if matched and unmatched:
                # Only questions that put rows on both sides are scored.
                candidate_gain = info_gain(matched, unmatched, baseline)
                # Strict comparison: on a tie the earlier question is kept.
                if candidate_gain > top_gain:
                    top_gain, top_question = candidate_gain, candidate
    return top_gain, top_question

class Leaf:
    """Terminal tree node.

    ``predictions`` holds the label -> row-count dict produced by
    ``class_counts`` for the rows that reached this node.
    """

    def __init__(self, Data):
        label_tally = class_counts(Data)
        self.predictions = label_tally

class Decision_Node:
    """Internal tree node: a question plus the subtree for each answer.

    ``true_branch`` is followed when the question matches a row and
    ``false_branch`` otherwise.
    """

    def __init__(self, question, true_branch, false_branch):
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

def build_tree(Data):
    """Recursively grow a decision tree from ``Data`` and return its root.

    A ``Leaf`` is returned when ``find_best_split`` reports a gain of 0;
    otherwise the rows are partitioned on the best question and a
    ``Decision_Node`` is built over the two subtrees.
    """
    gain, question = find_best_split(Data)
    if gain == 0:
        # No question improves on the current impurity: stop here.
        return Leaf(Data)

    matched, unmatched = partition(Data, question)
    # Arguments are evaluated left to right, so the true side is built first.
    return Decision_Node(question, build_tree(matched), build_tree(unmatched))

def classify(row, node):
    """Route ``row`` down the tree rooted at ``node`` and return the leaf's counts.

    At each ``Decision_Node`` the node's question is asked of ``row``; the
    true branch is followed on a match and the false branch otherwise. The
    return value is the ``predictions`` dict of the ``Leaf`` that is reached.

    The walk is iterative rather than recursive, so its depth is not bounded
    by the interpreter's recursion limit. Equality splits peel off one value
    at a time, which can produce long chains of false branches.
    """
    while not isinstance(node, Leaf):
        if node.question.match(row):
            node = node.true_branch
        else:
            node = node.false_branch
    return node.predictions

def print_leaf(counts):
    """Convert a label -> count dict into label -> whole-percentage strings.

    Each percentage is the count's share of the total, rounded down and
    suffixed with "%". Floor division on ``count * 100`` is used instead of
    ``int(count / total * 100)`` because the float form can land just below
    an exact integer (29 of 100 gave "28%").

    An empty dict yields an empty dict. If every count is zero, each label
    is reported as "0%" instead of raising ZeroDivisionError.
    """
    total = sum(counts.values())
    if not total:
        return {lbl: "0%" for lbl in counts}
    return {
        lbl: str(int(count * 100 // total)) + "%"
        for lbl, count in counts.items()
    }


# --- Main Execution: Build, Save, Load, and Predict ---
# Entry point: build the tree, round-trip it through a pickle file, and run
# one example prediction against the reloaded copy.
if __name__ == '__main__':

    # 1. Build the tree from the training rows prepared above
    print("Building the decision tree...")
    dt_model_final = build_tree(training_data)
    print("Tree built successfully.")

    # 2. Save the trained tree object using pickle
    #    NOTE(review): pickle records classes by module and name, so loading
    #    this file from another program presumably needs Question, Leaf and
    #    Decision_Node importable under the same names - confirm before reuse.
    model_filename = 'models/custom_decision_tree.pkl'
    with open(model_filename, 'wb') as file:
        pickle.dump(dt_model_final, file)
    print(f"Custom tree model saved to {model_filename}")

    # 3. Load the model back from the file just written (round-trip check).
    #    pickle.load can execute arbitrary code; only load files you trust.
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)
    print("Model loaded successfully from file.")

    # 4. Use the LOADED model to make a prediction
    print("\n--- Making a prediction with the loaded model ---")
    state = "Bihar"
    district = "Patna"
    season = "Kharif"
    # Feature values in header order, without a label.
    testing_row = [state, district, season]

    # classify returns the leaf's label counts; print_leaf converts them to
    # percentage strings. The reloaded tree is used, not dt_model_final.
    prediction_dict = print_leaf(classify(testing_row, loaded_model))

    # Display predictions, one label per line
    print(f"Prediction for input: {testing_row}")
    for key, value in prediction_dict.items():
        print(f"  {key}: {value}")


Successfully loaded preprocessed2.csv
Building the decision tree...
Tree built successfully.
Custom tree model saved to models/custom_decision_tree.pkl
Model loaded successfully from file.

--- Making a prediction with the loaded model ---
Prediction for input: ['Bihar', 'Patna', 'Kharif']
  Arhar/Tur: 7%
  Jute: 9%
  Maize: 4%
  Mesta: 9%
  Other Kharif pulses: 5%
  Sesamum: 8%
  Small millets: 5%
  Blackgram: 0%
  Niger seed: 0%
  Ragi: 8%
  Sunflower: 4%
  Dry ginger: 2%
  Horse-gram: 6%
  Jowar: 1%
  Moong(Green Gram): 8%
  Rice: 2%
  Urad: 8%
  Groundnut: 3%
  Sannhamp: 2%
  Potato: 0%
  Sweet potato: 0%
