In [11]:
# Cell 1: Import Required Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

print("All libraries imported successfully!")

All libraries imported successfully!


In [12]:
fruits = [
    'apple', 'banana', 'grapes', 'kiwi', 'mango', 'orange', 'pear', 'pineapple', 'pomegranate', 'watermelon', 
    'apricot', 'avocado', 'blackberry', 'blueberry', 'cherry', 'coconut', 'fig', 'lemon', 'lime', 'nectarine', 
    'papaya', 'peach', 'pear', 'plum', 'raspberry', 'strawberry', 'tangerine', 'apricot', 'dragonfruit', 'lychee', 
    'custardapple', 'melon', 'gooseberry', 'mandarin', 'jackfruit', 'persimmon', 'starfruit', 'pawpaw', 'elderberry', 
    'boysenberry', 'soursop', 'tamarillo', 'longan', 'durian', 'rambutan', 'mulberry', 'carambola', 'salak', 'clementine', 
    'zucchini', 'loquat', 'chayote', 'soursop'
]

In [13]:
vegetables = [
    'beetroot', 'bellpepper', 'cabbage', 'capsicum', 'carrot', 'cauliflower', 'chillpepper', 'corn', 'cucumber', 
    'eggplant', 'garlic', 'ginger', 'jalapeno', 'lemon', 'lettuce', 'onion', 'paprika', 'peas', 'potato', 'soybeans', 
    'spinach', 'sweetcorn', 'sweetpotato', 'tomato', 'turnip', 'radish', 'asparagus', 'artichoke', 'broccoli', 'celery', 
    'chard', 'parsnip', 'pumpkin', 'okra', 'squash', 'zucchini', 'fennel', 'kale', 'brusselsprout', 'bamboo shoots', 
    'bittermelon', 'chili', 'leek', 'chives', 'mustard greens', 'endive', 'watercress', 'cassava', 'coriander', 'turmeric', 
    'sorghum', 'sweet onion', 'shallot', 'yam', 'cabbage', 'arugula', 'butternut squash', 'radicchio', 'beet', 'chicory', 
    'kohlrabi', 'water chestnut', 'turnip greens', 'nopal', 'salsify', 'tarot'
]

In [15]:
texts = fruits + vegetables
labels = [0]*len(fruits) + [1]*len(vegetables)  # 0=fruit, 1=vegetable

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Print dataset info
print("\nTraining samples:")
for i in range(3):
    print(f"Text: {X_train[i]:<8} => {'vegetable' if y_train[i] else 'fruit'}")
print("\nTest samples:")
for i in range(3):
    print(f"Text: {X_test[i]:<8} => {'vegetable' if y_test[i] else 'fruit'}")



Training samples:
Text: rambutan => fruit
Text: blackberry => fruit
Text: coconut  => fruit

Test samples:
Text: capsicum => vegetable
Text: fennel   => vegetable
Text: mango    => fruit


In [16]:
# Character-level tokenization
tokenizer = Tokenizer(char_level=True, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)  # Fit only on training data

# Sequence conversion and padding
max_length = max([len(word) for word in X_train])
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Vocabulary info
vocab_size = len(tokenizer.word_index) + 1  # +1 for OOV
print(f"\nVocabulary size: {vocab_size}")
print(f"Max sequence length: {max_length}")
print("\nExample preprocessing:")
print(f"Original: {X_train[0]}")
print(f"Sequence: {X_train_seq[0]}")
print(f"Padded: {X_train_padded[0]}")



Vocabulary size: 28
Max sequence length: 16

Example preprocessing:
Original: rambutan
Sequence: [4, 2, 15, 14, 13, 6, 2, 7]
Padded: [ 4  2 15 14 13  6  2  7  0  0  0  0  0  0  0  0]


In [17]:
embedding_dim = 16

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(64, 3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

print("\nModel summary:")
model.summary()


Model summary:




In [19]:
history = model.fit(X_train_padded, np.array(y_train),
                   epochs=100,
                   validation_split=0.1,
                   batch_size=8,
                   verbose=1)

Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9215 - loss: 0.4285 - val_accuracy: 0.7000 - val_loss: 0.6723
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9657 - loss: 0.3409 - val_accuracy: 0.7000 - val_loss: 0.6552
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9222 - loss: 0.3447 - val_accuracy: 0.7000 - val_loss: 0.7383
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9900 - loss: 0.2494 - val_accuracy: 0.7000 - val_loss: 0.7716
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9793 - loss: 0.2162 - val_accuracy: 0.7000 - val_loss: 0.8068
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9832 - loss: 0.1941 - val_accuracy: 0.7000 - val_loss: 0.8663
Epoch 7/100
[1m11/11[0m [32m━━━

In [20]:
print("\nModel evaluation:")
loss, accuracy = model.evaluate(X_test_padded, np.array(y_test))
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")



Model evaluation:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.6667 - loss: 1.5204
Test Loss: 1.5204
Test Accuracy: 66.67%


In [21]:
def predict_category(word):
    sequence = tokenizer.texts_to_sequences([word])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    return 'vegetable' if prediction[0][0] > 0.5 else 'fruit'

test_words = ['banana', 'celery', 'mango', 'radish', 'unknownword']
print("\nPredictions:")
for word in test_words:
    print(f"{word:<12} => {predict_category(word)}")


Predictions:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
banana       => fruit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
celery       => fruit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
mango        => fruit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
radish       => vegetable
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
unknownword  => fruit


In [23]:
import pickle

model.save('fruit_veg_model.h5')

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('max_length.pickle', 'wb') as f:
    pickle.dump(max_length, f)

