In [1]:
import pandas as pd
# Load the labelled waste-item dataset and preview its first rows.
data = pd.read_csv("recycling_dataset1.csv")
data.head()

Unnamed: 0,text,category
0,plastic bottle,recyclable
1,water bottle,recyclable
2,empty milk carton,recyclable
3,cardboard box,recyclable
4,shipping box,recyclable


In [2]:
# Display the (rows, columns) size of the loaded frame.
data.shape

(329, 2)

In [3]:
# Dataset size and label balance

# value_counts() skips missing labels, so the per-class counts can add up to
# fewer rows than the shape reports.
class_counts = data["category"].value_counts()

print(f"Dataset Shape: {data.shape}")
print("\nClass Distribution:\n")
print(class_counts)

Dataset Shape: (329, 2)

Class Distribution:

category
recyclable    96
organic       81
trash         51
e-waste       50
hazardous     40
Name: count, dtype: int64


In [4]:
# Clean the raw frame: drop incomplete rows, normalise the text column to
# strings, discard blank descriptions, and renumber the index.

# Drop rows with a missing text or category BEFORE casting to str.
# astype(str) turns NaN into the literal string "nan", which would then pass
# the blank-text filter below and be treated as a real waste item.
data = data.dropna(subset=["text", "category"])

# Convert text column to string (non-string values such as numbers are kept,
# just rendered as text). assign() builds a new frame instead of writing into
# a filtered one.
data = data.assign(text=data["text"].astype(str))

# Remove rows where text is empty or just spaces
data = data[data["text"].str.strip() != ""]

# Reset index so it is contiguous after the rows were removed
data = data.reset_index(drop=True)

print("Cleaned dataset shape:", data.shape)

Cleaned dataset shape: (318, 2)


In [5]:
# Hold out 20% of the samples for evaluation

from sklearn.model_selection import train_test_split

X, y = data["text"], data["category"]

# stratify=y keeps the category proportions the same in both partitions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [6]:
# Discard rows whose text value is missing and renumber the index.
# NOTE(review): X_train / X_test were already taken from `data` in the split
# cell that runs before this one, so this step does not change what the model
# is trained or evaluated on.
data = data.dropna(subset=["text"]).reset_index(drop=True)

print("Missing values removed.")

Missing values removed.


In [7]:
# Count missing values per column; all zeros means no gaps remain.
missing_per_column = data.isna().sum()
print(missing_per_column)

text        0
category    0
dtype: int64


In [8]:
# TF-IDF: turn each item description into a numeric feature vector

from sklearn.feature_extraction.text import TfidfVectorizer

# Single words up to 3-word phrases, English stop words removed,
# vocabulary capped at 7000 features.
tfidf_settings = {
    "ngram_range": (1, 3),
    "stop_words": "english",
    "max_features": 7000,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary is learned from the training split only; the test split is
# mapped onto that same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print("Text converted to numeric features.")

Text converted to numeric features.


In [9]:
# Fit a logistic-regression classifier on the TF-IDF training features

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain.
# max_iter is raised to 2000 iterations for the solver.
model = LogisticRegression(max_iter=2000).fit(X_train_vec, y_train)
print("Model training completed.")

Model training completed.


In [10]:
# Score the logistic-regression model on the held-out test split

from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_vec)

# Per-class precision / recall / F1, followed by overall accuracy.
print("\nClassification Report:\n")
report = classification_report(y_test, y_pred)
print(report)

test_accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", test_accuracy)




Classification Report:

              precision    recall  f1-score   support

     e-waste       1.00      0.60      0.75        10
   hazardous       0.80      0.50      0.62         8
     organic       0.79      0.94      0.86        16
  recyclable       0.68      0.95      0.79        20
       trash       1.00      0.60      0.75        10

    accuracy                           0.78        64
   macro avg       0.85      0.72      0.75        64
weighted avg       0.82      0.78      0.77        64


Accuracy: 0.78125


In [11]:
# Comparison model: Multinomial Naive Bayes on the same TF-IDF features

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train and predict on exactly the same splits as the logistic regression so
# the two accuracy figures are directly comparable.
nb_model = MultinomialNB().fit(X_train_vec, y_train)
y_pred_nb = nb_model.predict(X_test_vec)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.703125


In [12]:
#Confusion Matrix (logistic-regression predictions on the test split)

from sklearn.metrics import confusion_matrix
import pandas as pd

# Pin the label order to model.classes_. Without `labels`, confusion_matrix
# orders rows/columns by the sorted labels that occur in y_test or y_pred, so
# a class missing from both would produce a smaller matrix that no longer
# lines up with the row/column names used below.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the true categories, columns are the predicted categories.
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

print(cm_df)

            e-waste  hazardous  organic  recyclable  trash
e-waste           6          1        1           2      0
hazardous         0          4        1           3      0
organic           0          0       15           1      0
recyclable        0          0        1          19      0
trash             0          0        1           3      6


In [14]:

#Manual Testing
# Interactive loop: type an item description, get the predicted category.
# Type "exit" to stop.

import numpy as np

# Predictions whose top class probability is below this are reported as unclear.
MIN_CONFIDENCE = 0.30

# Known item descriptions, normalised the same way as the user input
# (trimmed + lower-cased) so the exact-match lookup compares like with like.
dataset_text_set = set(data["text"].str.strip().str.lower())

while True:
    user_input = input("Enter waste item (type exit to stop): ").strip().lower()

    if user_input == "exit":
        break

    # 1) Reject empty input
    if user_input == "":
        print("Please enter a valid waste item.")
        continue

    # Vectorise once; every later step works on this same feature vector.
    input_vec = vectorizer.transform([user_input])

    # 2) Description appears verbatim in the dataset: report the model's
    #    prediction without the vocabulary / confidence gates.
    if user_input in dataset_text_set:
        print("Predicted category:", model.predict(input_vec)[0])
        continue

    # 3) Unknown-input check. Ask the vectorizer itself whether any known
    #    feature fired instead of comparing whitespace-split words against the
    #    vocabulary: the vectorizer applies its own tokenisation (punctuation,
    #    stop words, n-grams), so a naive split would wrongly reject inputs
    #    such as "bottle," that the model can in fact handle.
    if input_vec.nnz == 0:
        print("Invalid input. Item not recognized in recycling database.")
        continue

    # 4) Confidence gate: take the label and its probability from the same
    #    predict_proba row so the two printed values always agree.
    probabilities = model.predict_proba(input_vec)[0]
    best_idx = int(np.argmax(probabilities))
    max_prob = probabilities[best_idx]

    if max_prob < MIN_CONFIDENCE:
        print("Invalid or unclear input. Please check spelling or item type.")
    else:
        print("Predicted category:", model.classes_[best_idx])
        print("Confidence:", round(max_prob, 2))

Enter waste item (type exit to stop): bananaaaa
Invalid input. Item not recognized in recycling database.
Enter waste item (type exit to stop): glass perfume bottle
Predicted category: recyclable
Confidence: 0.57
Enter waste item (type exit to stop): exit


In [15]:
#To save files
# Persist the trained classifier and the fitted vectorizer; both are needed
# together to classify new text later.

import pickle

# Context managers guarantee each file is flushed and closed as soon as the
# dump finishes. Passing a bare open(...) to pickle.dump leaves the handle
# open until garbage collection, which can leave a partially written file.
with open("recycling_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Files saved successfully!")

Files saved successfully!
