<a href="https://colab.research.google.com/github/SunbalAzizLCWU/BSSE-DS-Project/blob/main/SunbalW11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
print("--- Step 1: Creating a Synthetic Text Dataset ---")
# Since we don't have text data, we create a small dataset manually
# to demonstrate NLP competence as required by the syllabus.

--- Step 1: Creating a Synthetic Text Dataset ---


In [3]:
data = [
    # Cardboard
    ("brown cardboard box", "cardboard"),
    ("corrugated packaging box", "cardboard"),
    ("heavy duty shipping box", "cardboard"),
    ("pizza box with grease", "cardboard"),
    ("brown paper carton", "cardboard"),

    # Glass
    ("clear glass bottle", "glass"),
    ("green wine bottle", "glass"),
    ("broken window pane", "glass"),
    ("glass jar of jam", "glass"),
    ("transparent beer bottle", "glass"),

    # Metal
    ("aluminum soda can", "metal"),
    ("rusty iron pipe", "metal"),
    ("tin food can", "metal"),
    ("steel cutlery spoon", "metal"),
    ("copper wire scrap", "metal"),

    # Paper
    ("white office paper", "paper"),
    ("newspaper daily edition", "paper"),
    ("glossy magazine pages", "paper"),
    ("shredded documents", "paper"),
    ("paperback book", "paper"),

    # Plastic
    ("clear plastic water bottle", "plastic"),
    ("plastic grocery bag", "plastic"),
    ("yogurt container cup", "plastic"),
    ("plastic straw and lid", "plastic"),
    ("soda bottle cap", "plastic"),

    # Trash (Misc)
    ("dirty food wrapper", "trash"),
    ("mixed medical waste", "trash"),
    ("used cigarette butt", "trash"),
    ("diapers and hygiene", "trash"),
    ("ceramics and pottery", "trash")
]

# Multiply dataset to simulate 'training' volume
data = data * 5

df = pd.DataFrame(data, columns=['text', 'label'])
print(f"âœ… Created text dataset with {len(df)} samples.")
print(df.head())

print("\n--- Step 2: NLP Preprocessing (Tokenization & Vectorization) ---")

âœ… Created text dataset with 150 samples.
                       text      label
0       brown cardboard box  cardboard
1  corrugated packaging box  cardboard
2   heavy duty shipping box  cardboard
3     pizza box with grease  cardboard
4        brown paper carton  cardboard

--- Step 2: NLP Preprocessing (Tokenization & Vectorization) ---


In [4]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.3, random_state=42
)

# Build an NLP Pipeline
# 1. CountVectorizer: Converts text to a matrix of token counts (Tokenization)
# 2. TfidfTransformer: Normalizes counts to "Term Frequency-Inverse Document Frequency"
# 3. MultinomialNB: Naive Bayes classifier (Standard for text)
nlp_pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')), # Removes "the", "a", "is"
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

print("Training NLP Model...")
nlp_pipeline.fit(X_train, y_train)

Training NLP Model...


In [5]:
print("\n--- Step 3: Evaluation ---")
y_pred = nlp_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"ðŸŽ‰ NLP Model Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Step 3: Evaluation ---
ðŸŽ‰ NLP Model Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

   cardboard       1.00      1.00      1.00         5
       glass       1.00      1.00      1.00         7
       metal       1.00      1.00      1.00         8
       paper       1.00      1.00      1.00         8
     plastic       1.00      1.00      1.00         7
       trash       1.00      1.00      1.00        10

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [6]:
print("\n--- Step 4: Live Test ---")
test_phrases = ["I have a coke can", "a pile of newspapers", "broken beer bottle"]
predictions = nlp_pipeline.predict(test_phrases)

for phrase, pred in zip(test_phrases, predictions):
    print(f"Input: '{phrase}' --> Predicted: {pred.upper()}")


--- Step 4: Live Test ---
Input: 'I have a coke can' --> Predicted: CARDBOARD
Input: 'a pile of newspapers' --> Predicted: CARDBOARD
Input: 'broken beer bottle' --> Predicted: GLASS
