In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 



In [2]:
# Load the data
data = pd.read_csv("/Users/sneka/Downloads/extracted_data_with_contents_full.csv") 
data.isnull().sum()
df = data.dropna()

In [3]:
df.head()

Unnamed: 0,ID,URL,Is lighting product?,Extracted_Content
0,P-0,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,FEATURES\n•Different color and beam configurat...
1,P-1,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,Recommended Use:\n•Architectural\n•Landscaping...
12,P-12,https://en.everlight.com/wp-content/plugins/It...,No,"EVERLIGHT ELECTRONICS CO.,LTD. \n Everlight E..."
13,P-13,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights..."
14,P-14,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights..."


In [4]:


# Custom list of stopwords including common words and domain-specific terms
custom_stopwords = set(stopwords.words('english'))
custom_stopwords.update(["none", "some", "something", "another", "other", "more", "less", "many", "few"])  # Add additional terms as needed

def clean_text(text):
    # Check if the text is not NaN
    if pd.isnull(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs and email addresses
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

    # Remove non-alphanumeric characters and numerical values
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    word_tokens = word_tokenize(text)
    
    # Remove stopwords and perform stemming
    stemmer = PorterStemmer()
    filtered_text = [stemmer.stem(word) for word in word_tokens if word not in custom_stopwords]
    
    # Join the stemmed words back into a single string
    text = ' '.join(filtered_text)
    
    return text

# Clean the text data
df['Cleaned_Content'] = df['Extracted_Content'].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cleaned_Content'] = df['Extracted_Content'].apply(clean_text)


In [5]:
df.head()

Unnamed: 0,ID,URL,Is lighting product?,Extracted_Content,Cleaned_Content
0,P-0,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,FEATURES\n•Different color and beam configurat...,featur differ color beam configur avail alumin...
1,P-1,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,Recommended Use:\n•Architectural\n•Landscaping...,recommend use architectur landscap gener exter...
12,P-12,https://en.everlight.com/wp-content/plugins/It...,No,"EVERLIGHT ELECTRONICS CO.,LTD. \n Everlight E...",everlight electron coltd everlight electron co...
13,P-13,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights...",copyright everlight right reserv releas date s...
14,P-14,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights...",copyright everlight right reserv releas date d...


In [6]:

# Split the data into training and testing sets
X = df['Cleaned_Content']
y = df['Is lighting product?']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)



In [8]:
custom_stopwords_list = list(custom_stopwords)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000,stop_words=custom_stopwords_list) 
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [9]:
# Train the Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)



In [10]:
# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)



In [11]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))



Accuracy Score: 0.918918918918919
Confusion Matrix:
[[71 10]
 [ 2 65]]
Classification Report:
              precision    recall  f1-score   support

          No       0.97      0.88      0.92        81
         Yes       0.87      0.97      0.92        67

    accuracy                           0.92       148
   macro avg       0.92      0.92      0.92       148
weighted avg       0.92      0.92      0.92       148



In [19]:
# Get class priors
class_priors = classifier.class_log_prior_

In [20]:
class_priors

array([-0.80457292, -0.59290177])

In [21]:
# Get class labels
class_labels = classifier.classes_


In [22]:
class_labels

array(['No', 'Yes'], dtype='<U3')

In [25]:
# Print class priors
print("Class Priors:")
for i, class_label in enumerate(class_labels):
    print("Class:", class_label, "- Log Prior:", class_priors[i])

# Print class-conditional probabilities
print("\nClass-Conditional Probabilities (Log Probabilities):")
for i, class_label in enumerate(class_labels):
    print("\nClass:", class_label)
    for j, word in enumerate(vocabulary):
        print("Word:", word, "- Log Probability:", class_conditional_probs[i, j])

Class Priors:
Class: No - Log Prior: -0.8045729157208621
Class: Yes - Log Prior: -0.5929017655688895

Class-Conditional Probabilities (Log Probabilities):

Class: No
Word: ac - Log Probability: 0.0038664473695547716
Word: accept - Log Probability: 0.0005945221309749394
Word: access - Log Probability: 0.0006656918322931958
Word: accessori - Log Probability: 0.0010311100727873473
Word: accord - Log Probability: 0.0004288845460778868
Word: acryl - Log Probability: 5.984026901887259e-05
Word: activ - Log Probability: 0.0006095032851321897
Word: ad - Log Probability: 0.0006262420732843451
Word: adalet - Log Probability: 0.0015409980240622307
Word: adapt - Log Probability: 0.00041979845902951485
Word: add - Log Probability: 0.0004353639569253046
Word: addit - Log Probability: 0.001375304522643668
Word: address - Log Probability: 0.0006477964536182992
Word: adjust - Log Probability: 0.0020667894416044824
Word: ah - Log Probability: 0.0011670264789358
Word: air - Log Probability: 0.00076868176

In [12]:
# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()



In [16]:
feature_names

array(['ac', 'accept', 'access', 'accessori', 'accord', 'acryl', 'activ',
       'ad', 'adalet', 'adapt', 'add', 'addit', 'address', 'adjust', 'ah',
       'air', 'al', 'alarm', 'alimentacin', 'allow', 'also', 'aluminum',
       'alway', 'ambient', 'america', 'american', 'amp', 'anchor',
       'angel', 'angl', 'antiqu', 'app', 'appear', 'appli', 'applic',
       'appliqu', 'appropri', 'approv', 'architectur', 'area', 'arm',
       'arrir', 'arrow', 'asm', 'assembl', 'atex', 'attach', 'au',
       'author', 'auto', 'automat', 'avail', 'avec', 'averag', 'avoid',
       'awg', 'back', 'backup', 'ballast', 'bar', 'barrier', 'base',
       'batteri', 'bay', 'bb', 'beam', 'bl', 'black', 'blank', 'block',
       'blue', 'bluetooth', 'board', 'bodi', 'bolt', 'botier', 'bottom',
       'bouton', 'box', 'bracket', 'branch', 'brand', 'brass', 'break',
       'breaker', 'bright', 'bronz', 'brush', 'bs', 'bug', 'build',
       'builtin', 'bulb', 'button', 'buy', 'ca', 'cabinet', 'cabl',
       'ca

In [13]:
# Get class-wise feature counts
class_feature_counts = classifier.feature_count_



In [17]:
class_feature_counts

array([[6.17744487, 0.94987138, 1.06357961, ..., 0.        , 3.67566876,
        1.42869856],
       [2.73535096, 0.57402574, 0.94190997, ..., 1.39979803, 1.53044016,
        0.        ]])

In [14]:
# Calculate class-conditional probabilities
class_conditional_probs = class_feature_counts / class_feature_counts.sum(axis=1, keepdims=True)

In [18]:
class_conditional_probs

array([[0.00386645, 0.00059452, 0.00066569, ..., 0.        , 0.00230059,
        0.00089422],
       [0.00121531, 0.00025504, 0.00041849, ..., 0.00062193, 0.00067997,
        0.        ]])

In [15]:
# Print feature importance (class-conditional probabilities)
for i, class_label in enumerate(classifier.classes_):
    print("\nClass:", class_label)
    class_probs = class_conditional_probs[i]
    sorted_indices = class_probs.argsort()[::-1]  # Sort indices in descending order of probability
    for index in sorted_indices:
        print(feature_names[index], ":", class_probs[index])


Class: No
mm : 0.015591810818819916
control : 0.009473735569190182
de : 0.008830979158189317
sensor : 0.007183003710578399
cabl : 0.006419801120214298
system : 0.006388472481861242
wire : 0.006239018193510341
power : 0.005978724606114038
nvent : 0.005832358806248257
use : 0.005382431097565805
switch : 0.005123034609304896
product : 0.004644791896031002
standard : 0.004614651601405828
cm : 0.004505992243182656
load : 0.004429992303588792
vac : 0.004403243057474685
scale : 0.004300043617019953
steel : 0.0042903304266413604
instal : 0.004288009122277926
mount : 0.004146745018875017
number : 0.004060853923970197
output : 0.003956985463784719
specif : 0.0038791456747519068
ac : 0.0038664473695547716
connector : 0.0038235593913089373
oper : 0.0037379124417168943
input : 0.003698633149276065
light : 0.0036676881932190086
pole : 0.0036401574048883292
relay : 0.0035802370108909782
time : 0.00349700696023699
design : 0.0034516662694711576
rang : 0.0033493838408332477
suppli : 0.0033173283036291

CATEGORIES

In [28]:
import numpy as np

In [32]:
# Assuming class_conditional_probs contains the class-conditional probabilities
# Get the index of the "lighting product" class
lighting_index = np.where(class_labels == 'Yes')[0][0]

# Get the index of the "non-lighting product" class
non_lighting_index = np.where(class_labels == 'No')[0][0]

# Iterate over vocabulary and compare probabilities for each word
for i, word in enumerate(feature_names):
    prob_lighting = class_conditional_probs[lighting_index, i]
    prob_non_lighting = class_conditional_probs[non_lighting_index, i]
    
    # Print the word if its probability for one class is significantly higher than the other
    if prob_lighting > prob_non_lighting:
        print(f'Word "{word}" is indicative of lighting products.')
    elif prob_non_lighting > prob_lighting:
        print(f'Word "{word}" is indicative of non-lighting products.')




Word "ac" is indicative of non-lighting products.
Word "accept" is indicative of non-lighting products.
Word "access" is indicative of non-lighting products.
Word "accessori" is indicative of lighting products.
Word "accord" is indicative of lighting products.
Word "acryl" is indicative of lighting products.
Word "activ" is indicative of non-lighting products.
Word "ad" is indicative of non-lighting products.
Word "adalet" is indicative of non-lighting products.
Word "adapt" is indicative of non-lighting products.
Word "add" is indicative of non-lighting products.
Word "addit" is indicative of lighting products.
Word "address" is indicative of non-lighting products.
Word "adjust" is indicative of lighting products.
Word "ah" is indicative of non-lighting products.
Word "air" is indicative of non-lighting products.
Word "al" is indicative of non-lighting products.
Word "alarm" is indicative of non-lighting products.
Word "alimentacin" is indicative of non-lighting products.
Word "allow"

In [33]:
# Assuming class_conditional_probs contains the class-conditional probabilities
# Get the index of the "lighting product" class
lighting_index = np.where(class_labels == 'Yes')[0][0]

# Get the index of the "non-lighting product" class
non_lighting_index = np.where(class_labels == 'No')[0][0]

# Separate words for the "lighting product" category
lighting_words = []
# Separate words for the "non-lighting product" category
non_lighting_words = []

for i, word in enumerate(feature_names):
    prob_lighting = class_conditional_probs[lighting_index, i]
    prob_non_lighting = class_conditional_probs[non_lighting_index, i]
    
    # Check if the probability for lighting product is significantly higher than for non-lighting product
    if prob_lighting > prob_non_lighting:
        lighting_words.append(word)
    
    # Check if the probability for non-lighting product is significantly higher than for lighting product
    elif prob_non_lighting > prob_lighting:
        non_lighting_words.append(word)

print("Words indicative of lighting products:", lighting_words)
print("Words indicative of non-lighting products:", non_lighting_words)


Words indicative of lighting products: ['accessori', 'accord', 'acryl', 'addit', 'adjust', 'allow', 'aluminum', 'ambient', 'america', 'angel', 'angl', 'antiqu', 'app', 'appear', 'applic', 'appliqu', 'approv', 'architectur', 'arrow', 'attach', 'avail', 'averag', 'base', 'beam', 'bl', 'black', 'board', 'bodi', 'bottom', 'brass', 'bright', 'bronz', 'brush', 'bulb', 'ca', 'cablechain', 'california', 'canada', 'candela', 'canopi', 'cap', 'carton', 'cast', 'cc', 'cct', 'cdklm', 'ce', 'cec', 'ceil', 'center', 'certif', 'chain', 'chang', 'chip', 'cl', 'class', 'clean', 'clear', 'clearanc', 'clip', 'cm', 'coat', 'color', 'colour', 'combin', 'come', 'compani', 'compart', 'compat', 'complex', 'condit', 'consist', 'construct', 'consumpt', 'contain', 'cool', 'copper', 'copyright', 'cord', 'cordcabl', 'core', 'corner', 'corpor', 'cost', 'costa', 'cr', 'craft', 'creat', 'cri', 'criterion', 'cs', 'current', 'currentlightingcombeacon', 'currentlightingcomlitecontrol', 'currentlightingcomprescolit', 'cu