In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv("/Users/sneka/Downloads/extracted_data_with_contents_full.csv")

print(data.shape)
data.head()

(1013, 4)


Unnamed: 0,ID,URL,Is lighting product?,Extracted_Content
0,P-0,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,FEATURES\n•Different color and beam configurat...
1,P-1,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,Recommended Use:\n•Architectural\n•Landscaping...
2,P-2,https://myelektralite.com/assets/1018-AI-Outdo...,Yes,
3,P-3,https://myelektralite.com/assets/1018-Data-She...,Yes,
4,P-4,https://myelektralite.com/assets/1018AI-Data-S...,Yes,


In [3]:
data.isnull().sum()

ID                       13
URL                      13
Is lighting product?     13
Extracted_Content       277
dtype: int64

In [4]:
df = data.dropna()

In [5]:
print(df.shape)
df.head()

(736, 4)


Unnamed: 0,ID,URL,Is lighting product?,Extracted_Content
0,P-0,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,FEATURES\n•Different color and beam configurat...
1,P-1,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,Recommended Use:\n•Architectural\n•Landscaping...
12,P-12,https://en.everlight.com/wp-content/plugins/It...,No,"EVERLIGHT ELECTRONICS CO.,LTD. \n Everlight E..."
13,P-13,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights..."
14,P-14,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights..."


In [6]:
df['Is lighting product?'].value_counts()

Is lighting product?
Yes    392
No     344
Name: count, dtype: int64

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
import re

In [8]:
# Custom list of stopwords including common words and domain-specific terms
custom_stopwords = set(stopwords.words('english'))
custom_stopwords.update(["none", "some", "something", "another", "other", "more", "less", "many", "few", "and", "the", "for", "is", "to", "with"])  # Add additional terms as needed


In [9]:
def clean_text(text):
    # Check if the text is not NaN
    if pd.isnull(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs and email addresses
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

    # Remove non-alphanumeric characters and numerical values
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stopwords
    #stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in custom_stopwords]
    text = ' '.join(filtered_text)

    # Stemming
    stemmer = PorterStemmer()
    stemmed_text = [stemmer.stem(word) for word in word_tokens]
    text = ' '.join(stemmed_text)
    return text


df['Cleaned_Content'] = df['Extracted_Content'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cleaned_Content'] = df['Extracted_Content'].apply(clean_text)


In [10]:
df.head()

Unnamed: 0,ID,URL,Is lighting product?,Extracted_Content,Cleaned_Content
0,P-0,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,FEATURES\n•Different color and beam configurat...,featur differ color and beam configur avail al...
1,P-1,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,Recommended Use:\n•Architectural\n•Landscaping...,recommend use architectur landscap gener exter...
12,P-12,https://en.everlight.com/wp-content/plugins/It...,No,"EVERLIGHT ELECTRONICS CO.,LTD. \n Everlight E...",everlight electron coltd everlight electron co...
13,P-13,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights...",copyright everlight all right reserv releas da...
14,P-14,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights...",copyright everlight all right reserv releas da...


In [11]:
#Mapping the values

df['target'] = df['Is lighting product?'].map({
    'Yes' : 1,
    'No'  : 0
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['Is lighting product?'].map({


In [12]:
data_vector = TfidfVectorizer()
tranformed_output = data_vector.fit_transform(df['Cleaned_Content'])
print(data_vector.vocabulary_)

{'featur': 6226, 'differ': 4485, 'color': 2995, 'and': 652, 'beam': 1487, 'configur': 3245, 'avail': 1243, 'aluminum': 553, 'alloy': 485, 'hous': 7554, 'high': 7406, 'heat': 7333, 'dissip': 4647, 'perform': 11876, 'univers': 16924, 'power': 12343, 'suppli': 15726, 'easi': 5061, 'instal': 8108, 'year': 18083, 'warranti': 17607, 'electr': 5279, 'input': 8063, 'vac': 17112, 'etl': 5847, 'list': 9253, 'decor': 4083, 'led': 8961, 'dual': 4944, 'light': 9172, 'usa': 17035, 'llc': 9294, 'phone': 11986, 'ailabl': 363, 'blue': 1768, 'green': 7099, 'amber': 567, 'warm': 17599, 'white': 17762, 'cool': 3467, 'red': 13393, 'purpl': 12867, 'photometr': 11999, 'hm': 7466, 'medlux': 9952, 'maxlux': 9902, 'recommend': 13359, 'use': 17048, 'architectur': 877, 'landscap': 8762, 'gener': 6881, 'exterior': 6045, 'of': 11251, 'hotel': 7548, 'sign': 14814, 'flag': 6418, 'multiunit': 10601, 'residenti': 13707, 'commerci': 3068, 'govern': 7045, 'build': 2009, 'up': 16997, 'dusqhw': 4995, 'detail': 4329, 'sourc

In [13]:
data_vector

In [14]:
all_feature_names = data_vector.get_feature_names_out()

for word in all_feature_names:
    indx = data_vector.vocabulary_.get(word)
    print(f"{word} {data_vector.idf_[indx]}")

aa 5.993149979755236
aaac 6.909440711629391
aaad 6.909440711629391
aara 6.503975603521227
aashto 5.523146350509501
aashtoabnt 5.523146350509501
aau 6.503975603521227
aauadjust 6.909440711629391
ab 4.0190689537332265
abaixo 6.503975603521227
abajo 5.523146350509501
abatjour 6.216293531069446
abbiamo 6.909440711629391
abc 5.656677743134023
abccl 6.909440711629391
abcd 6.216293531069446
abdeckt 6.909440711629391
abdeckung 6.909440711629391
abertura 5.993149979755236
abfal 6.909440711629391
abfllen 6.909440711629391
abga 6.909440711629391
abgasstrmen 6.909440711629391
abgasstrom 6.909440711629391
abgeleitet 6.909440711629391
abgeschliffen 6.909440711629391
abgeschlossenen 6.909440711629391
abgestimmten 6.909440711629391
abgetragen 6.909440711629391
abhngig 6.909440711629391
abierta 6.909440711629391
abierto 6.909440711629391
abil 4.076227367573175
abl 4.658148913022896
ablagern 6.909440711629391
ablagerungen 6.909440711629391
abloy 6.909440711629391
abm 6.503975603521227
abment 6.909440711

In [15]:
# tranformed_output.to_array()['Extracted_Content']

In [16]:
# After fitting the TF-IDF vectorizer to your data
tfidf_matrix = data_vector.fit_transform(df['Cleaned_Content'])

# Convert the sparse matrix to a dense matrix for better readability
dense_matrix = tfidf_matrix.toarray()

# Convert the dense matrix to a DataFrame with column names
tfidf_df = pd.DataFrame(dense_matrix, columns=data_vector.get_feature_names_out())

# Print the DataFrame
#print(tfidf_df)

tfidf_matrix


<736x18203 sparse matrix of type '<class 'numpy.float64'>'
	with 149867 stored elements in Compressed Sparse Row format>

In [17]:
# Get the learned vocabulary and IDF scores from the TF-IDF vectorizer
vocabulary = data_vector.get_feature_names_out()
idf_scores = data_vector.idf_

# Create a DataFrame to store the features and their IDF scores
idf_df = pd.DataFrame({'Term': vocabulary, 'IDF Score': idf_scores})

# Sort the DataFrame by IDF scores to see the most important features
idf_df_sorted = idf_df.sort_values(by='IDF Score', ascending=False)

# Print the top features
print("Top features used by TF-IDF for categorization:")
print(idf_df_sorted.head(10))


Top features used by TF-IDF for categorization:
                    Term  IDF Score
9101           lhlirsgwh   6.909441
14177              rvler   6.909441
6589             fornisc   6.909441
6590              fornit   6.909441
11278             ofiici   6.909441
6592            fornitor   6.909441
11277            oficina   6.909441
6594           forspecif   6.909441
6596     fortgeschritten   6.909441
6598   fortschrittlichen   6.909441


In [18]:
# Separate the data into lighting and non-lighting products
lighting_data = df[df['target'] == 1]['Cleaned_Content']
non_lighting_data = df[df['target'] == 0]['Cleaned_Content']

# Fit separate TF-IDF vectorizers for lighting and non-lighting products
lighting_vectorizer = TfidfVectorizer()
non_lighting_vectorizer = TfidfVectorizer()

lighting_tfidf_matrix = lighting_vectorizer.fit_transform(lighting_data)
non_lighting_tfidf_matrix = non_lighting_vectorizer.fit_transform(non_lighting_data)

# Get the vocabulary and IDF scores for lighting and non-lighting products
lighting_vocabulary = lighting_vectorizer.get_feature_names_out()
lighting_idf_scores = lighting_vectorizer.idf_

non_lighting_vocabulary = non_lighting_vectorizer.get_feature_names_out()
non_lighting_idf_scores = non_lighting_vectorizer.idf_

# Create DataFrames to store the features and their IDF scores for lighting and non-lighting products
lighting_idf_df = pd.DataFrame({'Term': lighting_vocabulary, 'IDF Score (Lighting)': lighting_idf_scores})
non_lighting_idf_df = pd.DataFrame({'Term': non_lighting_vocabulary, 'IDF Score (Non-Lighting)': non_lighting_idf_scores})

# Sort the DataFrames by IDF scores to see the most important features
lighting_idf_df_sorted = lighting_idf_df.sort_values(by='IDF Score (Lighting)', ascending=False)
non_lighting_idf_df_sorted = non_lighting_idf_df.sort_values(by='IDF Score (Non-Lighting)', ascending=False)

# Print the top features for lighting and non-lighting products
print("Top features for lighting products:")
print(lighting_idf_df_sorted.tail(50))

print("\nTop features for non-lighting products:")
print(non_lighting_idf_df_sorted.tail(50))


Top features for lighting products:
          Term  IDF Score (Lighting)
2412    fixtur              1.867864
257     applic              1.867864
6694  warranti              1.861822
334         at              1.861822
6678      wall              1.855816
4250       not              1.855816
5049      rate              1.849846
3588     locat              1.843911
1010     chang              1.838011
1434       cri              1.838011
6509       use              1.832146
3548      list              1.832146
507         be              1.792026
3716     lumen              1.786424
6818   without              1.775313
6899      year              1.775313
3024    includ              1.758874
3332      lamp              1.758874
6398      type              1.758874
585      black              1.753454
4431    option              1.748063
5867  standard              1.742701
3054    inform              1.742701
726         by              1.726786
4390        on              1.716314
12

In [19]:
df.head()

Unnamed: 0,ID,URL,Is lighting product?,Extracted_Content,Cleaned_Content,target
0,P-0,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,FEATURES\n•Different color and beam configurat...,featur differ color and beam configur avail al...,1
1,P-1,https://a4b6eb24-cef6-4be0-88b1-aa225dd7e4f7.u...,Yes,Recommended Use:\n•Architectural\n•Landscaping...,recommend use architectur landscap gener exter...,1
12,P-12,https://en.everlight.com/wp-content/plugins/It...,No,"EVERLIGHT ELECTRONICS CO.,LTD. \n Everlight E...",everlight electron coltd everlight electron co...,0
13,P-13,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights...",copyright everlight all right reserv releas da...,1
14,P-14,https://en.everlight.com/wp-content/plugins/It...,Yes,"\n1 \nCopyright © 2010, Everlight All Rights...",copyright everlight all right reserv releas da...,1


In [20]:
X = df['Cleaned_Content']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=df['target'])


In [21]:
print(X_train.shape)

(588,)


In [22]:
print(X_test.shape)

(148,)


In [23]:
y_train.value_counts()

target
1    313
0    275
Name: count, dtype: int64

In [24]:
y_test.value_counts()

target
1    79
0    69
Name: count, dtype: int64

In [25]:
from sklearn.neighbors import KNeighborsClassifier

clf_Knn = Pipeline([
('vectorizer_tfidf', TfidfVectorizer()),
('KNN', KNeighborsClassifier())
])

clf_Knn.fit(X_train, y_train)

y_pred = clf_Knn.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96        69
           1       0.96      0.97      0.97        79

    accuracy                           0.97       148
   macro avg       0.97      0.97      0.97       148
weighted avg       0.97      0.97      0.97       148



In [26]:
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[66  3]
 [ 2 77]]


TRAIN DATA

In [27]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Make predictions on the training data
y_train_pred = clf_Knn.predict(X_train)

# Print classification report for training data
print("Classification Report for Training Data:")
print(classification_report(y_train, y_train_pred))

# Calculate accuracy for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
print("Accuracy on Training Data:", accuracy_train)

# Generate confusion matrix for training data
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix for Training Data:")
print(conf_matrix_train)


Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.97      0.92      0.95       275
           1       0.94      0.98      0.96       313

    accuracy                           0.95       588
   macro avg       0.95      0.95      0.95       588
weighted avg       0.95      0.95      0.95       588

Accuracy on Training Data: 0.9523809523809523
Confusion Matrix for Training Data:
[[254  21]
 [  7 306]]


MISSCLASSIFIED CASES

In [28]:
# # Print misclassified examples for test data
# print("Misclassified examples in test data:")
# for idx, (pred_label, actual_label) in enumerate(zip(y_pred, y_test)):
#     if pred_label != actual_label:
#         print(f"Index: {idx}, Predicted: {pred_label}, Actual: {actual_label}")

# # Print misclassified examples for training data
# print("Misclassified examples in training data:")
# for idx, (pred_label, actual_label) in enumerate(zip(y_train_pred, y_train)):
#     if pred_label != actual_label:
#         print(f"Index: {idx}, Predicted: {pred_label}, Actual: {actual_label}")


In [29]:
# Create a DataFrame to store ID, URL, actual label, and predicted label for training data
train_results = pd.DataFrame({'ID': X_train.index,
                              'URL': df.loc[X_train.index, 'URL'],
                              'Actual': y_train,
                              'Predicted': y_train_pred})

# Find misclassifications in the training data
train_misclassifications = train_results[train_results['Actual'] != train_results['Predicted']]

# Print misclassifications
print("Misclassifications in the training data:")
print(train_misclassifications)


Misclassifications in the training data:
      ID                                                URL  Actual  Predicted
770  770  https://cdn.currentlighting.com/site/specsheet...       1          0
104  104  https://www.houseofantiquehardware.com/Product...       0          1
365  365  http://www.solidapollo.com/PDF/3528-24W-LED-St...       1          0
124  124  https://www.sstlighting.com/CatalogPages/Drive...       0          1
313  313  https://www.mullanlighting.com/us/mst_attachme...       0          1
538  538  https://prod-edam.honeywell.com/content/dam/ho...       1          0
779  779  https://cdn.currentlighting.com/site/specsheet...       0          1
927  927  https://matthewsfanco.com/wp-content/uploads/2...       0          1
921  921  https://s3.amazonaws.com/files.siemon.com/int-...       0          1
931  931  https://matthewsfanco.com/wp-content/uploads/B...       0          1
326  326  https://store-43185.mybigcommerce.com/content/...       1          0
55    55  h

In [30]:
# Create a DataFrame to store ID, URL, actual label, and predicted label for test data
test_results = pd.DataFrame({'ID': X_test.index,
                             'URL': df.loc[X_test.index, 'URL'],
                             'Actual': y_test,
                             'Predicted': y_pred})

# Find misclassifications in the test data
test_misclassifications = test_results[test_results['Actual'] != test_results['Predicted']]

# Print misclassifications
print("Misclassifications in the test data:")
print(test_misclassifications)


Misclassifications in the test data:
      ID                                                URL  Actual  Predicted
819  819  https://cdn.currentlighting.com/site/specsheet...       1          0
111  111  https://www.houseofantiquehardware.com/Product...       0          1
428  428  https://wp4d77.p3cdn1.secureserver.net/wp-cont...       0          1
833  833  https://www.pennconduit.com/wp-content/uploads...       0          1
116  116    http://primolanterns.com/img/Drawings/AT-32.pdf       1          0


In [31]:
X_test[:5]

819    switchabl lumen and cct technolog maxim invent...
823    srt edgelit ceilingsurfacegarag currentlightin...
804    celem recess emerg light currentlightingcomcom...
970    ten standard design make up the cscx line of s...
677    omni low voltag ultrason ceil sensor ceil moun...
Name: Cleaned_Content, dtype: object

In [32]:
X_test[:5][823]

'srt edgelit ceilingsurfacegarag currentlightingcombeacon hli solut inc all right reserv inform and specif subject to chang without notic all valu are design or typic valu when measur under laboratori condit page of rev beasrtedgelitr featur for ceil mount and park garag applic from an foot mount height edgelit flat len for optim visual comfort and uniform across the len two optic distribut specif design for park garag and canopi applic are avail make the beacon edgelit luminair both versatil and function ulcul list for wet locat ip and g vibrat rate occup sensor avail for complet onoff and dim oper date locat type project hous diecast aluminum hous ensur long electr compon life and luminair perform corros resist powder coat finish both protect and provid architectur appear one piec mold silicon gasket ensur weather proof seal thermal isol driver mount to dedic bracket reduc oper temperatur and increas driver life and reliabl torx head screw standard for tamper resist hous optic edgeli

In [33]:
y_test[:148]

819    1
823    1
804    1
970    0
677    0
      ..
698    0
257    1
794    1
993    0
182    0
Name: target, Length: 148, dtype: int64

In [34]:
y_pred[:148]

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0])

In [35]:
message = "ten standard design make up the cscx line of s"
result = clf_Knn.predict([message])
print("Result: ", result[0])


Result:  0


In [36]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.neighbors import KNeighborsClassifier

# # Define parameter grid
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9],  # Different values of k
#     'weights': ['uniform', 'distance'],  # Weighting scheme for neighbors
#     'metric': ['euclidean', 'manhattan', 'cosine']  # Different distance metrics
# }

# # Initialize KNN classifier
# knn = KNeighborsClassifier()

# # Perform grid search cross-validation
# grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train, y_train)

# # Get best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters:", best_params)
# print("Best Score:", best_score)


NEW TEST DATA

In [37]:
# import pandas as pd

# # Load the new unseen dataset
# test_data = pd.read_csv("/Users/sneka/Downloads/extracted_data_with_contents_full_test.csv").dropna()


# # Apply the same text cleaning to the new dataset
# test_data['Cleaned_Content'] = test_data['Extracted_Content'].apply(clean_text)

# # Transform the cleaned text data using the TF-IDF vectorizer
# test_tfidf = data_vector.transform(test_data['Cleaned_Content'])

# # Make predictions using the trained model directly on the transformed data
# predictions = clf_Knn.predict(test_tfidf)

# # Calculate model accuracy on the test data
# accuracy = accuracy_score(test_data['target'], predictions)
# print("Model Accuracy on Test Data:", accuracy)

# # Generate classification report for the test data
# report = classification_report(test_data['target'], predictions)
# print("Classification Report for Test Data:")
# print(report)

# # Print predictions
# print("Predictions on Test Data:", predictions)


MULTINOMIAL NAIVE BAYES

In [38]:
from sklearn.naive_bayes import MultinomialNB

clf_nb = Pipeline([
('vectorizer_tfidf', TfidfVectorizer()),
('MNN', MultinomialNB())
])

clf_nb.fit(X_train, y_train)

y_pred = clf_nb.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94        69
           1       0.91      1.00      0.95        79

    accuracy                           0.95       148
   macro avg       0.95      0.94      0.95       148
weighted avg       0.95      0.95      0.95       148



In [39]:
X_test[:5]

819    switchabl lumen and cct technolog maxim invent...
823    srt edgelit ceilingsurfacegarag currentlightin...
804    celem recess emerg light currentlightingcomcom...
970    ten standard design make up the cscx line of s...
677    omni low voltag ultrason ceil sensor ceil moun...
Name: Cleaned_Content, dtype: object

In [40]:
y_test[:5]

819    1
823    1
804    1
970    0
677    0
Name: target, dtype: int64

In [41]:
y_pred[:5]

array([1, 1, 1, 0, 0])

In [42]:
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[61  8]
 [ 0 79]]


In [43]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Make predictions on the training data
y_train_pred = clf_nb.predict(X_train)

# Print classification report for training data
print("Classification Report for Training Data:")
print(classification_report(y_train, y_train_pred))

# Calculate accuracy for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
print("Accuracy on Training Data:", accuracy_train)

# Generate confusion matrix for training data
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix for Training Data:")
print(conf_matrix_train)


Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.99      0.92      0.95       275
           1       0.93      0.99      0.96       313

    accuracy                           0.96       588
   macro avg       0.96      0.95      0.96       588
weighted avg       0.96      0.96      0.96       588

Accuracy on Training Data: 0.95578231292517
Confusion Matrix for Training Data:
[[252  23]
 [  3 310]]


USER INPUT

In [44]:
message = "ten standard design make up the cscx line of s"
result = clf_nb.predict([message])
print("Result: ", result[0])


Result:  0


MISSCLASSIFICATIONS

In [45]:
# Create a DataFrame to store ID, URL, actual label, and predicted label for training data
train_results = pd.DataFrame({'ID': X_train.index,
                              'URL': df.loc[X_train.index, 'URL'],
                              'Actual': y_train,
                              'Predicted': y_train_pred})

# Find misclassifications in the training data
train_misclassifications = train_results[train_results['Actual'] != train_results['Predicted']]

# Print misclassifications
print("Misclassifications in the training data:")
print(train_misclassifications)


Misclassifications in the training data:
      ID                                                URL  Actual  Predicted
770  770  https://cdn.currentlighting.com/site/specsheet...       1          0
498  498  https://www.jlc-tech.com/wp-content/uploads/20...       0          1
104  104  https://www.houseofantiquehardware.com/Product...       0          1
124  124  https://www.sstlighting.com/CatalogPages/Drive...       0          1
490  490  https://www.jlc-tech.com/wp-content/uploads/20...       0          1
313  313  https://www.mullanlighting.com/us/mst_attachme...       0          1
538  538  https://prod-edam.honeywell.com/content/dam/ho...       1          0
90    90  https://www.b-td.com/s/INOS_SideTable_TechShee...       0          1
780  780  https://cdn.currentlighting.com/site/specsheet...       0          1
927  927  https://matthewsfanco.com/wp-content/uploads/2...       0          1
832  832  https://www.pennconduit.com/wp-content/uploads...       0          1
675  675  h

In [46]:
# Create a DataFrame to store ID, URL, actual label, and predicted label for test data
test_results = pd.DataFrame({'ID': X_test.index,
                             'URL': df.loc[X_test.index, 'URL'],
                             'Actual': y_test,
                             'Predicted': y_pred})

# Find misclassifications in the test data
test_misclassifications = test_results[test_results['Actual'] != test_results['Predicted']]

# Print misclassifications
print("Misclassifications in the test data:")
print(test_misclassifications)


Misclassifications in the test data:
      ID                                                URL  Actual  Predicted
111  111  https://www.houseofantiquehardware.com/Product...       0          1
80    80  https://cdn2.toro.com/en/-/media/Files/Unique/...       0          1
491  491  https://www.jlc-tech.com/wp-content/uploads/20...       0          1
114  114    http://primolanterns.com/img/Drawings/AT-22.pdf       0          1
428  428  https://wp4d77.p3cdn1.secureserver.net/wp-cont...       0          1
359  359  http://solidapollo.com/PDF/Magro-RGB-W-Control...       0          1
833  833  https://www.pennconduit.com/wp-content/uploads...       0          1
698  698  http://www.puroluxco.com/__static/0cb1f0423f88...       0          1


SUPPORT VECTOR MACHINES

In [47]:
from sklearn.svm import SVC

clf_sv = Pipeline([
('vectorizer_tfidf', TfidfVectorizer()),
('MNN', LinearSVC())
])

clf_sv.fit(X_train, y_train)

y_pred = clf_sv.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        69
           1       0.98      1.00      0.99        79

    accuracy                           0.99       148
   macro avg       0.99      0.99      0.99       148
weighted avg       0.99      0.99      0.99       148



In [48]:
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[67  2]
 [ 0 79]]


In [49]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Make predictions on the training data
y_train_pred = clf_sv.predict(X_train)

# Print classification report for training data
print("Classification Report for Training Data:")
print(classification_report(y_train, y_train_pred))

# Calculate accuracy for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
print("Accuracy on Training Data:", accuracy_train)

# Generate confusion matrix for training data
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix for Training Data:")
print(conf_matrix_train)


Classification Report for Training Data:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       275
           1       0.99      1.00      0.99       313

    accuracy                           0.99       588
   macro avg       0.99      0.99      0.99       588
weighted avg       0.99      0.99      0.99       588

Accuracy on Training Data: 0.9931972789115646
Confusion Matrix for Training Data:
[[271   4]
 [  0 313]]


In [50]:
X_test[:5]

819    switchabl lumen and cct technolog maxim invent...
823    srt edgelit ceilingsurfacegarag currentlightin...
804    celem recess emerg light currentlightingcomcom...
970    ten standard design make up the cscx line of s...
677    omni low voltag ultrason ceil sensor ceil moun...
Name: Cleaned_Content, dtype: object

In [51]:
y_test[:5]

819    1
823    1
804    1
970    0
677    0
Name: target, dtype: int64

In [52]:
y_pred[:5]

array([1, 1, 1, 0, 0])

In [53]:
X_test.shape

(148,)

In [54]:
message = "ten standard design make up the cscx line of s"
result = clf_sv.predict([message])
print("Result: ", result[0])


Result:  0


In [55]:
#X_train[:5]

In [56]:
#y_train[:5]

USER INPUT

In [57]:
message = "ten standard design make up the cscx line of s"
result = clf_sv.predict([message])
print("Result: ", result[0])


Result:  0


MISSCLASSIFICATIONS

In [58]:
# Create a DataFrame to store ID, URL, actual label, and predicted label for training data
train_results = pd.DataFrame({'ID': X_train.index,
                              'URL': df.loc[X_train.index, 'URL'],
                              'Actual': y_train,
                              'Predicted': y_train_pred})

# Find misclassifications in the training data
train_misclassifications = train_results[train_results['Actual'] != train_results['Predicted']]

# Print misclassifications
print("Misclassifications in the training data:")
print(train_misclassifications)


Misclassifications in the training data:
      ID                                                URL  Actual  Predicted
313  313  https://www.mullanlighting.com/us/mst_attachme...       0          1
315  315  https://www.mullanlighting.com/us/mst_attachme...       0          1
110  110  https://www.houseofantiquehardware.com/Product...       0          1
314  314  https://www.mullanlighting.com/us/mst_attachme...       0          1


In [59]:
# Create a DataFrame to store ID, URL, actual label, and predicted label for test data
test_results = pd.DataFrame({'ID': X_test.index,
                             'URL': df.loc[X_test.index, 'URL'],
                             'Actual': y_test,
                             'Predicted': y_pred})

# Find misclassifications in the test data
test_misclassifications = test_results[test_results['Actual'] != test_results['Predicted']]

# Print misclassifications
print("Misclassifications in the test data:")
print(test_misclassifications)


Misclassifications in the test data:
      ID                                                URL  Actual  Predicted
111  111  https://www.houseofantiquehardware.com/Product...       0          1
114  114    http://primolanterns.com/img/Drawings/AT-22.pdf       0          1
