In [632]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
import timeit

# Per-model results keyed by algorithm name: accuracy in `scores`,
# elapsed seconds in `times`. Two separate dict objects.
scores, times = {}, {}

In [633]:
# Load the product catalogue from a CSV in the working directory and preview
# the first rows: a free-text Product name plus its Category label.
df = pd.read_csv("devices-products.csv")
df.head()

Unnamed: 0,Product,Category
0,Apple 13-inch MacBook Air (M1 CPU) 256GB - 2020,Laptop
1,Apple 13-inch MacBook Air (M1 CPU) 512GB - 2020,Laptop
2,Apple 13-inch MacBook Air with Retina display ...,Laptop
3,Apple 13-inch MacBook Air with Retina display ...,Laptop
4,Apple 13-inch MacBook Pro (1.4GHz quad-core CP...,Laptop


In [634]:
# (rows, columns) of the loaded frame.
df.shape

(1226, 2)

In [635]:
# Rows per category, most frequent first; shows how unevenly the labels
# are distributed across classes.
df['Category'].value_counts()

Category
Laptop            452
Monitor           296
Desktop           259
Server             55
Smartphone         50
IoT                30
Tablet             22
Thin Client        16
Printer            11
Hard drive         11
Gaming              5
Workstation         4
Multimedia          4
Network             4
Entertainment       2
Converged Edge      2
Converged           2
SAN/NAS             1
Name: count, dtype: int64

In [636]:
# Features are the raw product-name strings; targets are the category labels.
# .to_numpy() is the pandas-recommended replacement for .values and always
# returns a NumPy ndarray regardless of the column's backing dtype.
x = df['Product'].to_numpy()
y = df['Category'].to_numpy()

In [637]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [638]:
# Fit the TF-IDF vectorizer on the training names only, then reuse the
# fitted vectorizer (no refit) to transform the test names.
tfidf_vectorizer = TfidfVectorizer() 
tfidf_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_vectors = tfidf_vectorizer.transform(x_test)

In [639]:
# Random forest: time fit + predict on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) so the forest,
# and therefore the accuracy reported for it, is the same on every re-run.
start = timeit.default_timer()
clf_random_forest = RandomForestClassifier(random_state=0)
clf_random_forest.fit(tfidf_train_vectors, y_train)
y_random_forest_pred = clf_random_forest.predict(tfidf_test_vectors)
times['random_forest'] = timeit.default_timer() - start

In [640]:
# Side-by-side view of each test product with the random forest's predicted
# category and the true category. Column order follows the dict's key order.
df_compare = pd.DataFrame({
    'product': x_test,
    'predicted_category': y_random_forest_pred,
    'real_category': y_test,
})
df_compare

Unnamed: 0,product,predicted_category,real_category
0,Lenovo ThinkPad T440s,Laptop,Laptop
1,Apple 13-inch MacBook Pro (2.0GHz quad-core CP...,Laptop,Laptop
2,Lenovo IdeaPad Slim 7 15/Yoga Slim 7 15,Laptop,Laptop
3,Dell E2220H Monitor,Monitor,Monitor
4,Lexmark CX924dxe,Printer,Printer
...,...,...,...
241,Dell PowerEdge T130,Server,Server
242,Apple 14-inch MacBook Pro with 64GB,Laptop,Laptop
243,Seagate Makara HDD 8TB,Hard drive,Hard drive
244,HP E24i G4 HO,Monitor,Monitor


In [641]:
# Accuracy of the random forest on the held-out test set.
scores['random_forest'] = accuracy_score(
    y_true=y_test,
    y_pred=y_random_forest_pred,
)

In [642]:
# Spot check: vectorize two hand-written product names with the fitted
# TF-IDF vectorizer and classify them with the random forest.
future_x_test = tfidf_vectorizer.transform([
    'Samsung Galaxy S39',
    'iPad Future',
])
future_y_pred = clf_random_forest.predict(future_x_test)
future_y_pred

array(['Smartphone', 'Tablet'], dtype=object)

In [643]:
# k-nearest neighbours (k=19) on the TF-IDF features.
start = timeit.default_timer()

clf_knn = KNeighborsClassifier(n_neighbors=19)
clf_knn.fit(tfidf_train_vectors, y_train)
y_knn_pred = clf_knn.predict(tfidf_test_vectors)
# Stop the clock before scoring so the timing covers fit + predict only,
# the same span that is measured for the random forest.
times['k_nearest_neighbors'] = timeit.default_timer() - start
scores['k_nearest_neighbors'] = accuracy_score(y_test, y_knn_pred)

In [644]:
# Classify the two hand-written product names (vectorized earlier as
# future_x_test) with the k-NN model.
y_pred = clf_knn.predict(future_x_test)
y_pred

array(['Laptop', 'Laptop'], dtype=object)

In [645]:
# Multinomial naive Bayes on the TF-IDF features.
# The 'naive_baynes' key spelling is kept as-is: the summary cell looks the
# result up under this exact name.
start = timeit.default_timer()

clf_nb = MultinomialNB()
clf_nb.fit(tfidf_train_vectors, y_train)
y_nb_pred = clf_nb.predict(tfidf_test_vectors)
# Stop the clock before scoring so the timing covers fit + predict only,
# the same span that is measured for the random forest.
times['naive_baynes'] = timeit.default_timer() - start
scores['naive_baynes'] = accuracy_score(y_test, y_nb_pred)


In [646]:
# Classify the two hand-written product names (vectorized earlier as
# future_x_test) with the naive Bayes model.
y_pred = clf_nb.predict(future_x_test)
y_pred

array(['Laptop', 'Laptop'], dtype='<U14')

In [647]:
# Linear support vector classifier on the TF-IDF features.
start = timeit.default_timer()

# With dual=True the solver shuffles the data using a pseudo-random generator;
# pinning random_state (same seed as the train/test split) makes the fitted
# model repeatable across runs.
clf_svc = LinearSVC(dual=True, random_state=0)
clf_svc.fit(tfidf_train_vectors, y_train)
y_svc_pred = clf_svc.predict(tfidf_test_vectors)
# Stop the clock before scoring so the timing covers fit + predict only,
# the same span that is measured for the random forest.
times['support_vector_machines'] = timeit.default_timer() - start
scores['support_vector_machines'] = accuracy_score(y_test, y_svc_pred)

In [648]:
# Classify the two hand-written product names (vectorized earlier as
# future_x_test) with the linear SVC model.
y_pred = clf_svc.predict(future_x_test)
y_pred

array(['Smartphone', 'Tablet'], dtype=object)

In [649]:
# Logistic regression (library defaults) on the TF-IDF features.
start = timeit.default_timer()

clf_logreg = LogisticRegression()
clf_logreg.fit(tfidf_train_vectors, y_train)
y_logreg_pred = clf_logreg.predict(tfidf_test_vectors)
# Stop the clock before scoring so the timing covers fit + predict only,
# the same span that is measured for the random forest.
times['logistic_regression'] = timeit.default_timer() - start
scores['logistic_regression'] = accuracy_score(y_test, y_logreg_pred)

In [650]:
# Classify the two hand-written product names (vectorized earlier as
# future_x_test) with the logistic regression model.
y_pred = clf_logreg.predict(future_x_test)
y_pred

array(['Smartphone', 'Laptop'], dtype=object)

In [651]:
# Summary: test accuracy and elapsed time for every model, listed in the
# order the models were trained above.
algorithms = [
    'random_forest',
    'k_nearest_neighbors',
    'naive_baynes',
    'support_vector_machines',
    'logistic_regression',
]

# Iterate over the names themselves rather than range(5), so the loop stays
# correct if a model is added to or removed from the list.
for name in algorithms:
    print(f"{name}: {scores[name]:.2%} ({times[name]:.2}s)")

random_forest: 91.87% (0.24s)
k_nearest_neighbors: 85.77% (0.024s)
naive_baynes: 85.37% (0.0069s)
support_vector_machines: 96.75% (0.022s)
logistic_regression: 91.46% (0.14s)
