# Read dataset

In [None]:
import os

import numpy as np
import pandas as pd
import re

In [None]:
# Folder that holds the dataset. It is created when missing so the parquet
# file has a place to live; exist_ok=True avoids the check-then-create race
# of a separate os.path.exists() test.
data_folder = "../data"
os.makedirs(data_folder, exist_ok=True)

# Full path of the parquet file read by the next cell.
path = os.path.join(data_folder, "all_data.parquet")

In [None]:
# Load the dataset and derive the label column "<category_id>_<category_name>".
read_data = pd.read_parquet(path).assign(
    category=lambda frame: frame['category_id'].astype(str) + '_' + frame['category_name']
)

In [None]:
read_data

# Label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
label_encoder.fit(read_data['category'])

In [None]:
label_encoder.classes_

In [None]:
read_data['target'] = label_encoder.transform(read_data['category'])

In [None]:
label_encoder.transform(['99_UPS - Nepertraukiamo maitinimo šaltiniai'])

# Split dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_data, test_data = train_test_split(read_data, test_size=0.15, random_state=42)

In [None]:
train_data

In [None]:
test_data

# Text cleaner

In [None]:
# Colour words (English and Lithuanian) that additional_cleaning() replaces
# with the 'SPALVA' placeholder.
#
# Entries must be lower-case ASCII: clean_text() lower-cases the text and
# strips Lithuanian diacritics before the colour pattern is applied, so an
# accented entry could never match. The original order is preserved; only
# repeated entries were removed.
colors = [
    'blue', 'melynas', 'melyna', 'melynos',
    'green', 'zalias', 'zalia', 'zalios',
    'orange', 'oranzinis', 'oranzine', 'oranzines',
    'white', 'baltas', 'balta', 'baltos',
    'black', 'juodas', 'juoda', 'juodos',
    'yellow', 'geltonas', 'geltona', 'geltonos',
    'red', 'raudonas', 'raudona', 'raudonos',
    'purple', 'violetinis', 'violetine', 'violetines',
    'pink', 'rozines', 'rozine', 'rozinis',
    'violet',
    'gray', 'pilkas', 'pilkos', 'pilkai', 'pilka',
    'sidabrinis', 'sidabrines', 'sidabro',
    'starlight',
    'twilight',
    'light',
    'cool',
    'grey',
    'burgundija',
    'titanas', 'titano',
    'brown', 'rudas', 'rudos', 'ruda',
    'navy',
    'clush',
    'rush',
    'amber', 'gintarinis', 'gintarines',
    'cobalt',
    'marble',
    'mystic',
    'phantom',
    'auksas', 'auksines', 'auksine', 'aukso', 'auksinis',
    'rausvos', 'rausvai', 'rausvo',
    'kremines', 'kreminis', 'kremine',
    'cream',
    'mint',
    'lime',
    'grafit',
    'beige',
    'grafitas',
    'tamsiai', 'tamsus',
    'sviesiai', 'sviesus',
    'bronzinis', 'bronzine', 'bronzos',
    'sidabrine',
    'icy blue',
    'rusvos', 'rusvai', 'rusva',
    'orchid',
    'lavender',
    'graphite',
    'midnight',
    'gold',
    'silver',
    'coral',
    'vidurnaktis',
]

In [None]:
# Compiled colour patterns, keyed by the tuple of words they were built from,
# so the alternation is assembled once instead of on every call.
_COLOR_PATTERNS = {}


def _color_pattern(words):
    """Return a compiled regex matching any of `words` as a whole word."""
    key = tuple(words)
    pattern = _COLOR_PATTERNS.get(key)
    if pattern is None:
        alternatives = '|'.join(map(re.escape, key))
        # \b on both sides: 'red' must not fire inside 'redmi', nor 'cool'
        # inside 'cooler'.
        pattern = re.compile(r'\b(?:' + alternatives + r')\b')
        _COLOR_PATTERNS[key] = pattern
    return pattern


def additional_cleaning(text, color_words=None):
    """Replace screen sizes and colour words with placeholder tokens.

    Parameters
    ----------
    text : str
        Text to clean; expected to be lower-cased already, since the colour
        words are lower-case.
    color_words : iterable of str, optional
        Words to replace with 'SPALVA'. Defaults to the module-level
        `colors` list.

    Returns
    -------
    str
        `text` with every `<number>"` replaced by 'ISTRIZAINE' and every
        whole-word colour replaced by 'SPALVA'.
    """
    # Replace inch sizes, including decimals, so that 27.2" becomes one
    # placeholder rather than '27.ISTRIZAINE'.
    text = re.sub(r'\d+(?:\.\d+)?"', 'ISTRIZAINE', text)

    words = colors if color_words is None else color_words
    if not words:
        # An empty alternation would match at every word boundary.
        return text

    return _color_pattern(words).sub('SPALVA', text)

In [None]:
# Characters stripped by clean_text(). Raw string, so the backslash is a
# literal member of the set instead of an invalid "\]" escape sequence.
# The double quote and the hyphen are not listed here.
PUNCTUATION = r"!#$%&'()*+,.:;<=>?@[\]^_`{|}~/"

# re.escape() keeps every listed character literal inside the class; without
# it the backslash would act as an escape for ']' and never be removed itself.
_PUNCTUATION_RE = re.compile(f'[{re.escape(PUNCTUATION)}]')

# Single-pass character mapping applied right after lower-casing:
#   * Lithuanian letters  -> ASCII base letter
#   * line breaks / tabs  -> space
#   * hyphen, trademark sign, comma, Lithuanian quotes -> removed
# The double quote is kept at this stage because additional_cleaning() uses
# it to recognise inch sizes.
_CHAR_TABLE = str.maketrans({
    'ą': 'a', 'č': 'c', 'ę': 'e', 'ė': 'e', 'į': 'i', 'š': 's', 'ų': 'u', 'ū': 'u', 'ž': 'z',
    '\n': ' ', '\r': ' ', '\t': ' ',
    '-': '', '™': '', ',': '', '„': '', '“': '',
})


def clean_text(text):
    """Normalise a product name before vectorisation.

    Steps, in order: lower-case; map Lithuanian letters to ASCII and drop or
    replace the characters listed in `_CHAR_TABLE`; remove the promotional
    phrases 'atgauk <n>%' and 'preke po grazinimo'; apply
    additional_cleaning(); strip the characters in `PUNCTUATION`; collapse
    runs of whitespace.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower().translate(_CHAR_TABLE)

    # Remove promotional phrases (matched after diacritics are stripped).
    text = re.sub(r'atgauk \d+%', '', text)
    text = re.sub(r'preke po grazinimo', '', text)

    # Placeholder substitution for inch sizes and colour words.
    text = additional_cleaning(text)

    text = _PUNCTUATION_RE.sub('', text)

    # Collapse multiple spaces and trim both ends.
    return ' '.join(text.split())

In [None]:
# Manual check of clean_text(). The variable is not called `str`: rebinding
# the builtin would break `astype(str)` in the dataset cell when it is re-run.
sample_text = 'white F2.8 27.2", 4 ms NeoGlass™'

sample_text = clean_text(sample_text)
sample_text

# Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Tokens dropped by the TF-IDF vectorizer. Stop words are compared with
# single tokens, so every entry is one word. The phrase 'preke po grazinimo'
# is not listed because clean_text() already removes it, and its words
# 'preke' and 'grazinimo' are covered individually below.
#
# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated ('uzsakomoji' 'ecost' -> 'uzsakomojiecost').
stop_words = [
    'ir',
    'nuolaida',
    'ispardavimas',
    'kaina',
    'akcija',
    'top',
    'preke',
    'grazinimo',
    'kita',
    'ekspozicine',
    'uzsakomoji',
    'ecost',
    'be',
    'without',
    'su',
    'spalvos',  # not sure about this one
    'spalva',  # not sure about this one
    'pazeista',
    'pakuote',
]

In [None]:
# Earlier configuration, kept for reference:
# vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.3, min_df=3, stop_words=stop_words, ngram_range=(1, 3), lowercase=True, strip_accents='ascii')

# Word-level TF-IDF over 1- to 3-grams. clean_text is passed as the
# preprocessor, so normalisation of the raw names happens there.
vectorizer = TfidfVectorizer(
    preprocessor=clean_text,
    analyzer='word',
    stop_words=stop_words,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.2,
    max_features=300000,
    dtype=np.float32,
)

In [None]:
vectorizer.fit(train_data['name'])

In [None]:
len(vectorizer.vocabulary_)

In [None]:
vectorizer.get_feature_names_out()[:10]

In [None]:
vectorizer.transform(['magnetinė šaškių ir šachmatų lenta qx#'])

In [None]:
vectorizer.vocabulary_.get('v')

# Price scaler

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(train_data[['price']])

# Prepare training and testing data

In [None]:
from scipy.sparse import hstack

In [None]:
vectorized_train_text = vectorizer.transform(train_data['name'])
scaled_train_price = scaler.transform(train_data[['price']])

In [None]:
# Only the TF-IDF text features are stacked here; scaled_train_price is not
# part of this matrix. X_train is reassigned further down from
# ct.fit_transform(train_data).
X_train = hstack([vectorized_train_text])

In [None]:
vectorized_test_text = vectorizer.transform(test_data['name'])
scaled_test_price = scaler.transform(test_data[['price']])

In [None]:
# Only the TF-IDF text features are stacked here; scaled_test_price is not
# part of this matrix. X_test is reassigned further down from
# ct.transform(test_data).
X_test = hstack([vectorized_test_text])

In [None]:
y_train = label_encoder.transform(train_data['category'])
y_test = label_encoder.transform(test_data['category'])

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Feature pipeline: TF-IDF over the "name" column and a min-max scaled
# "price" column, concatenated in that order.
name_tfidf = TfidfVectorizer(
    preprocessor=clean_text,
    analyzer='word',
    stop_words=stop_words,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.2,
    max_features=300000,
)
price_scaler = MinMaxScaler()

ct = ColumnTransformer(transformers=[
    ("name_preprocess", name_tfidf, "name"),
    ("price_preprocess", price_scaler, ["price"]),
])

In [None]:
X_train = ct.fit_transform(train_data)

In [None]:
X_test = ct.transform(test_data)

# Hide warnings

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

In [None]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Train

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score

In [None]:
max_iter = 5
log_losses = []
scores = []
validation_scores = []

In [None]:
lr = LogisticRegression(max_iter=1, solver='saga', warm_start=True)

In [None]:
# Train one fit() call at a time and record metrics after each call, so the
# progress can be plotted later.
for epoch in range(1, max_iter + 1):
    lr.fit(X_train, y_train)

    # Metrics on the training data: log loss and accuracy.
    y_prob = lr.predict_proba(X_train)
    loss = log_loss(y_train, y_prob)
    acc = lr.score(X_train, y_train)

    # Accuracy on the held-out split.
    validation_acc = lr.score(X_test, y_test)

    log_losses.append(loss)
    scores.append(acc)
    validation_scores.append(validation_acc)

    # Progress line for the current call.
    print(f"Iteration {epoch}/{max_iter}: Log Loss = {loss:.4f}, Accuracy = {acc:.4f}, Validation accuracy = {validation_acc:.4f}")

# My own test

In [None]:
# One hand-written product, run through the fitted ColumnTransformer and the
# logistic regression model.
my_test_data = pd.DataFrame([
    {"name": 'Samsung Galaxy S25 Rugged Cover Black', "price": 3189},
])

X_custom_test = ct.transform(my_test_data)

# Predicted label plus the highest class probability for the sample.
class_probabilities = lr.predict_proba(X_custom_test)
predicted_classes = lr.predict(X_custom_test)
max_probability = class_probabilities.max()

# Map the encoded label back to its "<id>_<name>" category string.
predicted_class_name = label_encoder.inverse_transform(predicted_classes)

print("Predicted class (name):", predicted_class_name[0])
print("Maximum probability (percent):", f"{max_probability * 100:.2f}%")

# Another train

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
clf = RidgeClassifier(max_iter=20, tol=1e-2, solver="sparse_cg")

In [None]:
clf.fit(X_train, y_train)

In [None]:
test_accuracy = clf.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Hand-written sample for the RidgeClassifier: one product name and a price.
my_test_data = pd.DataFrame([
    {
        "name": "Hewlett Packard (HP) also HP W9172MC Geltona Managed LJ Toneris",
        "price": 229900009,
    },
])

# Same feature pipeline as the training data.
X_custom_test = ct.transform(my_test_data)

# Encoded prediction, then decoded back to the category string.
predicted_classes = clf.predict(X_custom_test)
predicted_class_names = label_encoder.inverse_transform(predicted_classes)

print("Predicted class (name):", predicted_class_names)


# Tuning hyper-parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Dedicated names for the search. `lr` and `clf` are left untouched because
# later cells (custom test, classification report, coefficient listing) read
# the fitted `lr`, and `clf` holds the fitted RidgeClassifier.
grid_lr = LogisticRegression(max_iter=1, solver='saga')

# One sub-grid per penalty family so that only valid combinations are tried:
#   * elasticnet needs an l1_ratio (0.5 is a starting point, tune as needed);
#   * without a penalty C has no effect, so it is not crossed with C.
# NOTE(review): scikit-learn 1.2 deprecated the string 'none' in favour of
# None and 1.4 removed it - adjust the last entry to the installed version.
param_grid = [
    {'penalty': ['l1', 'l2'], 'C': [1, 10, 100]},
    {'penalty': ['elasticnet'], 'C': [1, 10, 100], 'l1_ratio': [0.5]},
    {'penalty': ['none']},
]

grid_search = GridSearchCV(grid_lr, param_grid)
grid_search.fit(X_train, y_train)


# My own test

In [None]:
texts = [
    'Kabelis TV antenai RF, 2.5m, kištukas - lizdas, baltas EMOS'
]

# X_train is produced by `ct` (name + price), so the samples go through the
# same transformer; the stand-alone `vectorizer` yields text-only features
# with a different column count.
# TODO: the price below is a placeholder - replace it with the real price.
texts_data = pd.DataFrame({
    "name": texts,
    "price": [5.0] * len(texts),
})

# Not named `input`, to keep the builtin usable.
X_texts = ct.transform(texts_data)

predicted_class = lr.predict(X_texts)
class_probabilities = lr.predict_proba(X_texts)
max_probability = class_probabilities.max()

predicted_class_name = label_encoder.inverse_transform(predicted_class)

print("Predicted class (name):", predicted_class_name[0])
print("Maximum probability (percent):", f"{max_probability * 100:.2f}%")

# Plot training result

In [None]:
# One figure per metric recorded during training.
for series, title in (
    (log_losses, 'Log Loss'),
    (scores, 'Accuracy'),
    (validation_scores, 'Validation accuracy'),
):
    plt.plot(series)
    plt.title(title)
    plt.show()

# Classification report

In [None]:
from sklearn.metrics import classification_report

In [None]:
pred = lr.predict(X_test)

In [None]:
classes_present = np.unique(np.concatenate([y_test, pred]))

In [None]:
reduced_target_names = [label_encoder.classes_[i] for i in classes_present]

In [None]:
classification_report_report = classification_report(y_test, pred, target_names=reduced_target_names, zero_division=0)

In [None]:
print(classification_report_report)

In [None]:
f1 = f1_score(y_test, pred, average='weighted')  # Options: 'micro', 'macro', 'weighted', 'binary'

print(f"F1 Score (weighted): {f1:.4f}")

# Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
confusion_matrix_report = confusion_matrix(y_test, pred)

In [None]:
# Confusion matrix restricted to two hand-picked categories.
class1_name = '99_UPS - Nepertraukiamo maitinimo šaltiniai'
class2_name = '995_Kia deflektoriai'

# Encoded labels of the two categories.
class1_index, class2_index = label_encoder.transform([class1_name, class2_name])
labels = [class1_index, class2_index]

# Keep only the test rows whose true label is one of the two categories.
filter_mask = np.isin(y_test, labels)
filtered_y_test = y_test[filter_mask]
filtered_pred = pred[filter_mask]

# Rows and columns follow `labels`; predictions outside the pair are ignored.
cm = confusion_matrix(filtered_y_test, filtered_pred, labels=labels)

display_labels = [class1_name, class2_name]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot()

# Show the 20 best features for a particular class

In [None]:
def show_coefficients_for(class_name, top_n=20):
    """Print the features with the largest coefficients for one class.

    Parameters
    ----------
    class_name : str
        Category label as known to `label_encoder`
        (for example '1249_Blenderiai').
    top_n : int, default 20
        Number of features to print, highest coefficient first.

    Notes
    -----
    Reads the module-level `label_encoder`, `lr` and `ct`. Assumes `lr` was
    fitted on the output of `ct` and on more than two classes, so `lr.coef_`
    has one row per entry of `lr.classes_`.
    """
    print(f"Coefficients for class '{class_name}':")
    print('----------------------------')
    print('')
    class_index = label_encoder.transform([class_name])[0]

    # Rows of lr.coef_ follow lr.classes_ (the labels seen during fit), which
    # can be shorter than label_encoder.classes_ when a category has no rows
    # in the training split; look the row up instead of indexing directly.
    rows = np.flatnonzero(lr.classes_ == class_index)
    if rows.size == 0:
        print('(class not present in the training data)')
        return
    class_coefficients = lr.coef_[rows[0]]

    # Feature names come from the transformer that built the model input, so
    # they line up with the coefficient columns (text features and price).
    feature_names = ct.get_feature_names_out()

    # Indices of the top_n largest coefficients, without sorting a full dict.
    top_positions = np.argsort(class_coefficients)[::-1][:top_n]
    for position in top_positions:
        print(f"{feature_names[position]}: {class_coefficients[position]:.4f}")


In [None]:
show_coefficients_for('1249_Blenderiai')