In [1]:
import time
import os
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dense, GRU, Embedding, LSTM,Conv1D
from tensorflow.keras.optimizers import Adam,Adagrad,Nadam
from tensorflow.keras.layers import Dropout,GlobalMaxPooling1D,Conv1D,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
import gc
import matplotlib.pyplot as plt
import random
import numpy as np
from keras.models import Sequential
from sklearn.utils.class_weight import compute_class_weight
from itertools import product
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

from metrics import *

In [2]:
def process_pair(pair, th):
        if pair[0] > pair[1]:
            if pair[0] >= th:
                pair[0], pair[1] = 1, 0
            elif pair[0] < th:
                pair[0], pair[1] = 0, 0
        elif pair[0] < pair[1]:
            if pair[1] >= th:
                pair[0], pair[1] = 0, 1
            elif pair[1] < th:
                pair[0], pair[1] = 0, 0
        return pair
def threshold(pair, th):
    for i in range(len(pair)):
        # Processing pairs
        for j in range(0, len(pair[i])-1, 2):
            pair[i][j], pair[i][j+1] = process_pair([pair[i][j], pair[i][j+1]], th)
        
        # If the number of members is odd, process the last member
        if len(pair[i]) % 2 != 0:
            last_elem = pair[i][-1]
            if last_elem >= th:
                pair[i][-1] = 1
            else:
                pair[i][-1] = 0

    return pair


In [3]:
data=pd.read_excel('clean_dataset.xlsx')
df_comment = pd.DataFrame(data['clean_text'])
df_target=data.drop(['Brand', 'Name', 'Author_name', 'Date', 'Offer', 'Store_name', 'Model',
       'Comment','clean_text'],axis=1)
# shuffel
X_train, X_test, y_train, y_test = train_test_split(df_comment, df_target, test_size=0.3, random_state=42,shuffle=True)

In [4]:
X_train

Unnamed: 0,clean_text
714,لباسشویی پاکشوما چندسال دارم عالی بود بخاطر ظر...
834,دوستان. محصول دیگه رسید خونه محصول صدمه دیده ب...
29,ظرف‌های کثیف خیلی خوب تمیز میکنه تمیزی عالیه ی...
745,خرید خوبی بود طراحی رنگش زیاد جالب نبود
165,تکنسین نسب سر ساعت اندازی محل حاضر بودن
...,...
1044,جنس بدنه خیلی ضعیقه تقریبا نازک کار گزاشته شده...
1095,۶ماه دیگه خونه میخام بازش الانم باگارانتیش تما...
1130,اشعار کلیشهای مثب منفی بخونید.
860,کیفیت ساخت تحسین قبولی دارد.


In [5]:
X_train_list = X_train['clean_text'].values.tolist()
X_test_list = X_test['clean_text'].values.tolist()
y_test_list=y_test.values
y_train_list=y_train.values

In [6]:
#Count the number of unique words
nltk.download('punkt_tab')
all_words=' '.join(X_train_list)
all_words=word_tokenize(all_words)
dist=nltk.FreqDist(all_words)
num_unique_words=len(dist)
print(num_unique_words)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\BOY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
3004


In [7]:
#Count the langest text length
sentences_len=[]
for text in X_train_list:
    words_list=word_tokenize(text)
    l=len(words_list)
    sentences_len.append(l)   
max_review_len=np.max(sentences_len)
print(max_review_len)

890


In [8]:
# Create Tokenizer to convert texts to numeric tokens
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(X_train_list)
x_train_tokens = tokenizer.texts_to_sequences(X_train_list)
x_test_tokens = tokenizer.texts_to_sequences(X_test_list)
# Padding as long as the max_review_len
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_review_len,padding='post')
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_review_len,padding='post')
# dict token -> index
idx = tokenizer.word_index
# dict index -> token
inverse_map = dict(zip(idx.values(), idx.keys()))
# def tokens_to_string(tokens):
#     words = [inverse_map[token] for token in tokens if token!=0]
#     text = ' '.join(words)
#     return text

In [9]:
# y_train_list = np.array(y_train_list, dtype=np.float32)
# x_train_pad = np.array(x_train_pad, dtype=np.float32)

In [None]:
y_train_list

In [11]:
#Hyperparameter optimization
#Grid search

# setting
best_accuracy = 0
target_count = len(df_target.columns)
results=[]
data_dir = 'weights'
embedding_size = 300
search_log = []

learning_rates = [1e-2, 1e-3, 1e-4]
dropouts = [0.3, 0.4, 0.5]
filters_list = [64, 128, 256]
param_grid = list(product(learning_rates, dropouts, filters_list))

for trial, (lr, dropout_val, filters) in enumerate(param_grid):
    print(f"\n🔁 Grid Search Trial {trial+1}/{len(param_grid)}")
    print(f"🧪 Trying with lr={lr}, dropout={dropout_val}, filters={filters}")
    #Split the data into training and testing (here we use the full data for training)
    x_train_fold, x_val_fold, y_train_fold, y_val_fold = train_test_split(x_train_pad, y_train_list, test_size=0.2, random_state=42)
    
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train_fold), y=y_train_fold.flatten())
    class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
    
    model = tf.keras.Sequential([
        Embedding(input_dim=num_unique_words, output_dim=300, name='embedding_layer'),
        Dropout(dropout_val),
        Conv1D(filters, kernel_size=3, padding='same', activation='relu', strides=1),
        BatchNormalization(),
        GlobalMaxPooling1D(),
        Dense(256, activation='relu'),
        Dropout(dropout_val),
        Dense(target_count, activation='sigmoid')
    ])

    optimizer = Nadam(learning_rate=lr)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=[accuracy_multi_tf])

        # Callbacks
    early_stopping = EarlyStopping(
            monitor='val_accuracy_multi_tf',
            mode='max',
            patience=3,
            restore_best_weights=True,
            verbose=1
        )

    # Traning model
    start_time = time.time()
    history = model.fit(
        x_train_fold, y_train_fold,
        validation_data=(x_val_fold, y_val_fold),  
        epochs=20,
        batch_size=64,
        shuffle=True,
        verbose=1,
        class_weight=class_weight_dict,
        callbacks=[early_stopping]
    )


    # Predicts
    predictions = model.predict(x_test_pad)
    threshold(predictions, 0.5) 
    
    # Evaluate model
    metrics = evaluate_model(y_test_list, predictions)
    results.append(metrics)

    search_log.append({
        'trial': trial + 1,
        'learning_rate': lr,
        'dropout': dropout_val,
        'filters': filters,
        'accuracy': metrics['accuracy']
    })

    print(f" Accuracy: {metrics['accuracy']:.4f}")


    elapsed_time = time.time() - start_time
    print(f"Time: {time.strftime('%H:%M:%S', time.gmtime(elapsed_time))}")

# تبدیل نتایج به دیتافریم
df = pd.DataFrame(search_log)

# نمایش جدول بهترین نتایج
best_results = (
    df.groupby('trial')
      .apply(lambda x: x.loc[x['accuracy'].idxmax()])
      .reset_index(drop=True)
      .sort_values(by='accuracy', ascending=False)
)
print("\n بهترین نتایج:")
print(best_results)




🔁 Grid Search Trial 1/27
🧪 Trying with lr=0.01, dropout=0.3, filters=64
Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 303ms/step - accuracy_multi_tf: 0.0664 - loss: 0.6196 - val_accuracy_multi_tf: 0.0000e+00 - val_loss: 0.5225
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 287ms/step - accuracy_multi_tf: 0.1977 - loss: 0.2495 - val_accuracy_multi_tf: 0.0568 - val_loss: 0.4653
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 294ms/step - accuracy_multi_tf: 0.4886 - loss: 0.1917 - val_accuracy_multi_tf: 0.0820 - val_loss: 0.3678
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 295ms/step - accuracy_multi_tf: 0.6036 - loss: 0.1674 - val_accuracy_multi_tf: 0.2703 - val_loss: 0.3133
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 284ms/step - accuracy_multi_tf: 0.7176 - loss: 0.1136 - val_accuracy_multi_tf: 0.2885 - val_loss: 0.3048
Epoch 6/20
[1m11/11[0

  .apply(lambda x: x.loc[x['accuracy'].idxmax()])


In [None]:
# 11   12.0         0.0010      0.3    256.0  0.643156

In [12]:
def build_model(num_unique_words, embedding_size, target_count):
    model = Sequential([
        Embedding(input_dim=num_unique_words,
                  output_dim=embedding_size,
                  name='embedding_layer'),
        Dropout(0.3),
        Conv1D(256, kernel_size=3, padding='same', activation='relu', strides=1),
        BatchNormalization(),
        GlobalMaxPooling1D(),
        Dense(200, activation='relu'),
        Dropout(0.3),
        Dense(target_count, activation='sigmoid')  # Multi-label classification
    ])
    optimizer = Nadam(learning_rate=1e-3)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=[accuracy_multi_tf])  
    return model

In [14]:

data_dir = 'weights'
kfold = KFold(5, shuffle=True, random_state=42)
results = []
target_count = len(df_target.columns)
embedding_size = 300
best_accuracy=0

for i in range(5):
    print(f"Run {i+1}...")
    for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(x_train_pad, y_train_list)):
        start_time = time.time()
        print(f"Run fold {fold_idx + 1}...")

        # تقسیم داده‌ها به دسته‌های آموزش و اعتبارسنجی
        x_train_fold, y_train_fold = x_train_pad[train_idx], y_train_list[train_idx]
        x_val_fold, y_val_fold = x_train_pad[val_idx], y_train_list[val_idx]

        # محاسبه وزن‌های کلاس‌ها
        # برای محاسبه وزن‌ها باید از labels در y_train استفاده کنیم
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train_fold), y=y_train_fold.flatten())
        class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

        # ساخت مدل
        model = build_model(num_unique_words, embedding_size, target_count)

        # Callbacks
        early_stopping = EarlyStopping(
            monitor='val_accuracy_multi_tf',
            mode='max',
            patience=3,
            restore_best_weights=True,
            verbose=1
        )

        checkpoint_path = os.path.join(data_dir, "output_CNN", f"best_model_run{i}_fold{fold_idx}.keras")
        model_checkpoint = ModelCheckpoint(
            filepath=checkpoint_path,
            monitor='val_accuracy_multi_tf',  
            save_best_only=True,
            save_weights_only=False,
            verbose=1
        )

        # آموزش مدل با استفاده از وزن‌های کلاس
        history = model.fit(
            x_train_fold, y_train_fold,
            epochs=20,
            batch_size=64,
            shuffle=True,
            verbose=1,
            validation_data=(x_val_fold, y_val_fold),
            class_weight=class_weight_dict,  
            callbacks=[early_stopping, model_checkpoint]
        )

        # پیش‌بینی‌ها
        predictions = model.predict(x_test_pad)
        threshold(predictions, 0.5)  # اعمال آستانه به پیش‌بینی‌ها

        # ارزیابی مدل با استفاده از تابع دقت 
        metrics = evaluate_model(y_test_list, predictions)
        results.append(metrics)
        print(metrics)
        if metrics['accuracy']>best_accuracy:
            best_accuracy=metrics['accuracy']
        # گزارش زمان
        elapsed_time = time.time() - start_time
        print(f"Run fold {fold_idx + 1} completed in {time.strftime('%H:%M:%S', time.gmtime(elapsed_time))}")
print('Best accurassy is: ', best_accuracy)

Run 1...
Run fold 1...
Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy_multi_tf: 0.0606 - loss: 0.8073 
Epoch 1: val_accuracy_multi_tf improved from -inf to 0.00000, saving model to weights\output_CNN\best_model_run0_fold0.keras
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy_multi_tf: 0.0595 - loss: 0.7890 - val_accuracy_multi_tf: 0.0000e+00 - val_loss: 0.6475
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy_multi_tf: 0.0602 - loss: 0.3289 
Epoch 2: val_accuracy_multi_tf did not improve from 0.00000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step - accuracy_multi_tf: 0.0612 - loss: 0.3283 - val_accuracy_multi_tf: 0.0000e+00 - val_loss: 0.6359
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy_multi_tf: 0.0598 - loss: 0.2821 
Epoch 3: val_accuracy_multi_tf improved from 0.00000 to 0.12461, savi

In [None]:
from tensorflow.keras.models import load_model
model = load_model("weights/output/best_model_run0_fold3.keras")
preds = model.predict(x_new_data)
