In [42]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout

In [43]:
# Load the raw comments dataset into `df` and preview its first rows.
# NOTE(review): the path below is absolute and user-specific, so it will not
# resolve on another machine -- consider a configurable data directory.
df = pd.read_csv('C:/Users/Shakil/Downloads/Compressed/market_comments.csv')
df.head()

Unnamed: 0,item_category,item_id,brand,user_id,date,comment,rating,tonality
0,401,4010201,826,2217,2013-06-28,"2,5 года работала и все...устала! Лампочка гор...",2.0,negative
1,403,4030101,1425,1026,2010-07-04,Через 2 месяца после истечении гарантийного ср...,2.0,negative
2,401,4010401,124,2769,2010-05-27,пользуюсь уже три недели. нареканий ни каких н...,4.0,positive
3,203,2030301,93,508,2016-10-11,Ребят этот системный блок подойдёт для игры кс...,5.0,positive
4,205,2050201,656,1049,2010-02-26,"я считаю, что яри замечательный телефон! Прият...",5.0,positive


In [44]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14281 entries, 0 to 14280
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   item_category  14281 non-null  int64  
 1   item_id        14281 non-null  int64  
 2   brand          14281 non-null  int64  
 3   user_id        14281 non-null  int64  
 4   date           14281 non-null  object 
 5   comment        14281 non-null  object 
 6   rating         14281 non-null  float64
 7   tonality       14281 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 892.7+ KB


In [45]:
# Count missing values per column.
df.isnull().sum()

item_category    0
item_id          0
brand            0
user_id          0
date             0
comment          0
rating           0
tonality         0
dtype: int64

In [53]:
# Preprocess the data
def preprocess_data(df):
    """Keep the text and label columns, drop incomplete rows, encode the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``comment`` and ``tonality`` columns.
        The caller's frame is left untouched.

    Returns
    -------
    tuple
        ``(prepared, le)``. ``prepared`` is a new frame with only ``comment``
        and ``tonality``, where ``tonality`` holds the integer codes returned
        by ``LabelEncoder.fit_transform``. ``le`` is the fitted encoder, which
        is needed to map the codes back to the original labels.
    """
    # Work on an explicit copy: writing a column into a filtered selection of
    # another frame triggers pandas' SettingWithCopyWarning.
    prepared = df.dropna(subset=['comment', 'tonality'])[['comment', 'tonality']].copy()

    le = LabelEncoder()
    # One integer code per distinct label. The result is a 0/1 target only if
    # the column contains exactly two distinct values.
    prepared['tonality'] = le.fit_transform(prepared['tonality'])
    return prepared, le

In [54]:
# split data into training and test
def split_data(df, test_size=0.2, random_state=42, stratify=False):
    """Split the prepared frame into train and test portions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment`` column (features) and a ``tonality`` column
        (target).
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to ``0.2``.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split is repeatable.
        Defaults to ``42``.
    stratify : bool, optional
        When true, the target column is passed as ``stratify`` to
        ``train_test_split``. Defaults to ``False``, i.e. no stratification.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``.
    """
    x = df['comment']
    y = df['tonality']
    xtrain, xtest, ytrain, ytest = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    return xtrain, xtest, ytrain, ytest

In [55]:
# tokenize and pad the sequences
def prepare_text_data(xtrain, xtest, max_words=10000, max_len=100):
    """Turn raw texts into fixed-length integer sequences.

    A ``Tokenizer`` limited to ``max_words`` is fitted on ``xtrain`` only and
    then applied to both ``xtrain`` and ``xtest``. Every resulting sequence is
    passed through ``pad_sequences`` with ``maxlen=max_len`` and
    ``padding='post'``.

    Returns ``(xtrain_pad, xtest_pad, tokenizer)``.
    """
    tokenizer = Tokenizer(num_words=max_words)
    # The vocabulary is learned from the training texts alone.
    tokenizer.fit_on_texts(xtrain)

    def _encode(texts):
        # texts -> integer sequences -> sequences of length max_len
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_len, padding='post')

    return _encode(xtrain), _encode(xtest), tokenizer

In [49]:
# build neural network model
def build_model(input_shape, vocab_size=10000, embedding_dim=128,
                lstm_units=64, dropout_rate=0.5):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense classifier.

    Parameters
    ----------
    input_shape : tuple
        Not used by this function; kept so existing callers keep working.
    vocab_size : int, optional
        ``input_dim`` of the embedding layer. Defaults to ``10000``, the same
        value as the default ``max_words`` of ``prepare_text_data``; keep the
        two in sync when changing either.
    embedding_dim : int, optional
        ``output_dim`` of the embedding layer. Defaults to ``128``.
    lstm_units : int, optional
        Number of units in the LSTM layer. Defaults to ``64``.
    dropout_rate : float, optional
        Rate of the dropout layer. Defaults to ``0.5``.

    Returns
    -------
    The compiled ``Sequential`` model: a single sigmoid output unit, the
    ``adam`` optimizer, ``binary_crossentropy`` loss and the ``accuracy``
    metric.
    """
    model = Sequential()
    # Embedding to transform word indices into dense vectors.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    # LSTM layer for sequence processing.
    model.add(LSTM(lstm_units))
    # Dropout for regularization.
    model.add(Dropout(dropout_rate))
    # Single sigmoid unit for the binary decision.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [50]:
# train and evaluate the model
def train_and_evaluate(xtrain, ytrain, xtest, ytest, model, epochs=6, batch_size=16):
    """Fit ``model`` on the training data, then score it on the test data.

    Parameters
    ----------
    xtrain, ytrain
        Training inputs and targets, passed to ``model.fit``.
    xtest, ytest
        Held-out inputs and targets. They are used twice: as
        ``validation_data`` during fitting and for the final
        ``model.evaluate`` call.
    model
        Object exposing ``fit`` and ``evaluate``; ``evaluate`` must return a
        ``(loss, accuracy)`` pair.
    epochs : int, optional
        Number of training epochs. Defaults to ``6``.
    batch_size : int, optional
        Mini-batch size. Defaults to ``16``.

    Returns
    -------
    tuple
        ``(loss, accuracy)`` as returned by ``model.evaluate``. The two values
        are also printed.
    """
    model.fit(
        xtrain,
        ytrain,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(xtest, ytest),
    )
    loss, accuracy = model.evaluate(xtest, ytest)
    print('Test Loss:', loss)
    print('Test Accuracy:', accuracy)
    # Hand the metrics back so callers can use them, not just read them.
    return loss, accuracy

In [57]:
# run the entire pipeline
def main(data=None):
    """Run the whole pipeline: preprocess, split, vectorise, build, train, evaluate.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Raw frame to run on. When omitted, the notebook-level ``df`` is used,
        which is what a zero-argument call has always done.

    Returns
    -------
    tuple
        ``(model, tokenizer, label_encoder)`` -- the trained model plus the
        fitted tokenizer and label encoder, which would otherwise be lost when
        the function returns.
    """
    source = df if data is None else data

    # Preprocess the data
    processed_data, label_encoder = preprocess_data(source)
    xtrain, xtest, ytrain, ytest = split_data(processed_data)

    # Prepare the text data
    xtrain_pad, xtest_pad, tokenizer = prepare_text_data(xtrain, xtest)

    # Build the model
    model = build_model(xtrain_pad.shape)

    # Train and evaluate the model
    train_and_evaluate(xtrain_pad, ytrain, xtest_pad, ytest, model)

    return model, tokenizer, label_encoder

In [58]:
# Run the pipeline only when this code executes as the top-level module
# (which is the case for a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()



Epoch 1/6
[1m714/714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 52ms/step - accuracy: 0.8429 - loss: 0.4559 - val_accuracy: 0.8327 - val_loss: 0.4507
Epoch 2/6
[1m714/714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 52ms/step - accuracy: 0.8368 - loss: 0.4482 - val_accuracy: 0.8327 - val_loss: 0.4500
Epoch 3/6
[1m714/714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 51ms/step - accuracy: 0.8465 - loss: 0.4280 - val_accuracy: 0.8309 - val_loss: 0.4556
Epoch 4/6
[1m714/714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 53ms/step - accuracy: 0.8473 - loss: 0.4200 - val_accuracy: 0.8327 - val_loss: 0.3284
Epoch 5/6
[1m714/714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 55ms/step - accuracy: 0.8820 - loss: 0.2661 - val_accuracy: 0.9027 - val_loss: 0.2453
Epoch 6/6
[1m714/714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 53ms/step - accuracy: 0.9504 - loss: 0.1402 - val_accuracy: 0.9034 - val_loss: 0.2711
[1m90/90[0m [32m━━━