# 1. Preprocessing
## Import data

In [1]:
! pip install scikeras
! pip install pandas
! pip install nltk
! pip install scikit-learn




In [2]:
import pandas as pd
df = pd.read_csv('training.300000.processed.noemoticon.csv', encoding='ISO-8859-1')

## Text clean

In [3]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/liu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/liu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/liu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))


def clean_text(text):
    # transfer to lower characters
    text = text.lower()
    
    # eliminate url link
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # eliminate special character
    text = re.sub(r'@\w+|#\w+', '', text)
    
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # eliminate stop word
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
   
    return ' '.join(tokens)

# apply clean function to data frame
df['cleaned_text'] = df['text'].apply(clean_text)

df[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope ok
1,"@misstoriblack cool , i have no tweet apps fo...",cool tweet apps razr
2,@TiannaChaos i know just family drama. its la...,know family drama lamehey next time u hang kim...
3,School email won't open and I have geography ...,school email wont open geography stuff revise ...
4,upper airways problem,upper airways problem


## Tokenization, Padding

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Define tokenizer
tokenizer = Tokenizer(num_words=10000)


tokenizer.fit_on_texts(df['cleaned_text'])

# Transfer to sequences
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])


# Padding sequence to a constant length
X = pad_sequences(sequences, maxlen=50)

# Transfer original labels to 0-1 label
y = df['sentiment'].values  
y = (y == 4).astype(int) 


2024-10-09 18:09:43.277613: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-09 18:09:43.416572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 18:09:43.468175: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 18:09:43.483072: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 18:09:43.575053: I tensorflow/core/platform/cpu_feature_guar

## Divide dataset into two parts, trainset and testset

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Deep learning

## CNN
Model implement

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Embedding, Dropout, Flatten, Input

# from previous block, which maximum length of sequence is 50
sequence_length = 50

# define cnn
model_1 = Sequential()

# add input layer
model_1.add(Input(shape=(sequence_length,)))

# add embedding layer, map word id to a vector
model_1.add(Embedding(input_dim=10000, output_dim=50))

# convolutional layer
model_1.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# pool layer
model_1.add(MaxPooling1D(pool_size=2))

# fc1
model_1.add(Flatten())

# Dropout layer
model_1.add(Dropout(0.5))

# fc2
model_1.add(Dense(64, activation='relu'))

# output layer
model_1.add(Dense(1, activation='sigmoid'))

model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# show model structure
model_1.summary()

model_1.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)
loss, accuracy = model_1.evaluate(X_test, y_test)


Epoch 1/10


I0000 00:00:1728457979.132687    2721 service.cc:146] XLA service 0x7fcf24017c90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728457979.132731    2721 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4080 Laptop GPU, Compute Capability 8.9
2024-10-09 18:12:59.153495: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m 128/1500[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 1ms/step - accuracy: 0.5253 - loss: 0.6876

I0000 00:00:1728457980.316615    2721 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6987 - loss: 0.5535 - val_accuracy: 0.7718 - val_loss: 0.4754
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7953 - loss: 0.4424 - val_accuracy: 0.7725 - val_loss: 0.4722
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8169 - loss: 0.4021 - val_accuracy: 0.7686 - val_loss: 0.4841
Epoch 4/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8405 - loss: 0.3574 - val_accuracy: 0.7641 - val_loss: 0.5242
Epoch 5/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8611 - loss: 0.3157 - val_accuracy: 0.7582 - val_loss: 0.5741
Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8754 - loss: 0.2825 - val_accuracy: 0.7532 - val_loss: 0.6518
Epoch 7/10
[1m1500/1500[0

Fine tune

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Embedding, Dropout, Flatten, Input
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
import json


# Define cnn model structure
def create_model(optimizer='adam', filters=64, kernel_size=5, dropout_rate=0.5):
    sequence_length = 50  
    model = Sequential()
    model.add(Input(shape=(sequence_length,)))
    model.add(Embedding(input_dim=10000, output_dim=50))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model
# Wrape Keras model for sklearn
model_1_f = KerasClassifier(build_fn=create_model, epochs=10, batch_size=128, verbose=0)

# Search list
param_dist = {
    'model__optimizer': ['adam', 'rmsprop'],
    'model__filters': [32, 64, 128],
    'model__kernel_size': [3, 5],
    'model__dropout_rate': [0.2, 0.3],
    'epochs': [5, 10, 15],
    'batch_size': [32, 64]
}

random_search = RandomizedSearchCV(estimator=model_1_f, param_distributions=param_dist, n_iter=10, cv=3)
random_search.fit(X_train,y_train)
# save best paramaters combination
best_params = random_search.best_params_
with open('best_paramscnn.json', 'w') as f:
    json.dump(best_params, f)
#    
model_1_best=random_search.best_estimator_.model_
model_1_best.save("bestcnn.h5")


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


In [21]:
import json
from sklearn.metrics import accuracy_score

# 获取最佳参数
best_params = random_search.best_params_

# 将最佳参数保存为 JSON 文件
with open('best_paramscnn.json', 'w') as f:
    json.dump(best_params, f)

# 打印最佳参数
print(f"Best Parameters: {best_params}")

# 使用最佳模型对测试集进行评估
model_1_best = random_search.best_estimator_

# 评估模型并打印结果（例如使用准确率评估）
accuracy = model_1_best.score(X_test,y_test)
model_1_best=random_search.best_estimator_.model_
loss, accuracy= model_1_best.evaluate(X_test, y_test)
print(loss,accuracy)
model_1_best.save("bestcnn.h5")


Best Parameters: {'model__optimizer': 'rmsprop', 'model__kernel_size': 3, 'model__filters': 32, 'model__dropout_rate': 0.2, 'epochs': 15, 'batch_size': 32}
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7760 - loss: 0.4878




0.49118101596832275 0.777233362197876


## LSTM
model implement

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Input
from tensorflow.keras.optimizers import Adam
def create_lstm_model(optimizer='adam',output_dim=50 ,units=64,dropout_rate=0.5):
    model=Sequential()
    sequence_length=50
    model.add(Input(shape=(sequence_length,)))
    model.add(Embedding(input_dim=10000, output_dim=output_dim))
    model.add(LSTM(units=units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

model_2=create_lstm_model()
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_2.summary()
model_2.fit(X_train,y_train,epochs=10, batch_size=128, validation_split=0.2)
loss, accuracy=model_2.evaluate(X_test,y_test)
    
    

Epoch 1/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 52ms/step - accuracy: 0.7174 - loss: 0.5403 - val_accuracy: 0.7644 - val_loss: 0.4933
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 31ms/step - accuracy: 0.7878 - loss: 0.4478 - val_accuracy: 0.7733 - val_loss: 0.4723
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.8032 - loss: 0.4191 - val_accuracy: 0.7722 - val_loss: 0.4820
Epoch 4/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.8135 - loss: 0.3995 - val_accuracy: 0.7704 - val_loss: 0.4929
Epoch 5/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 38ms/step - accuracy: 0.8254 - loss: 0.3769 - val_accuracy: 0.7651 - val_loss: 0.5211
Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 49ms/step - accuracy: 0.8402 - loss: 0.3506 - val_accuracy: 0.7641 - val_loss: 0.5627
Epoc

Fine Tune

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Input
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.optimizers import Adam

def create_lstm_model(optimizer='adam',output_dim=50 ,units=64,dropout_rate=0.5):
    model=Sequential()
    sequence_length=50
    model.add(Input(shape=(sequence_length,)))
    model.add(Embedding(input_dim=10000, output_dim=output_dim))
    model.add(LSTM(units=units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Wrape Keras model for sklearn
model_2_f = KerasClassifier(build_fn=create_lstm_model, epochs=10, batch_size=128, verbose=0)
# Search list
param_dist = {
    'model__optimizer': ['adam', 'rmsprop'],
    'model__output_dim': [50, 100, 150,200],
    'model__units': [64, 128],
    'model__dropout_rate': [0.2, 0.3,0.5],
    'epochs': [5, 10, 15],
    'batch_size': [32, 64]
}
random_search = RandomizedSearchCV(estimator=model_2_f, param_distributions=param_dist, n_iter=10, cv=3)
random_search.fit(X_train,y_train)

# save best paramaters combination
best_params = random_search.best_params_
with open('best_paramslstm.json', 'w') as f:
    json.dump(best_params, f)
#    
model_2_best=random_search.best_estimator_.model_
model_2_best.save("bestlstm.h5")

  X, y = self._initialize(X, y)


KeyboardInterrupt: 

## Transformer

In [17]:
! pip install transformers
! pip install tf-keras



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting tf-keras
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow<2.18,>=2.17 (from tf-keras)
  Downloading tensorflow-2.17.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow<2.18,>=2.17->tf-keras)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting tensorboard<2.18,>=2.17 (from tensorflow<2.18,>=2.17->tf-keras)
  Downloading tensorboard-2.17.1-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tensorflow-2.17.0-cp310-cp310-macos

In [26]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from transformers import TFBertModel, BertConfig

# 设置序列长度
sequence_length = 50

# 定义模型配置
config = BertConfig(
    vocab_size=30522,         # 词汇表大小
    hidden_size=768,          # 隐藏层维度
    num_hidden_layers=6,      # Transformer 编码层的数量
    num_attention_heads=12,   # 自注意力头的数量
    intermediate_size=2048,   # 前馈网络的大小
    hidden_dropout_prob=0.1,  # Dropout 的概率
    max_position_embeddings=50  # 最大序列长度
)

# 使用配置创建一个 Bert 模型
transformer_model = TFBertModel(config)

# 定义输入层，数据类型为 int32
input_ids = Input(shape=(sequence_length,), dtype=tf.int32, name="input_ids")

# 使用 Lambda 层将 input_ids 从 KerasTensor 转换为标准 TensorFlow 张量，并指定 output_shape
input_ids_tf = Lambda(lambda x: tf.convert_to_tensor(x, dtype=tf.int32), output_shape=(sequence_length,))(input_ids)

# 将输入传递给 Transformer 模型
bert_output = transformer_model(input_ids_tf)[0]

# 只取序列的第一个位置的输出进行分类（通常是 [CLS] token）
cls_token_output = bert_output[:, 0, :]

# 添加一个全连接层用于分类
output = Dense(1, activation="sigmoid")(cls_token_output)

# 定义完整的模型
model = Model(inputs=input_ids, outputs=output)

# 编译模型
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss='binary_crossentropy', metrics=['accuracy'])

# 打印模型结构
model.summary()

# 训练模型的示例（假设你已经有X和y）
# history = model.fit(X, y, validation_split=0.2, epochs=3, batch_size=32)

# 评估模型（假设你有X_test和y_test）
# loss, accuracy = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {accuracy}")


ValueError: Exception encountered when calling layer 'tf_bert_model_5' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for input_ids.

Call arguments received by layer 'tf_bert_model_5' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 50), dtype=float32, sparse=False, name=keras_tensor_156>
  • attention_mask=None
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False