In [3]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.1.24-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-2.5.0-py3-none-any.whl.metadata (6.1 kB)
Collecting grpcio<2.0,>=1.24.3 (from t

In [None]:
import re
from io import StringIO

import boto3
import chardet
import numpy as np
import pandas as pd
import sagemaker
import tensorflow as tf
from sagemaker.inputs import TrainingInput
from sagemaker.tensorflow import TensorFlow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout, SpatialDropout1D, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Define S3 bucket and file paths.
# Every dataset URI is derived from `s3_bucket`, so pointing the notebook at a
# different bucket only requires changing this one name.
s3_bucket = "basehatespeech"
s3_data_path = f"s3://{s3_bucket}/datasets1/re_dataset.csv"
s3_slang_path = f"s3://{s3_bucket}/datasets1/new_kamusalay.csv"
s3_abusive_path = f"s3://{s3_bucket}/datasets1/abusive.csv"

# Shared boto3 S3 client; the S3 downloads and uploads in this cell go through it
s3 = boto3.client(service_name='s3')

# Function to detect file encoding
def detect_encoding(bucket, key, sample_size=100000, default="utf-8"):
    """Guess the text encoding of an S3 object from its leading bytes.

    Args:
        bucket: Name of the S3 bucket holding the object.
        key: Key of the object inside ``bucket``.
        sample_size: How many leading bytes are handed to chardet
            (defaults to the first 100 KB).
        default: Encoding returned when chardet cannot identify one.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``default`` when
        chardet reports ``None`` (so callers can always pass the result
        straight to ``bytes.decode``).
    """
    obj = s3.get_object(Bucket=bucket, Key=key)
    body = obj['Body']
    try:
        raw_data = body.read(sample_size)
    finally:
        # Only a prefix is read, so close the stream explicitly instead of
        # leaving the rest of the response open.
        body.close()
    result = chardet.detect(raw_data)
    return result['encoding'] or default

# Function to load CSV from S3 with encoding detection
def load_s3_csv(bucket, key):
    """Download a CSV object from S3 and parse it into a DataFrame.

    The encoding is guessed with ``detect_encoding`` and printed; bytes that
    are invalid in that encoding are substituted instead of raising.
    """
    detected = detect_encoding(bucket, key)
    print(f"Detected encoding for {key}: {detected}")

    raw_bytes = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
    csv_text = raw_bytes.decode(detected, errors='replace')  # keep going on bad bytes
    return pd.read_csv(StringIO(csv_text))

# Load the three CSV resources: labelled tweets, slang dictionary, abusive-word list
df = load_s3_csv(bucket=s3_bucket, key="datasets1/re_dataset.csv")
kamusalay = load_s3_csv(bucket=s3_bucket, key="datasets1/new_kamusalay.csv")
abusive_words = set(
    load_s3_csv(bucket=s3_bucket, key="datasets1/abusive.csv")["ABUSIVE"].tolist()
)

# Preprocessing: lookup from the dictionary's first column (slang form) to its
# second column (standard form).
# NOTE(review): load_s3_csv calls pd.read_csv without a `header` argument, so
# the first CSV row is consumed as column names — confirm new_kamusalay.csv
# really has a header row, otherwise its first mapping is silently dropped.
kamus_dict = {
    slang: standard
    for slang, standard in zip(kamusalay.iloc[:, 0], kamusalay.iloc[:, 1])
}
def preprocess_text(text, mapping=None):
    """Lower-case a tweet and expand slang words to their standard form.

    Replacement is done per whole word in a single pass: each run of word
    characters is looked up in the slang dictionary and swapped only on an
    exact match. Unlike a substring ``str.replace`` over every dictionary
    entry, this cannot rewrite the inside of a longer word, cannot re-replace
    text produced by an earlier substitution, and costs one dictionary lookup
    per word instead of one scan of the text per dictionary entry.
    Dictionary keys that are not a single run of word characters (for example
    keys containing spaces) are never matched.

    Args:
        text: The raw tweet; any value is accepted and converted with ``str``.
        mapping: Optional slang -> standard dictionary. Defaults to the
            module-level ``kamus_dict``.

    Returns:
        The lower-cased text with known slang words replaced. Whitespace and
        punctuation are left exactly as they were.
    """
    if mapping is None:
        mapping = kamus_dict
    text = str(text).lower()

    def _expand(match):
        word = match.group(0)
        standard = mapping.get(word)
        # Skip non-string values (e.g. NaN from an empty CSV cell).
        return standard if isinstance(standard, str) else word

    return re.sub(r"\w+", _expand, text)

def _has_abusive_word(tweet):
    """Return 1 if any whitespace-separated token of `tweet` is in `abusive_words`, else 0."""
    return int(not abusive_words.isdisjoint(tweet.split()))


# Normalised tweet text, plus a 0/1 flag for the presence of an abusive word
df['cleaned_tweet'] = df['Tweet'].apply(preprocess_text)
df['contains_abusive'] = df['cleaned_tweet'].apply(_has_abusive_word)

# Tokenization
max_words = 10000
max_len = 100

# Pick the train/test rows *before* fitting the tokenizer so that the
# vocabulary is learned from training tweets only (fitting on every tweet
# leaks test-set vocabulary into the model's input encoding).
# The index array is split with the same test_size and random_state that were
# previously applied to the feature arrays.
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_tweet'].iloc[train_idx])

# Encode and pad every tweet; words unseen during fitting map to the OOV token.
sequences = tokenizer.texts_to_sequences(df['cleaned_tweet'])
X_text = pad_sequences(sequences, maxlen=max_len)
X_extra = np.array(df[['contains_abusive']])

# Encoding labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['HS'])

# Split data using the row indices chosen above
X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]
X_extra_train, X_extra_test = X_extra[train_idx], X_extra[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Define model: a two-input network. One branch reads the padded token ids,
# the other reads the single `contains_abusive` flag; both are merged before
# a sigmoid output trained with binary cross-entropy.
text_input = Input(shape=(max_len,), name="text_input")
extra_input = Input(shape=(1,), name="extra_input")

# Text branch: embedding -> spatial dropout -> 1D convolution -> LSTM.
# `input_length` is deliberately not passed to Embedding: the sequence length
# is already fixed by `text_input`, and the argument is deprecated in Keras 3
# (the Keras version bundled with the TensorFlow 2.18 installed above).
embedding = Embedding(input_dim=max_words, output_dim=128)(text_input)
dropout = SpatialDropout1D(0.2)(embedding)
conv = Conv1D(64, kernel_size=5, activation='relu', padding='same')(dropout)
lstm = LSTM(64, return_sequences=False)(conv)

# Auxiliary branch for the abusive-word flag
extra_dense = Dense(16, activation='relu')(extra_input)

# Classification head on the concatenated branches
merged = Concatenate()([lstm, extra_dense])
dense = Dense(64, activation='relu')(merged)
dropout_final = Dropout(0.5)(dense)
out = Dense(1, activation='sigmoid')(dropout_final)

model = Model(inputs=[text_input, extra_input], outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model; the held-out split is evaluated after every epoch
model.fit(
    x=[X_text_train, X_extra_train],
    y=y_train,
    validation_data=([X_text_test, X_extra_test], y_test),
    epochs=5,
    batch_size=32,
)

# Persist the trained model: write it to a local .h5 file, then copy that file to S3
model_s3_path = f"s3://{s3_bucket}/models/hate_speech_cnn_rnn_model.h5"

model.save("hate_speech_cnn_rnn_model.h5")
print("Model saved locally as hate_speech_cnn_rnn_model.h5")

s3.upload_file(
    Filename="hate_speech_cnn_rnn_model.h5",
    Bucket=s3_bucket,
    Key="models/hate_speech_cnn_rnn_model.h5",
)
print(f"Model uploaded to {model_s3_path}")

# Upload processed data to S3
train_data_path = f"s3://{s3_bucket}/train/"
test_data_path = f"s3://{s3_bucket}/test/"

# (local file, S3 key, array) for every split array, in upload order
_split_arrays = [
    ("X_text_train.npy", "train/X_text_train.npy", X_text_train),
    ("X_extra_train.npy", "train/X_extra_train.npy", X_extra_train),
    ("y_train.npy", "train/y_train.npy", y_train),
    ("X_text_test.npy", "test/X_text_test.npy", X_text_test),
    ("X_extra_test.npy", "test/X_extra_test.npy", X_extra_test),
    ("y_test.npy", "test/y_test.npy", y_test),
]

# Write each array to disk in the working directory, then push that file to the bucket
for _local_file, _s3_key, _array in _split_arrays:
    np.save(_local_file, _array)
    s3.upload_file(_local_file, s3_bucket, _s3_key)

# Define SageMaker training job: run train_script.py in a TensorFlow container
# on a single ml.m5.large instance, under this notebook's execution role.
role = sagemaker.get_execution_role()

_estimator_config = dict(
    entry_point='train_script.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='2.8',
    py_version='py39',
    # Passed to the entry-point script as hyperparameters
    hyperparameters={'epochs': 5, 'batch_size': 32},
    output_path=f's3://{s3_bucket}/output/',
)
estimator = TensorFlow(**_estimator_config)

# Start training on the uploaded train/ and test/ prefixes.
# `fit` is called with its default blocking behaviour, so execution only
# reaches the print below once the job has ended; the message says
# "completed" rather than "started" to reflect that.
estimator.fit({
    'train': TrainingInput(train_data_path),
    'test': TrainingInput(test_data_path)
})

print("SageMaker training job completed!")


Detected encoding for datasets1/re_dataset.csv: ISO-8859-1


Detected encoding for datasets1/new_kamusalay.csv: ISO-8859-1


Detected encoding for datasets1/abusive.csv: ascii


2025-02-07 16:41:03.473931: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Epoch 1/5
[1m 66/330[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m27s[0m 104ms/step - accuracy: 0.5794 - loss: 0.6767