In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import librosa
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the tab-separated movie_lines.tsv file into a pandas DataFrame.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour: rows with an unexpected
# number of fields are skipped and reported instead of aborting the load.
# NOTE(review): read_csv treats the first row as a header by default. The
# column names are overwritten right after loading, so if the file has no
# header row its first dialogue line is silently lost - confirm, and pass
# header=None if that is the case.
data = pd.read_csv('movie_lines.tsv', sep='\t', encoding='ISO-8859-1', on_bad_lines='warn')



  data = pd.read_csv('movie_lines.tsv', sep='\t', encoding='ISO-8859-1', error_bad_lines=False)
Skipping line 32288: expected 5 fields, saw 7
Skipping line 32351: expected 5 fields, saw 6
Skipping line 32390: expected 5 fields, saw 6
Skipping line 32583: expected 5 fields, saw 6
Skipping line 32585: expected 5 fields, saw 6
Skipping line 35684: expected 5 fields, saw 6
Skipping line 62132: expected 5 fields, saw 6
Skipping line 86637: expected 5 fields, saw 6
Skipping line 86722: expected 5 fields, saw 6
Skipping line 86914: expected 5 fields, saw 6
Skipping line 86960: expected 5 fields, saw 6
Skipping line 87010: expected 5 fields, saw 6
Skipping line 87011: expected 5 fields, saw 6
Skipping line 87086: expected 5 fields, saw 6
Skipping line 120607: expected 5 fields, saw 6
Skipping line 120719: expected 5 fields, saw 7
Skipping line 120739: expected 5 fields, saw 6
Skipping line 120783: expected 5 fields, saw 6
Skipping line 130284: expected 5 fields, saw 7
Skipping line 131048: e

In [3]:
# Give the five loaded columns descriptive names; 'text' holds the dialogue
# line that the later cells label and tokenize.
data.columns = [
    'lineID',
    'characterID',
    'movieID',
    'character',
    'text',
]

In [4]:
# Discard, in place, every row (axis=0) that has a missing value in any
# column (how='any'), so later string operations on 'text' never see NaN.
data.dropna(axis=0, how='any', inplace=True)

In [5]:
# Class names in index order; each name's position in the list becomes its
# numeric id ('not hate' -> 0, 'hate' -> 1).
labels = ['not hate', 'hate']
label_dict = {name: position for position, name in enumerate(labels)}

In [6]:
# Derive a numeric label for every row from its own text.
# NOTE(review): the label is a keyword heuristic computed from the very text
# the model is later trained on, so the classifier can only learn to spot the
# substring "hate" - it is not ground-truth hate-speech annotation.
def _keyword_label(text):
    """Return the id of 'hate' if `text` contains "hate" (any case), else the id of 'not hate'."""
    if 'hate' in text.lower():
        return label_dict['hate']
    return label_dict['not hate']


data['label'] = data['text'].apply(_keyword_label)

In [7]:
# Vocabulary size used by the tokenizer/embedding, and the fixed length every
# sequence is padded or truncated to.
max_words, max_len = 10000, 200

# Positional 80/20 split: the first 80% of rows train the model and the
# trailing 20% are held out for testing. Rows are not shuffled.
train_size = int(len(data) * 0.8)
train_df, test_df = data.iloc[:train_size], data.iloc[train_size:]

In [8]:
# Build the vocabulary from the training texts only, then encode both splits
# as lists of integer word ids using that same vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['text'])
train_sequences, test_sequences = [
    tokenizer.texts_to_sequences(split['text']) for split in (train_df, test_df)
]

In [9]:
# Bring every sequence to exactly max_len ids. The defaults are spelled out:
# shorter sequences are zero-padded at the front, longer ones lose their
# leading tokens.
train_data = pad_sequences(train_sequences, maxlen=max_len, padding='pre', truncating='pre')
test_data = pad_sequences(test_sequences, maxlen=max_len, padding='pre', truncating='pre')

In [10]:
# Assemble the binary text classifier layer by layer.
model = tf.keras.models.Sequential()
# Word ids (vocabulary of max_words) -> 128-dimensional embeddings.
model.add(tf.keras.layers.Embedding(max_words, 128, input_length=max_len))
# 32 convolution filters spanning 5 consecutive tokens each.
model.add(tf.keras.layers.Conv1D(32, 5, activation='relu'))
# Shrink the time axis by a factor of 4 before the recurrent layer.
model.add(tf.keras.layers.MaxPooling1D(pool_size=4))
# Summarise the pooled sequence with a 128-unit LSTM.
model.add(tf.keras.layers.LSTM(128))
# Single sigmoid unit: one score in [0, 1] per input sequence.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [11]:
# Configure training for a two-class problem with a sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# One pass over the training split. The held-out split doubles as the
# validation data reported during fitting.
model.fit(
    x=train_data,
    y=train_df['label'],
    epochs=1,
    validation_data=(test_data, test_df['label']),
)

# Score the fitted model on the held-out split.
loss, accuracy = model.evaluate(x=test_data, y=test_df['label'])
print('Test accuracy:', accuracy)

Test accuracy: 0.9999306797981262


In [12]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='speech_censorship_model.h5')

In [13]:
# Conventional X/y aliases for the padded sequences and their labels.
X_train, y_train = train_data, train_df['label']
X_test, y_test = test_data, test_df['label']

In [14]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix


In [15]:
# Score every held-out sequence with the trained model.
y_pred = model.predict(x=X_test)



In [16]:
# Binarise at the decision threshold. y_test already holds 0/1 labels, so this
# only normalises it to a NumPy array; the values that actually change are
# the model's sigmoid scores in y_pred.
threshold = 0.5
y_test_binary = (np.asarray(y_test) > threshold).astype(int)
# predict() returns one column per row; flatten to 1-D so the result has the
# same shape as y_test_binary. The comparison is vectorised rather than
# looping in Python over length-1 arrays.
y_pred_binary = (np.asarray(y_pred).ravel() > threshold).astype(int)

In [17]:
# Headline scores for the positive ("hate") class.
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)

# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
# y_test_binary and y_pred_binary; without it ravel() would yield a single
# value and the four-way unpacking would raise.
tn, fp, fn, tp = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1]).ravel()

# Rates are undefined (NaN) when the test split has no samples of the
# relevant class; guard the denominators instead of dividing by zero.
actual_positives = tp + fn
actual_negatives = fp + tn
tpr = tp / actual_positives if actual_positives else float('nan')
fpr = fp / actual_negatives if actual_negatives else float('nan')
fnr = fn / actual_positives if actual_positives else float('nan')

In [18]:
# Report each evaluation metric on its own line.
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("F1-score:", f1),
    ("True positive rate (TPR):", tpr),
    ("False positive rate (FPR):", fpr),
    ("False negative rate (FNR):", fnr),
):
    print(caption, value)

Accuracy: 0.9999307047328667
Precision: 1.0
F1-score: 0.9945652173913043
True positive rate (TPR): 0.9891891891891892
False positive rate (FPR): 0.0
False negative rate (FNR): 0.010810810810810811


In [20]:
sentence = "I HATE YOU"

# Encode the sentence with the fitted tokenizer and pad it to the length the
# model was trained on.
sentence_sequence = tokenizer.texts_to_sequences([sentence])
padded_sequence = pad_sequences(sentence_sequence, maxlen=max_len)

# The model returns one sigmoid score per input row; take the only one and
# compare it with the decision threshold.
predicted_label = model.predict(padded_sequence)[0][0]
print("Hate speech" if predicted_label > threshold else "Not hate speech")

Hate speech


In [22]:
sentence = "I can't stand spicy food."

# Encode the sentence with the fitted tokenizer and pad it to the length the
# model was trained on.
sentence_sequence = tokenizer.texts_to_sequences([sentence])
padded_sequence = pad_sequences(sentence_sequence, maxlen=max_len)

# The model returns one sigmoid score per input row; take the only one and
# compare it with the decision threshold.
predicted_label = model.predict(padded_sequence)[0][0]
print("Hate speech" if predicted_label > threshold else "Not hate speech")

Not hate speech


In [26]:
sentence = "I hate going to the dentist."

# Encode the sentence with the fitted tokenizer and pad it to the length the
# model was trained on.
sentence_sequence = tokenizer.texts_to_sequences([sentence])
padded_sequence = pad_sequences(sentence_sequence, maxlen=max_len)

# The model returns one sigmoid score per input row; take the only one and
# compare it with the decision threshold.
predicted_label = model.predict(padded_sequence)[0][0]
print("Hate speech" if predicted_label > threshold else "Not hate speech")

Hate speech
