Import packages and dataset

In [1]:
import pandas as pd

# Show every column when a frame is printed (no column truncation).
pd.set_option('display.max_columns', None)

# Read the labeled tweets from the CSV file in the working directory.
data = pd.read_csv('labeled_data.csv')

# Preview the first rows of the loaded frame.
preview = data.head()
print(preview)

   class                                              tweet
0      2  !!! RT @mayasolovely: As a woman you shouldn't...
1      1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2      1  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3      1  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4      1  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


Clean Text

In [None]:
import re
import string

def clean_text(text):
    """Normalize a raw tweet into lowercase text without markup or punctuation.

    The value is converted with ``str()`` first, so non-string input (e.g.
    ``None``) is cleaned as its string form. Steps run in this order:
    lowercase, drop ``[...]`` spans, drop URLs, drop ``<...>`` tags, drop
    ASCII punctuation, drop newlines, drop any word containing a digit.

    Whitespace left behind by removed pieces is NOT collapsed, and newlines
    are removed rather than replaced by a space, so words separated only by
    a newline end up joined. The copy of this function used at prediction
    time behaves the same way; keep the two in sync.

    Parameters
    ----------
    text : object
        Raw text of one tweet.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string literals keep the regex escapes (\[, \S, \w, \d) from being
    # treated as invalid string escapes, which newer Python versions warn about.
    text = str(text).lower()

    # Remove text inside square brackets (shortest match per pair)
    text = re.sub(r'\[.*?\]', '', text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>+', '', text)

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Remove newline characters
    text = re.sub(r'\n', '', text)

    # Remove words containing digits
    text = re.sub(r'\w*\d\w*', '', text)

    return text

# Add a 'clean_text' column holding the normalized form of every tweet
data['clean_text'] = data['tweet'].apply(clean_text)

# Display only the first few cleaned rows rather than printing the whole frame
print(data[['clean_text']].head())

n-grams

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Download the tokenizer data used by word_tokenize (necessary for first-time use).
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt', so request both;
# with the default settings an unknown resource id is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize a text and group the tokens into n-grams
def tokenize_text_ngrams(text, n=1):  # change n to desired grams
    """Return the n-grams of `text` as a list of space-joined strings.

    `text` is split with NLTK's word_tokenize; every run of `n` consecutive
    tokens becomes one string in the result (with the default n=1 that is
    simply the list of tokens).
    """
    words = word_tokenize(text)
    joined_grams = []
    for gram in ngrams(words, n):
        joined_grams.append(' '.join(gram))
    return joined_grams

# Tokenize every cleaned tweet (default n=1) and store the result in 'tokens'
token_column = data['clean_text'].apply(tokenize_text_ngrams)
data['tokens'] = token_column

# Preview the first few rows of the tokenized column
print(data[['tokens']].head())

Filtering stop words

In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK stop word lists
nltk.download('stopwords')

# English stop words extended with 'rt' and a few frequent filler tokens
# (lowercase and without apostrophes, matching the cleaned text)
stop_words = set(stopwords.words('english'))
stop_words.update(['rt', 'im', 'like', 'dont', 'got', 'get', 'u', 'aint'])

# Remove stop words from a token list
def filter_stopwords(tokens):
    """Return the items of `tokens` that are not in the module-level `stop_words`, in order."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Drop stop words from every token list and keep the result in 'filtered_tokens'
filtered_column = data['tokens'].apply(filter_stopwords)
data['filtered_tokens'] = filtered_column

# Show the first few rows of the filtered tokens
data[['filtered_tokens']].head()

Visualize

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Collect every filtered token from every tweet into one flat list, then count them
all_words = []
for tokens in data['filtered_tokens']:
    all_words.extend(tokens)
word_freq = Counter(all_words)

# The ten most frequent tokens, as (word, count) pairs
most_common_words = word_freq.most_common(10)

# Bar chart of the top 10 words
words, counts = zip(*most_common_words)
fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
ax_bar.bar(words, counts, color='blue')
ax_bar.set_title('Top 10 Most Common Words')
ax_bar.set_xlabel('Words')
ax_bar.set_ylabel('Frequency')
plt.show()

# Word cloud sized by the same frequency table
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
fig_cloud, ax_cloud = plt.subplots(figsize=(10, 6))
ax_cloud.imshow(wordcloud, interpolation='bilinear')
ax_cloud.axis('off')
ax_cloud.set_title('Word Cloud of Most Common Words')
plt.show()

Lemmatize

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download the WordNet data and the 'omw-1.4' package before lemmatizing
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

# One shared lemmatizer instance, used by lemmatize_tokens
lemmatizer = WordNetLemmatizer()

# Lemmatize a list of tokens
def lemmatize_tokens(tokens):
    """Return a list with each token passed through the shared `lemmatizer` (default settings)."""
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize the stop-word-filtered tokens; 'text' holds one token list per tweet
data['text'] = data['filtered_tokens'].apply(lemmatize_tokens)

# Rename the target column from 'class' to 'label'
data.rename(columns={'class': 'label'}, inplace=True)

# Select only the 'label' and 'text' columns. Take an explicit copy so that
# later writes to `df` are independent of `data` and do not trigger
# SettingWithCopyWarning.
df = data[['label', 'text']].copy()

# Save the DataFrame to a CSV file (each 'text' cell is a Python list, so it is
# written as the list's string representation)
df.to_csv('df.csv', index=False)

# Display the resulting DataFrame
print(df)

LIWC Incel Violent Extremism

In [2]:
import subprocess

# NOTE(review): these are machine-specific absolute paths; the notebook only
# runs unchanged on the original author's machine.
inputFileCSV = r"C:\Users\Gigabyte\OneDrive\Documents\Jobs\NTU\DETER\Models\Hate Speech\df.csv"
outputLocation = r"C:\Users\Gigabyte\OneDrive\Documents\Jobs\NTU\DETER\Models\Hate Speech\LIWC\Output\Incel_Violent_Extremism_Output.csv"

liwcDict = r"C:\Users\Gigabyte\OneDrive\Documents\Jobs\NTU\DETER\Models\Hate Speech\LIWC\Dictionary\incel-violent-extremism-dictionary.dicx"

# Word-count ("wc") analysis of the input CSV with the custom dictionary.
# NOTE(review): presumably index 1 is the 'label' column (used as row id) and
# index 2 the 'text' column of df.csv -- confirm against the LIWC-22 CLI docs.
cmd_to_execute = ["LIWC-22-cli",
                  "--mode", "wc",
                  "--input", inputFileCSV,
                  "--dictionary", liwcDict,
                  "--row-id-indices", "1",
                  "--column-indices", "2",
                  "--output", outputLocation]

# Run the analysis and stop if it fails, so that a stale output file from an
# earlier run is never read as if it were the fresh result.
liwc_return_code = subprocess.call(cmd_to_execute)
if liwc_return_code != 0:
    raise RuntimeError(f"LIWC-22-cli exited with status {liwc_return_code}")

# Load the LIWC result from the location it was just written to
Dict1 = pd.read_csv(outputLocation)

# Show only the 'IVED' column
print(Dict1[['IVED']])

        IVED
0       0.00
1      11.11
2      11.11
3      25.00
4      10.00
...      ...
24769   0.00
24770   0.00
24771   0.00
24772  20.00
24773   0.00

[24774 rows x 1 columns]


Compiling

In [None]:
# Join a list of n-grams back into one string
def undo_ngrams(ngrams_list):
    """Join the items of `ngrams_list` into one space-separated string.

    Each item is converted with ``str()`` before joining. A value that is
    already a string is returned unchanged: iterating it would otherwise
    split it into single characters (e.g. when the cell that applies this
    function is run a second time on already-joined text).

    Parameters
    ----------
    ngrams_list : iterable or str
        The tokens / n-grams of one text, or an already-joined string.

    Returns
    -------
    str
        The space-joined text ('' for an empty iterable).
    """
    if isinstance(ngrams_list, str):
        return ngrams_list
    return ' '.join(str(gram) for gram in ngrams_list)

import warnings

# Turn each token list in 'text' into one space-separated string. A new frame
# is built instead of writing into `df` (which was sliced from `data`), and
# values that are already strings are left alone so re-running this cell does
# not split them into characters.
df = df.assign(
    text=df['text'].apply(lambda item: item if isinstance(item, str) else undo_ngrams(item))
)

# pd.concat(axis=1) pairs rows by index label. If LIWC returned a different
# number of rows than df, IVED scores can end up next to the wrong tweets and
# the fillna below would hide the gap, so make the mismatch visible.
if len(Dict1) != len(df):
    warnings.warn(
        f"LIWC output has {len(Dict1)} rows but df has {len(df)} rows; "
        "IVED values may be misaligned with the texts.",
        RuntimeWarning,
    )

# Concatenate DataFrames column-wise: text, IVED score, label
final_df = pd.concat([df[['text']], Dict1[['IVED']], df[['label']]], axis=1)

# Replace NaN in 'text' with '' and in 'IVED' with 0
final_df = final_df.fillna({'text': '', 'IVED': 0})

# To work on a 10% sample instead of the full data: final_df.sample(frac=0.1)

# Display the concatenated DataFrame
print(final_df)

# Class counts. value_counts() orders by frequency, not by label, so index the
# result by the label values explicitly to make each name match its class.
label_counts = final_df['label'].value_counts().reindex([0, 1, 2], fill_value=0)
count_class_0, count_class_1, count_class_2 = label_counts

# display label count
count_class_0, count_class_1, count_class_2

Classification with TF-IDF & 
Ensembling: Random Forest, Gradient boosting machines, Neural Network

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import BaggingClassifier, VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
import joblib

# Features: preprocessed text plus the LIWC 'IVED' score; target: class label
X = final_df[['text', 'IVED']]  # double brackets keep a 2D DataFrame for the ColumnTransformer
y = final_df['label']

# Hold out the test set BEFORE resampling so it keeps the original class
# distribution. Undersampling the whole dataset first would make the reported
# metrics describe an artificially balanced population.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training portion only
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Define TF-IDF vectorizer over unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Column transformer to apply TF-IDF only to the 'text' column
preprocessor = ColumnTransformer(
    transformers=[
        ('text', tfidf_vectorizer, 'text'),
        ('other', 'passthrough', ['IVED'])  # Keep the IVED score as-is
    ]
)

# Define base models for ensemble
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

# Define Voting Classifier (hard voting: majority of the predicted labels)
voting_classifier = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('gb', gb_model),
    ('mlp', mlp_model)
], voting='hard')

# Define the pipeline with preprocessor and Voting Classifier
pipeline_voting = Pipeline([
    ('preprocessor', preprocessor),
    ('voting', voting_classifier)
])

# Optional: evaluate the model using RepeatedStratifiedKFold (disabled)
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
#cross_val_scores = cross_val_score(pipeline_voting, X_train, y_train, cv=cv, scoring='accuracy')
#print(f"Cross-validated accuracy: {cross_val_scores.mean():.4f} (+/- {cross_val_scores.std():.4f})")

# Fit the pipeline on training data
pipeline_voting.fit(X_train, y_train)

# Save the fitted pipeline (preprocessing + ensemble) to disk
filename = 'finalized_model.sav'
joblib.dump(pipeline_voting, filename)

# Predictions using Voting Classifier
y_pred_voting = pipeline_voting.predict(X_test)

# Evaluate Voting Classifier on the held-out test set
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Voting Model Accuracy: {accuracy_voting:.4f}")

print("\nVoting Model Classification Report:")
print(classification_report(y_test, y_pred_voting, zero_division=0))

print("\nVoting Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_voting))

Prediction

In [None]:
import pandas as pd
import joblib
import subprocess
import json
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Load the saved model
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load model files that come from a trusted source.
filename = 'finalized_model.sav'
pipeline_voting = joblib.load(filename)

# Function to clean and preprocess new text data
def preprocess_new_data(new_texts, liwc_output):
    """Preprocess new texts and pair them with their LIWC scores.

    Each text is cleaned, tokenized, stripped of stop words, lemmatized and
    joined back into one space-separated string.

    Parameters
    ----------
    new_texts : iterable
        Raw input texts; each item is converted with ``str()`` before cleaning.
    liwc_output : list
        Values for the 'IVED' column, one per text.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns 'text' (preprocessed string) and 'IVED'.
    """
    # Cleaning function
    def clean_text(text):
        # Raw string literals keep the regex escapes from being read as
        # (invalid) string escapes, which newer Python versions warn about.
        text = str(text).lower()
        text = re.sub(r'\[.*?\]', '', text)               # text inside square brackets
        text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
        text = re.sub(r'<.*?>+', '', text)                # HTML tags
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
        text = re.sub(r'\n', '', text)                    # newlines
        text = re.sub(r'\w*\d\w*', '', text)              # words containing digits
        return text

    # Apply cleaning
    cleaned_texts = [clean_text(text) for text in new_texts]

    # Tokenize text. Newer NLTK releases look up 'punkt_tab' instead of
    # 'punkt', so request both; an unknown resource id is reported, not raised.
    nltk.download('punkt')
    nltk.download('punkt_tab')
    tokenized_texts = [word_tokenize(text) for text in cleaned_texts]

    # Filter stopwords (English list plus the same extra tokens used in training)
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['rt', 'im', 'like', 'dont', 'got', 'get', 'u', 'aint'])
    filtered_texts = [[word for word in tokens if word not in stop_words] for tokens in tokenized_texts]

    # Lemmatize tokens
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    lemmatizer = WordNetLemmatizer()
    lemmatized_texts = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in filtered_texts]

    # Join each token list back into a single space-separated string
    def undo_ngrams(ngrams_list):
        return ' '.join([str(gram) for gram in ngrams_list])

    preprocessed_texts = [undo_ngrams(tokens) for tokens in lemmatized_texts]

    # Create DataFrame with 'text' and 'IVED' columns
    new_data = pd.DataFrame({'text': preprocessed_texts, 'IVED': liwc_output})

    return new_data

# Text to classify (hard-coded example; replace with the text of interest)
input_text = 'how are you?'

# The preprocessing function expects a list of texts
new_texts = [input_text]

# LIWC dictionary location
# NOTE(review): machine-specific absolute path.
liwc_dict = r"C:\Users\Gigabyte\OneDrive\Documents\Jobs\NTU\DETER\Models\Hate Speech\LIWC\Dictionary\incel-violent-extremism-dictionary.dicx"

# Command to execute LIWC analysis on the console text
cmd_to_execute = ["LIWC-22-cli",
                  "--mode", "wc",
                  "--input", "console",
                  "--dictionary", liwc_dict,
                  "--console-text", input_text,
                  "--output", "console"]

# Run the LIWC analysis and parse the result. The command is already an
# argument list, so it is run without a shell: with shell=True only the first
# list item is the command on POSIX, and the input text would be exposed to
# shell interpretation.
results = subprocess.check_output(cmd_to_execute).strip().splitlines()
# The second output line is parsed as the JSON result
# (presumably the first line is not JSON -- confirm against the CLI output).
results_json = json.loads(results[1])
liwc_output = [results_json['IVED']]

# Preprocess the new data
# NOTE(review): LIWC scores the raw input_text here, while the training-time
# IVED scores came from the preprocessed df.csv -- confirm they are comparable.
new_data = preprocess_new_data(new_texts, liwc_output)

# Predict using the loaded model
predictions = pipeline_voting.predict(new_data)

# Map the predicted class id to a readable name
labels = {0: "Hate Speech", 1: "Offensive Language", 2: "Neither"}
predicted_label = labels[predictions[0]]

print("Predictions:", predicted_label)
print(liwc_output)

TPOT

In [None]:
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

# Example DataFrame (use your actual final_df)
# final_df = pd.read_csv('your_data.csv')  # Assuming final_df is read from a CSV or created previously

# Split into input and output elements.
# TPOT needs numeric features and a numeric/categorical target: passing the
# raw strings (and the object-dtype label column taken from final_df.values)
# cannot be fitted. The text is therefore turned into a sparse TF-IDF matrix
# and the labels are cast to integers. Separate names are used so the `data`
# DataFrame from the earlier cells is not overwritten.
tpot_vectorizer = TfidfVectorizer()
X = tpot_vectorizer.fit_transform(final_df['text'])
y = final_df['label'].astype(int).to_numpy()

# Define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Define TPOT sparse classifier: the 'TPOT sparse' configuration restricts the
# search to operators that accept sparse input such as the TF-IDF matrix
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1, config_dict='TPOT sparse')

# perform the search
model.fit(X, y)

# export the best model
model.export('tpot_best_model.py')