In [104]:
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import gensim.downloader as api
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

nlp = spacy.load("en_core_web_sm")

In [105]:
data = pd.read_csv("YouTube_Datasets/preprocessed_youtube.csv")
data

Unnamed: 0,comment_text,Cleaned_Content,Preprocessed_content,Label
0,Elders react to Gaga Five Foot Two,elders react to gaga five foot two,elder react gaga foot,0
1,"Hey Anna, could you do longer cameo in Ant-Man...",hey anna could you do longer cameo in antman s...,hey anna long cameo antman sequel need act,0
2,Is it me or does that railing not seem very hi...,is it me or does that railing not seem very hi...,rail high,0
3,I think she might be a scammer who's trying to...,i think she might be a scammer whos trying to ...,think scammer s try money lawsuit,0
4,I like how he gives everyone the ol' Heil Hitler.,i like how he gives everyone the ol heil hitler,like give ol heil hitler,0
...,...,...,...,...
878,"This video was awful, Iâm sorry Katy. Your p...",this video was awful im sorry katy your past v...,video awful m sorry katy past video sicken wtf,0
879,The house caught fire from their fire musicð...,the house caught fire from their fire music,house catch fire fire music,0
880,your lips make me want to eat powdered donuts ...,your lips make me want to eat powdered donuts,lip want eat powdered donut,0
881,Yrs later andy in jail 4 abusing daughter.,yrs later andy in jail abusing daughter,yrs later andy jail abuse daughter,0


In [106]:
len(data)

883

In [90]:
hate_speech_words = [
    'hate', 'violence', 'abuse', 'offend', 'discrimination', 'racism', 'oppression', 'bigot', 'ignorant', 
    'supremacist', 'feminazi', 'cunt', 'slut', 'whore', 'bitch', 'skank', 'thot', 'ho', 'gold digger', 
    'motherfucker', 'cockroach', 'asshole', 'dickhead', 'prick', 'scumbag', 'bastard', 'fuck', 'nigga', 
    'nigger', 'chink', 'gook', 'spic', 'wetback', 'cracker', 'honky', 'kike', 'wog', 'towelhead', 
    'sandnigger', 'faggot', 'fag', 'dyke', 'tranny', 'homo', 'sissy', 'fairy', 'pansy', 'sodomite', 
    'infidel', 'heathen', 'crusader', 'raghead', 'jihadist', 'taliban', 'beaner', 'gypsy', 'paki', 
    'wop', 'dago', 'mick', 'jap', 'yid', 'kaffir', 'cholo', 'zebra', 'white trash', 'redneck', 'hillbilly',
    'karen', 'snowflake', 'beta', 'incel', 'simp', 'boomer', 'npc', 'soyboy', 'mgtow', 'swine', 'trumpist', 
    'libtard', 'nazi', 'commie', 'sjw', 'ratchet', 'ghetto', 'redpill', 'based', 'chad', 'beta male', 
    'alpha male', 'cuck', 'cock', 'broke bitch', 'pussy ass', 'turd', 'feminist', 'terrorist', 'islamist'
]

word2vec_model = api.load("word2vec-google-news-300")

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')


In [107]:
def sentiment_analysis(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

def detect_hate_speech(text):
    return sum([1 for word in hate_speech_words if word in text.lower()])

def pos_tagging(text):
    doc = nlp(text)
    pos_tags = [token.pos_ for token in doc]
    return pos_tags

def get_word2vec_embedding(text):
    tokens = [token.text.lower() for token in nlp(text)]
    word_vectors = [word2vec_model[word] for word in tokens if word in word2vec_model]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

def lexical_diversity(text):
    tokens = [token.text for token in nlp(text)]
    return len(set(tokens)) / len(tokens) if len(tokens) > 0 else 0

def sentence_complexity(text):
    return len(re.findall(r'\.', text))

def get_sbert_embedding(text):
    return sbert_model.encode(text, convert_to_numpy=True)


In [108]:
data['Preprocessed_content'] = data['Preprocessed_content'].astype(str)

### Balance the dataset

In [109]:
class_0 = data[data['Label'] == 0].sample(n=600, random_state=42)
class_1 = data[data['Label'] == 1].sample(n=76, random_state=42)
data = pd.concat([class_0, class_1])
data.reset_index(drop=True, inplace=True)

In [110]:
data['sentiment'] = data['Preprocessed_content'].apply(sentiment_analysis)
print('Sentiment analysis done')

data['hate_speech_count'] = data['Preprocessed_content'].apply(detect_hate_speech)
print('Hate speech detection done')

data['pos_tags'] = data['Preprocessed_content'].apply(pos_tagging)
print('POS tagging detection done')

data['word2vec'] = data['Preprocessed_content'].apply(get_word2vec_embedding)
print('Word2Vec embedding done')

data['sbert_embedding'] = data['Preprocessed_content'].apply(get_sbert_embedding)
print('SBERT embedding done')

data['lexical_diversity'] = data['Preprocessed_content'].apply(lexical_diversity)
print('Lexical diversity done')

data['sentence_complexity'] = data['Preprocessed_content'].apply(sentence_complexity)
print('Sentence complexity done')

Sentiment analysis done
Hate speech detection done
POS tagging detection done
Word2Vec embedding done
SBERT embedding done
Lexical diversity done
Sentence complexity done


In [111]:
data

Unnamed: 0,comment_text,Cleaned_Content,Preprocessed_content,Label,sentiment,hate_speech_count,pos_tags,word2vec,sbert_embedding,lexical_diversity,sentence_complexity
0,Click this link to sign up for $300 http://for...,click this link to sign up for httpformoneyonl...,click link sign httpformoneyonlycomrefer,0,0.000000,0,"[VERB, PROPN, NOUN, PROPN]","[-0.074788414, 0.022460938, -0.17325847, 0.134...","[-0.04272761, -0.061473675, -0.019803623, 0.00...",1.000000,0
1,Have you tried turning it off and on?,have you tried turning it off and on,try turn,0,0.000000,0,"[VERB, NOUN]","[0.08862305, 0.15405273, 0.078308105, 0.145263...","[0.02100541, -0.040340815, -0.03201399, -0.004...",1.000000,0
2,If u saying Jordan one of the best u ainât i...,if u saying jordan one of the best u aint in t...,u say jordan good u be not good ur statement s...,0,0.183333,1,"[PRON, VERB, PROPN, PROPN, PROPN, VERB, PART, ...","[-0.049316406, 0.03329827, 0.05125517, 0.07881...","[0.064440705, 0.04145527, -0.045367703, 0.0012...",0.888889,0
3,Unexpected turn of events: dude is Asian,unexpected turn of events dude is asian,unexpected turn event dude asian,0,0.050000,0,"[ADJ, NOUN, NOUN, VERB, PROPN]","[-0.020629883, 0.0014404297, 0.018041993, 0.16...","[-0.037429538, 0.09296043, 0.030556811, 0.0632...",1.000000,0
4,Made me cry...and that's a good thing...God bless,made me cryand thats a good thinggod bless,cryand s good thinggod bless,0,0.700000,0,"[PROPN, PART, ADJ, PROPN, VERB]","[-0.021647135, 0.14908855, -0.023803711, 0.171...","[-0.15125263, 0.063120745, 0.071638584, -0.044...",1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...
671,11:43 those are stripper heels..is Â Cristine ...,those are stripper heelsis cristine is a retir...,stripper heelsis cristine retire hoe,1,0.000000,1,"[ADJ, NOUN, NOUN, VERB, NOUN]","[0.052083332, -0.10514323, -0.038950603, 0.192...","[-0.06953118, 0.054789927, -0.025015866, -0.07...",1.000000,0
672,What a shit weak freestyle\n\nhttps://youtu.be...,what a shit weak freestylennhttpsyoutubeclslnl...,shit weak freestylennhttpsyoutubeclslnlzaati a...,1,-0.287500,0,"[PROPN, ADJ, PROPN, INTJ, NOUN, VERB, PROPN, P...","[0.026530266, -0.0042800903, -0.0074310303, 0....","[0.03838325, -0.107888035, 0.05925496, -0.0388...",1.000000,0
673,Ohh black nigaa give away legit? Lol,ohh black nigaa give away legit lol,ohh black nigaa away legit lol,1,0.316667,0,"[ADJ, ADJ, NOUN, ADV, NOUN, NOUN]","[-0.049121093, 0.016284179, 0.105603024, 0.153...","[-0.13155076, 0.07625024, -0.10333354, 0.04866...",1.000000,0
674,Y'all are a bunch of racist. When you like som...,yall are a bunch of racist when you like someo...,you bunch racist like label racist,1,0.000000,0,"[PRON, VERB, NOUN, ADP, NOUN, NOUN]","[0.017089844, 0.018656412, 0.119010925, 0.1214...","[0.014129858, 0.05866641, -0.13190544, 0.02975...",0.833333,0


### Save the dataset with extracted features

In [112]:
data = data.dropna()

In [113]:
data.to_csv("YouTube_Datasets/features_youtube.csv", index=False)