In [None]:
!pip install bert-embedding

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive so the notebook can read files from it
# (Colab-only; the dataset path used later lives under this mount point).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources this notebook relies on: the stop-word list, the
# punkt tokenizer models, and the WordNet data (plus its multilingual index).
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
  nltk.download(nltk_resource)

# English stop words as a list; filtered out during text cleaning.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Location of the ETHOS binary CSV inside the mounted Google Drive.
# Adjust this path if the dataset lives elsewhere.
ethos_path = '/content/drive/MyDrive/Minor Project/ETHOS_binary.csv'

# Load Dataset

In [None]:
# Read the dataset into a DataFrame and preview the first rows.
# The preview shows `text` and `label` columns plus an `Unnamed: 0` column,
# which presumably is a saved row index — TODO confirm before relying on it.
df = pd.read_csv(ethos_path)
df.head()

Unnamed: 0,text,label
0,You should know women's sports are a joke,1
1,You look like Sloth with deeper Down’s syndrome,1
2,You look like Russian and speak like Indian. B...,1
3,"Women deserve to be abused, I guess.",1
4,Women are made for making babies and cooking d...,1


In [None]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def clean_text(txt):
  """Normalise a raw comment into a space-joined string of stemmed tokens.

  Steps: lowercase the text, replace every character other than letters,
  digits, apostrophes and spaces with a space, collapse runs of spaces,
  tokenize with ``word_tokenize``, drop English stop words, and apply the
  Porter stemmer to the remaining tokens.

  Args:
    txt: Raw text as a ``str``.

  Returns:
    The cleaned text: stemmed tokens joined by single spaces.
  """
  # Build the stop-word lookup once per call instead of once per token.
  stop_set = set(stop_words)

  txt = txt.lower() #lowercase
  txt = re.sub(r"[^a-zA-Z0-9' ]", ' ', txt) #remove special characters
  txt = re.sub(r' +', ' ', txt) #remove extra spaces

  # Apostrophes survive the regex above, so the tokenizer can emit pieces
  # such as "'s" as separate tokens (e.g. "women's" -> "women", "'s").
  # To lemmatize instead of stem, swap stemmer.stem for lemmatizer.lemmatize.
  tokens = [stemmer.stem(word) for word in word_tokenize(txt) if word not in stop_set]
  return ' '.join(tokens)

In [None]:
# Add a `cleaned_text` column holding the normalised version of every row's
# `text` (produced by clean_text), then preview the result.
df['cleaned_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,label,cleaned_text
0,You should know women's sports are a joke,1,know women 's sport joke
1,You look like Sloth with deeper Down’s syndrome,1,look like sloth deeper syndrom
2,You look like Russian and speak like Indian. B...,1,look like russian speak like indian disgust go...
3,"Women deserve to be abused, I guess.",1,women deserv abus guess
4,Women are made for making babies and cooking d...,1,women made make babi cook dinner noth els


In [None]:
# Scan the cleaned texts for the one with the most space-separated tokens.
# `mx` ends up as that token count and `bigtext` as the text itself; on ties
# the earliest row wins because only a strictly larger count replaces it.
mx, bigtext = 0, ""
for candidate in df['cleaned_text']:
  n_tokens = len(candidate.split(' '))
  if n_tokens > mx:
    mx, bigtext = n_tokens, candidate

bigtext

'need nazi parti return get anoth hitler form ss start ethnic cleans muslim ye ye gather put concentr camp put muslim refuge ship send back thier countri want live thier way thier ridicul law assimil new western countri tri dictat us even remov jesu school public area fuck seriou fuck dirti smell muslim stop dictat cant countri peopl power must rise say enough enough take sweden back swede take greec back greek take germani back take poland back take denmark back save countri take europ back introduc ss style polic clean street ghetto full dirti muslim govern fault allow th shit happen send militari send brutal polic start clean countri europ muslim invad europ allow happen ancient time even 100 year ago islam invad europ would war bring back crusad ss style nazi polic must unit stand protect democrat countri law way elimin islam throw dirti smell muslim anim send back countri live wish fuck goat rape women fuck disgust muslim low life dirti race islam nit god religion cult ridicul une

# Sentence to Vector 

In [None]:
class TfidfEmbedding():
  """Sentence encoder backed by a ``TfidfVectorizer``.

  Args:
    ngram_range: ``(min_n, max_n)`` n-gram sizes passed to the vectorizer.
    max_features: Vocabulary size cap passed to the vectorizer.
  """

  def __init__(self, ngram_range = (1,3), max_features = 1000):
    self.tfidf = TfidfVectorizer(ngram_range = ngram_range, max_features = max_features)

  def CreateSentenceEmbeddings(self, corpus, train = True):
    """Turn a corpus of texts into TF-IDF vectors.

    Args:
      corpus: Iterable of text documents.
      train: When truthy, fit the vectorizer on ``corpus`` before
        transforming it; otherwise reuse the already-fitted vectorizer.

    Returns:
      Whatever the vectorizer's ``fit_transform`` / ``transform`` returns
      for ``corpus``.
    """
    # Fit only on training data so held-out text never shapes the vocabulary.
    if train:
      return self.tfidf.fit_transform(corpus)
    return self.tfidf.transform(corpus)

In [None]:
# Fresh encoder with the default n-gram range and vocabulary cap.
encoder = TfidfEmbedding()

# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
train_text, test_text, train_labels, test_labels = train_test_split(
    df['cleaned_text'],
    np.array(df['label']),
    test_size=0.33,
    random_state=0,
)

# Fit the encoder on the training text only, then reuse it for the test text.
train_corpus = encoder.CreateSentenceEmbeddings(train_text, train=True)
test_corpus = encoder.CreateSentenceEmbeddings(test_text, train=False)

print(train_corpus.shape, test_corpus.shape)
print(train_labels.shape, test_labels.shape)

(668, 1000) (330, 1000)
(668,) (330,)


# Model Training

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
def accuracy(y, y_hat):
  """Compute binary-classification metrics for labels encoded as 0/1.

  Args:
    y: Ground-truth labels (array-like of 0/1).
    y_hat: Predicted labels (array-like of 0/1), same shape as ``y``.

  Returns:
    dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
    ``'f1-score'``, each a Python float. A metric whose denominator is zero
    (e.g. precision when nothing was predicted positive) is reported as 0.0.

  Raises:
    AssertionError: If ``y`` and ``y_hat`` differ in shape.
  """
  # Accept lists/Series as well as arrays.
  y = np.asarray(y)
  y_hat = np.asarray(y_hat)
  assert y.shape == y_hat.shape, "y and y_hat must have the same shape"

  # Confusion-matrix counts for the positive class (label 1).
  tp = int(((y == 1) & (y_hat == 1)).sum())
  tn = int(((y == 0) & (y_hat == 0)).sum())
  fp = int(((y == 0) & (y_hat == 1)).sum())
  fn = int(((y == 1) & (y_hat == 0)).sum())

  def _ratio(numerator, denominator):
    # Exact division with an explicit zero guard; adding an epsilon to the
    # denominator would bias every result (a perfect score could never be 1.0).
    return float(numerator / denominator) if denominator else 0.0

  precision = _ratio(tp, tp + fp)
  recall = _ratio(tp, tp + fn)

  return {
      'accuracy': _ratio(tp + tn, y.shape[0]),
      'precision': precision,
      'recall': recall,
      'f1-score': _ratio(2 * precision * recall, precision + recall)
  }

In [None]:
# Support Vector Machine
class SVM():
  """Train/predict wrapper around an ``SVC`` created with ``random_state=0``."""

  def __init__(self):
    self.svm = SVC(random_state=0)

  def train(self, X, y):
    """Fit the wrapped SVC on features ``X`` and labels ``y``."""
    self.svm.fit(X, y)

  def predict(self, X):
    """Return the wrapped SVC's predictions for ``X``."""
    return self.svm.predict(X)

# Logistic Regression
class Logistic_Regression():
  """Train/predict wrapper around a ``LogisticRegression`` created with ``random_state=0``."""

  def __init__(self):
    self.clf = LogisticRegression(random_state=0)

  def train(self, X, y):
    """Fit the wrapped classifier on features ``X`` and labels ``y``."""
    self.clf.fit(X, y)

  def predict(self, X):
    """Return the wrapped classifier's predictions for ``X``."""
    return self.clf.predict(X)

# Random Forest
class Random_Forest():
  """Train/predict wrapper around a ``RandomForestClassifier`` created with ``random_state=0``."""

  def __init__(self):
    self.clf = RandomForestClassifier(random_state=0)

  def train(self, X, y):
    """Fit the wrapped classifier on features ``X`` and labels ``y``."""
    self.clf.fit(X, y)

  def predict(self, X):
    """Return the wrapped classifier's predictions for ``X``."""
    return self.clf.predict(X)

# Gradient Boosting
class Gradient_Boost():
  """Train/predict wrapper around a ``GradientBoostingClassifier`` created with ``random_state=0``."""

  def __init__(self):
    self.clf = GradientBoostingClassifier(random_state=0)

  def train(self, X, y):
    """Fit the wrapped classifier on features ``X`` and labels ``y``."""
    self.clf.fit(X, y)

  def predict(self, X):
    """Return the wrapped classifier's predictions for ``X``."""
    return self.clf.predict(X)


In [None]:
# Initialize the model. Exactly one wrapper is active at a time; to try a
# different classifier, comment out the active line and uncomment another.
# model = SVM()
# model = Logistic_Regression() 
# model = Random_Forest()
model = Gradient_Boost()

In [None]:
# Fit the selected model on the TF-IDF training vectors and their labels.
model.train(train_corpus, train_labels)

In [None]:
# Predict on the held-out vectors and report the metrics dict
# (accuracy, precision, recall, f1-score) against the true test labels.
y_hat = model.predict(test_corpus)
accuracy(test_labels, y_hat)

{'accuracy': 0.6424242229568418,
 'precision': 0.5909090371900875,
 'recall': 0.47101445862214064,
 'f1-score': 0.524193506113427}