In [4]:
import matplotlib.pyplot as plt
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import string
import pickle
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
hate_dataset = pd.read_csv('/content/Data/2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv.zip', usecols=[2,3], names=['text', 'hate'], skiprows=1)
hate_dataset

Unnamed: 0,text,hate
0,It should be legal for a husband to rape his wife,hate
1,dalits are lowlife scum,hate
2,dalits are lowlives,hate
3,It was a better world when women dare not ques...,hate
4,I don't work this hard so that those immigrant...,hate
...,...,...
40618,we should forcibly remove all the foreigners f...,hate
40619,we should forcibly remove all the immigrants f...,hate
40620,dalits are unworthy of the Gods love,hate
40621,dalits are worthless,hate


In [109]:
hate_dataset_two = pd.read_csv('/content/Data/labeled_data.csv.zip', usecols=[5,6], names=['hate', 'text'], skiprows=1)
hate_dataset_two


Unnamed: 0,hate,text
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


In [113]:
hate_dataset_two['hate'].replace([0,1,2], [1,1,0], inplace=True)
hate_dataset_two.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   hate    24783 non-null  int64 
 1   text    24783 non-null  object
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


In [114]:
hate_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40623 entries, 0 to 40622
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   text                     40623 non-null  object
 1   hate                     40623 non-null  int64 
 2   Cleaned_text             40623 non-null  object
 3   Tokenized_text           40623 non-null  object
 4   WithoutStop_text         40623 non-null  object
 5   WithoutShorttokens_text  40623 non-null  object
 6   Stemmed_text             40623 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.2+ MB


In [7]:
hate_dataset['hate'].replace(['nothate','hate'], [0,1], inplace=True)
hate_dataset

Unnamed: 0,text,hate
0,It should be legal for a husband to rape his wife,1
1,dalits are lowlife scum,1
2,dalits are lowlives,1
3,It was a better world when women dare not ques...,1
4,I don't work this hard so that those immigrant...,1
...,...,...
40618,we should forcibly remove all the foreigners f...,1
40619,we should forcibly remove all the immigrants f...,1
40620,dalits are unworthy of the Gods love,1
40621,dalits are worthless,1


In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [117]:
def remove_punctuation(text):
  cleaned = ''.join([word for word in text if word not in string.punctuation])
  return cleaned

hate_dataset['Cleaned_text'] = hate_dataset['text'].apply(lambda x: remove_punctuation(x))
hate_dataset_two['Cleaned_text'] = hate_dataset_two['text'].apply(lambda x: remove_punctuation(x))

In [119]:
def tokenizer(text):
  #remove uupercase
  lower_text = text.lower()
  #Tokenize
  tokenized_text = nltk.word_tokenize(lower_text)
  return tokenized_text

hate_dataset['Tokenized_text'] = hate_dataset['Cleaned_text'].apply(lambda x: tokenizer(x))
hate_dataset_two['Tokenized_text'] = hate_dataset_two['Cleaned_text'].apply(lambda x: tokenizer(x))
hate_dataset_two

Unnamed: 0,hate,text,Cleaned_text,Tokenized_text
0,1,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compl...,"[rt, mayasolovely, as, a, woman, you, shouldnt..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuff...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a..."
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me ...,"[rt, shenikaroberts, the, shit, you, hear, abo..."
...,...,...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,yous a muthafin lie 8220LifeAsKing 20Pearls co...,"[yous, a, muthafin, lie, 8220lifeasking, 20pea..."
24779,1,"you've gone and broke the wrong heart baby, an...",youve gone and broke the wrong heart baby and ...,"[youve, gone, and, broke, the, wrong, heart, b..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like I aint fu...,"[young, buck, wan, na, eat, dat, nigguh, like,..."
24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]"


In [11]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [131]:
def remove_stopwords(text):
  without_stopwords = [word for word in text if word not in stopwords]
  return  without_stopwords

hate_dataset['WithoutStop_text'] = hate_dataset['Tokenized_text'].apply(lambda x: remove_stopwords(x))
hate_dataset_two['WithoutStop_text'] = hate_dataset_two['Tokenized_text'].apply(lambda x: remove_stopwords(x))
hate_dataset_two

Unnamed: 0,hate,text,Cleaned_text,Tokenized_text,WithoutStop_text
0,1,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compl...,"[rt, mayasolovely, as, a, woman, you, shouldnt...","[rt, mayasolovely, woman, shouldnt, complain, ..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuff...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f...","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a...","[rt, cganderson, vivabased, look, like, tranny]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me ...,"[rt, shenikaroberts, the, shit, you, hear, abo...","[rt, shenikaroberts, shit, hear, might, true, ..."
...,...,...,...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,yous a muthafin lie 8220LifeAsKing 20Pearls co...,"[yous, a, muthafin, lie, 8220lifeasking, 20pea...","[yous, muthafin, lie, 8220lifeasking, 20pearls..."
24779,1,"you've gone and broke the wrong heart baby, an...",youve gone and broke the wrong heart baby and ...,"[youve, gone, and, broke, the, wrong, heart, b...","[youve, gone, broke, wrong, heart, baby, drove..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like I aint fu...,"[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, na, eat, dat, nigguh, like,..."
24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]","[youu, got, wild, bitches, tellin, lies]"


In [128]:
hate_dataset

Unnamed: 0,text,hate,Cleaned_text,Tokenized_text,WithoutStop_text,WithoutShorttokens_text,Stemmed_text
0,It should be legal for a husband to rape his wife,1,It should be legal for a husband to rape his wife,"[it, should, be, legal, for, a, husband, to, r...","[rt, mayasolovely, woman, shouldnt, complain, ...","[legal, husband, rape, wife]","[legal, husband, rape, wife]"
1,dalits are lowlife scum,1,dalits are lowlife scum,"[dalits, are, lowlife, scum]","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c...","[dalits, lowlife, scum]","[dalit, lowlif, scum]"
2,dalits are lowlives,1,dalits are lowlives,"[dalits, are, lowlives]","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...","[dalits, lowlives]","[dalit, lowliv]"
3,It was a better world when women dare not ques...,1,It was a better world when women dare not ques...,"[it, was, a, better, world, when, women, dare,...","[rt, cganderson, vivabased, look, like, tranny]","[better, world, women, dare, question, men]","[better, world, women, dare, question, men]"
4,I don't work this hard so that those immigrant...,1,I dont work this hard so that those immigrants...,"[i, dont, work, this, hard, so, that, those, i...","[rt, shenikaroberts, shit, hear, might, true, ...","[dont, work, hard, immigrants, take, benefits]","[dont, work, hard, immigr, take, benefit]"
...,...,...,...,...,...,...,...
40618,we should forcibly remove all the foreigners f...,1,we should forcibly remove all the foreigners f...,"[we, should, forcibly, remove, all, the, forei...",,"[forcibly, remove, foreigners]","[forcibl, remov, foreign]"
40619,we should forcibly remove all the immigrants f...,1,we should forcibly remove all the immigrants f...,"[we, should, forcibly, remove, all, the, immig...",,"[forcibly, remove, immigrants]","[forcibl, remov, immigr]"
40620,dalits are unworthy of the Gods love,1,dalits are unworthy of the Gods love,"[dalits, are, unworthy, of, the, gods, love]",,"[dalits, unworthy, gods, love]","[dalit, unworthi, god, love]"
40621,dalits are worthless,1,dalits are worthless,"[dalits, are, worthless]",,"[dalits, worthless]","[dalit, worthless]"


In [132]:
def remove_shorttokens(text):
  without_shorttokens = [word for word in text if len(word)>2]
  return  without_shorttokens

hate_dataset['WithoutShorttokens_text'] = hate_dataset['WithoutStop_text'].apply(lambda x: remove_shorttokens(x))
hate_dataset_two['WithoutShorttokens_text'] = hate_dataset_two['WithoutStop_text'].apply(lambda x: remove_shorttokens(x))
hate_dataset_two

Unnamed: 0,hate,text,Cleaned_text,Tokenized_text,WithoutStop_text,WithoutShorttokens_text
0,1,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compl...,"[rt, mayasolovely, as, a, woman, you, shouldnt...","[rt, mayasolovely, woman, shouldnt, complain, ...","[mayasolovely, woman, shouldnt, complain, clea..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuff...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f...","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c...","[mleew17, boy, dats, coldtyga, dwn, bad, cuffi..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...","[urkindofbrand, dawg, 80sbaby4life, ever, fuck..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a...","[rt, cganderson, vivabased, look, like, tranny]","[cganderson, vivabased, look, like, tranny]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me ...,"[rt, shenikaroberts, the, shit, you, hear, abo...","[rt, shenikaroberts, shit, hear, might, true, ...","[shenikaroberts, shit, hear, might, true, migh..."
...,...,...,...,...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,yous a muthafin lie 8220LifeAsKing 20Pearls co...,"[yous, a, muthafin, lie, 8220lifeasking, 20pea...","[yous, muthafin, lie, 8220lifeasking, 20pearls...","[yous, muthafin, lie, 8220lifeasking, 20pearls..."
24779,1,"you've gone and broke the wrong heart baby, an...",youve gone and broke the wrong heart baby and ...,"[youve, gone, and, broke, the, wrong, heart, b...","[youve, gone, broke, wrong, heart, baby, drove...","[youve, gone, broke, wrong, heart, baby, drove..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like I aint fu...,"[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, eat, dat, nigguh, like, ain..."
24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]","[youu, got, wild, bitches, tellin, lies]","[youu, got, wild, bitches, tellin, lies]"


In [133]:
stemmer = nltk.PorterStemmer()

In [136]:
def stemming(text):
  stemmed_words = [stemmer.stem(word) for word in text]
  return stemmed_words

hate_dataset['Stemmed_text'] = hate_dataset['WithoutShorttokens_text'].apply(lambda x: stemming(x))
hate_dataset_two['Stemmed_text'] = hate_dataset_two['WithoutShorttokens_text'].apply(lambda x: stemming(x))
hate_dataset_two

Unnamed: 0,hate,text,Cleaned_text,Tokenized_text,WithoutStop_text,WithoutShorttokens_text,Stemmed_text
0,1,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compl...,"[rt, mayasolovely, as, a, woman, you, shouldnt...","[rt, mayasolovely, woman, shouldnt, complain, ...","[mayasolovely, woman, shouldnt, complain, clea...","[mayasolov, woman, shouldnt, complain, clean, ..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuff...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f...","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c...","[mleew17, boy, dats, coldtyga, dwn, bad, cuffi...","[mleew17, boy, dat, coldtyga, dwn, bad, cuffin..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...","[urkindofbrand, dawg, 80sbaby4life, ever, fuck...","[urkindofbrand, dawg, 80sbaby4lif, ever, fuck,..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a...","[rt, cganderson, vivabased, look, like, tranny]","[cganderson, vivabased, look, like, tranny]","[cganderson, vivabas, look, like, tranni]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me ...,"[rt, shenikaroberts, the, shit, you, hear, abo...","[rt, shenikaroberts, shit, hear, might, true, ...","[shenikaroberts, shit, hear, might, true, migh...","[shenikarobert, shit, hear, might, true, might..."
...,...,...,...,...,...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,yous a muthafin lie 8220LifeAsKing 20Pearls co...,"[yous, a, muthafin, lie, 8220lifeasking, 20pea...","[yous, muthafin, lie, 8220lifeasking, 20pearls...","[yous, muthafin, lie, 8220lifeasking, 20pearls...","[you, muthafin, lie, 8220lifeask, 20pearl, cor..."
24779,1,"you've gone and broke the wrong heart baby, an...",youve gone and broke the wrong heart baby and ...,"[youve, gone, and, broke, the, wrong, heart, b...","[youve, gone, broke, wrong, heart, baby, drove...","[youve, gone, broke, wrong, heart, baby, drove...","[youv, gone, broke, wrong, heart, babi, drove,..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like I aint fu...,"[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, eat, dat, nigguh, like, ain...","[young, buck, wan, eat, dat, nigguh, like, ain..."
24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]","[youu, got, wild, bitches, tellin, lies]","[youu, got, wild, bitches, tellin, lies]","[youu, got, wild, bitch, tellin, lie]"


In [137]:
def unique_words(df, col_name, tokenized=False):
    elements = []
    df_copied = df.copy()
    df_copied.reset_index(drop=True, inplace=True)

    for i in range(df_copied.shape[0]):
            elements = elements + df_copied.loc[i, col_name]
    return list(elements)

In [138]:
hate_text = ' '.join(unique_words(df=hate_dataset[hate_dataset['hate']==1], col_name='Stemmed_text'))
hate_text2 = ' '.join(unique_words(df=hate_dataset_two[hate_dataset_two['hate']==1], col_name='Stemmed_text'))

In [139]:
nothate_text = ' '.join(unique_words(df=hate_dataset[hate_dataset['hate']==0], col_name='Stemmed_text'))
nothate_text2 = ' '.join(unique_words(df=hate_dataset_two[hate_dataset_two['hate']==0], col_name='Stemmed_text'))

In [20]:
class RemovePunctuationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: remove_punctuation(x))
        return X_

In [21]:
class TokenizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: tokenizer(x))
        return X_

In [22]:
class RemoveStopwordsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: remove_stopwords(x))
        return X_

In [23]:
class RemoveShortTokensTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: remove_shorttokens(x))
        return X_

In [24]:
class StemmingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: stemming(x))
        return X_

In [25]:
class  ReturnStringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: " ".join(x))
        X_ = pd.Series(X_[self.text_column])
        return X_

In [142]:
combined_dataset = pd.concat([hate_dataset, hate_dataset_two], ignore_index=True)
combined_dataset.tail()

Unnamed: 0,text,hate,Cleaned_text,Tokenized_text,WithoutStop_text,WithoutShorttokens_text,Stemmed_text
65401,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1,yous a muthafin lie 8220LifeAsKing 20Pearls co...,"[yous, a, muthafin, lie, 8220lifeasking, 20pea...","[yous, muthafin, lie, 8220lifeasking, 20pearls...","[yous, muthafin, lie, 8220lifeasking, 20pearls...","[you, muthafin, lie, 8220lifeask, 20pearl, cor..."
65402,"you've gone and broke the wrong heart baby, an...",1,youve gone and broke the wrong heart baby and ...,"[youve, gone, and, broke, the, wrong, heart, b...","[youve, gone, broke, wrong, heart, baby, drove...","[youve, gone, broke, wrong, heart, baby, drove...","[youv, gone, broke, wrong, heart, babi, drove,..."
65403,young buck wanna eat!!.. dat nigguh like I ain...,1,young buck wanna eat dat nigguh like I aint fu...,"[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, na, eat, dat, nigguh, like,...","[young, buck, wan, eat, dat, nigguh, like, ain...","[young, buck, wan, eat, dat, nigguh, like, ain..."
65404,youu got wild bitches tellin you lies,1,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]","[youu, got, wild, bitches, tellin, lies]","[youu, got, wild, bitches, tellin, lies]","[youu, got, wild, bitch, tellin, lie]"
65405,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...,1,Ruffled Ntac Eileen Dahlia Beautiful color c...,"[ruffled, ntac, eileen, dahlia, beautiful, col...","[ruffled, ntac, eileen, dahlia, beautiful, col...","[ruffled, ntac, eileen, dahlia, beautiful, col...","[ruffl, ntac, eileen, dahlia, beauti, color, c..."


In [150]:
X = combined_dataset[['text']].copy()
y = combined_dataset['hate'].copy()

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.12, random_state = 2023)

In [148]:
nlp_pipeline_LogisticRegression = Pipeline(steps=[
    ('remove_punctuation', RemovePunctuationTransformer(text_column='text')),
    ('tokenize', TokenizerTransformer(text_column='text')),
    ('remove_stop_words', RemoveStopwordsTransformer(text_column='text')),
    ('remove_short_tokens', RemoveShortTokensTransformer(text_column='text')),
    ('stemmer', StemmingTransformer(text_column='text')),
    ('vektoring', ReturnStringTransformer(text_column='text')),
    ('count_vectorizer', CountVectorizer(min_df=0.0001, max_df=0.5, ngram_range=(1,3))),
    ('model', LogisticRegression())
])


nlp_pipeline_LogisticRegression.fit(X_train, y_train)

In [151]:
#LogisticRegression

# define evaluation
cv = RepeatedKFold(n_splits=8, n_repeats=2, random_state=1)

# define search space
space = dict()
space['model__solver'] = ['lbfgs', 'liblinear', 'saga']
space['model__penalty'] = ['l1', 'l2', 'elasticnet']

# define search
search = GridSearchCV(nlp_pipeline_LogisticRegression, space, scoring='accuracy', n_jobs=-1, cv=cv, verbose=2)

# execute search
result = search.fit(X_train, y_train)

# summarize result
model_LogisticRegression = result.best_estimator_

Fitting 16 folds for each of 9 candidates, totalling 144 fits


In [152]:
print(f'F1-score dla LogisticRegression: {f1_score(y_test, model_LogisticRegression.predict(X_test))}')

F1-score dla LogisticRegression: 0.8867461430575034


In [165]:
X_new = pd.DataFrame({'text': ["please die and go kill yourself as soon as possible", "this is good and bad", "you are retarded", "this was a really great demonstration", "you are indian so you should really use some deodarant you smelly pig"]})
X_new

Unnamed: 0,text
0,please die and go kill yourself as soon as pos...
1,this is good and bad
2,you are retarded
3,this was a really great demonstration
4,you are indian so you should really use some d...


In [157]:
model_LogisticRegression.predict(X_new)

array([0, 0, 1, 0, 1])

In [158]:
import joblib

# Assuming nlp_pipeline_LogisticRegression is your trained pipeline
# and you have already run nlp_pipeline_LogisticRegression.fit(X_train, y_train)

# Save the model to a file
joblib.dump(nlp_pipeline_LogisticRegression, 'combinedmodel.joblib')


['combinedmodel.joblib']

In [166]:
# Assuming your model is already trained and has a method predict_proba
# that returns probability estimates for each class, and you're working with a binary classification problem

# Get probability estimates for the positive class
y_probabilities = model_LogisticRegression.predict_proba(X_new)[:, 1]

# Set a custom threshold (e.g., 0.3)
custom_threshold = 0.5

# Apply the custom threshold to make predictions
y_custom_predictions = (y_probabilities > custom_threshold).astype(int)
print(y_custom_predictions)

[0 0 1 0 1]
