# NLP Toxcicty Machine Learning Model

## Imports

In [2]:
import pandas as pd
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.multioutput import MultiOutputClassifier
import re

## Data

In [4]:
df_train= pd.read_csv('train.csv', index_col = 'id')

EmptyDataError: No columns to parse from file

In [5]:
df_train.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159571 entries, 0000997932d777bf to fff46fc426af1f9a
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   159571 non-null  object
 1   toxic          159571 non-null  int64 
 2   severe_toxic   159571 non-null  int64 
 3   obscene        159571 non-null  int64 
 4   threat         159571 non-null  int64 
 5   insult         159571 non-null  int64 
 6   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 9.7+ MB


In [71]:
df_test = pd.read_csv('data/test_labels.csv')

In [72]:
df_test.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [9]:
X = df_train.comment_text
y = df_train[['toxic', 'severe_toxic', 'obscene', 'threat','identity_hate']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify=['toxic', 'severe_toxic','identity_hate'])
y_toxic_train = df_train.toxic
y_severe_toxic_train = df_train.severe_toxic
y_obscene_train = df_train.obscene
y_threat_train = df_train.threat
y_insult_train = df_train['identity_hate']

In [41]:
sss = StratifiedShuffleSplit(random_state = 42, train_size = .8)

In [42]:
sss.get_n_splits(X,y)

10

In [43]:
print(sss)

StratifiedShuffleSplit(n_splits=10, random_state=42, test_size=None,
            train_size=0.8)


In [44]:
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [12]:
X_train.head()

id
c912439967ba8a35    ,  I have added the newline sign back in, you ...
b312f612d3394d5b    Don't worry, I think I fixed it. But still loo...
813c210bf7f27377    "\nWell it seems to me that you have a problem...
c231999bc75dcd9e    Contents\nThe contents box just appears when t...
d60a1de68cf593c1    " \n\n(Btw., did you know the article links to...
Name: comment_text, dtype: object

In [15]:
y_train.value_counts()

toxic  severe_toxic  obscene  threat  identity_hate
0      0             0        0       0                107725
1      0             0        0       0                  5202
                     1        0       0                  4142
       1             1        0       0                   855
       0             1        0       1                   488
0      0             1        0       0                   378
1      0             0        0       1                   201
       1             1        0       1                   196
       0             1        1       0                   105
                     0        1       0                   100
0      0             0        0       1                    61
1      1             1        1       0                    56
       0             1        1       1                    43
       1             0        0       0                    36
                     1        1       1                    28
0      0          

In [16]:
# Function for improving parts of speech information

### get_wordnet_pos was taken from Lecture 51-nlp_modeling.ipynb 
### link to the lecture: https://github.com/dvdhartsman/NTL-DS-080723/blob/main/4phase/51-nlp_modeling.ipynb


def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
# Function for handling the transformation of data

### preprocess taken from nlp-sentiment-analysis
### link to the project: https://github.com/dvdhartsman/NLP-Sentiment-Analysis/blob/main/Text_Classification_Final_Notebook.ipynb

def preprocess(tweet):
    """
    This is a function that is intended to handle all of the tokenization, lemmatization, and other
    preprocessing for our tweet data. It will make use of objects from other libraries, and will return
    a complete list of tokens that are ready to be vectorized into numerical data.
    """
    
    # Create a list of stopwords to be removed from our tokenized word list
    stops = stopwords.words("english")
    # Add punctuation to the list of stopwords
    stops += string.punctuation
    # Providing a regex pattern for the tokenizer to handle
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # Instantiating a tokenizer
    tokenizer = RegexpTokenizer(pattern)
    # Creating a list of raw tokens
    raw_tokens = tokenizer.tokenize(tweet)
    # Using a comprehension to lower case every token
    lower_tokens = [i.lower() for i in raw_tokens]
    # Remove the stopwords from the list of tokens
    stopped_words = [i for i in lower_tokens if i not in stops]
    
    # Adding parts of speech to prepare for Lemmatization
    
    # This is the initial method to get parts of speech
    stopped_words = pos_tag(stopped_words)
    
    # Get_wordnet_pos() is the function to modify the pos definitions/assignments, creates tuples of (<word>, <pos>)
    stopped_words = [(word[0], get_wordnet_pos(word[1])) for word in stopped_words]
    
    lemmatizer = WordNetLemmatizer() 
    
    # This corrects the parts of speech and maximizes the usefulness of the lemmatization!!!!!
    document = [lemmatizer.lemmatize(word[0], word[1]) for word in stopped_words]
    
    # Re-join the list of cleaned tokens
    cleaned_doc = " ".join(document)
    return cleaned_doc

In [78]:
#X_train.apply(preprocess)

"""id
0000997932d777bf    explanation edits make username hardcore metal...
000103f0d9cfb60f    d'aww match background colour i'm seemingly st...
000113f07ec002fd    hey man i'm really try edit war guy constantly...
0001b41b1c6bb37e    can't make real suggestion improvement wonder ...
0001d958c54c6e35                 sir hero chance remember page that's
                                          ...                        
ffe987279560d7ff    second time ask view completely contradict cov...
ffea4adeee384e90                 ashamed horrible thing put talk page
ffee36eab5c267c9    spitzer umm there actual article prostitution ...
fff125370e4aaaf3    look like actually put speedy first version de...
fff46fc426af1f9a    really think understand come idea bad right aw...
Name: comment_text, Length: 159571, dtype: object"""

id
0000997932d777bf    explanation edits make username hardcore metal...
000103f0d9cfb60f    d'aww match background colour i'm seemingly st...
000113f07ec002fd    hey man i'm really try edit war guy constantly...
0001b41b1c6bb37e    can't make real suggestion improvement wonder ...
0001d958c54c6e35                 sir hero chance remember page that's
                                          ...                        
ffe987279560d7ff    second time ask view completely contradict cov...
ffea4adeee384e90                 ashamed horrible thing put talk page
ffee36eab5c267c9    spitzer umm there actual article prostitution ...
fff125370e4aaaf3    look like actually put speedy first version de...
fff46fc426af1f9a    really think understand come idea bad right aw...
Name: comment_text, Length: 159571, dtype: object

In [18]:
X_train_clean = X_train.apply(preprocess)

In [19]:
X_test_clean = X_test.apply(preprocess)

In [22]:
dummy = DummyClassifier(strategy='most_frequent')

In [23]:
dummy.fit(X_train,y_train)

In [27]:
classification_report(y_train,dummy.predict(X_train))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00     11479\n           1       0.00      0.00      0.00      1189\n           2       0.00      0.00      0.00      6306\n           3       0.00      0.00      0.00       373\n           4       0.00      0.00      0.00      1048\n\n   micro avg       0.00      0.00      0.00     20395\n   macro avg       0.00      0.00      0.00     20395\nweighted avg       0.00      0.00      0.00     20395\n samples avg       0.00      0.00      0.00     20395\n'