In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [9]:
df = pd.read_csv('data/train.csv')

In [11]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [12]:
np.random.seed(500)


In [16]:
# Split into Train and Test data
#(X, Y, test_size=0.3, random_state=42)
X = df.pop('comment_text')
df.pop('id')
y = df.sum(axis = 1).to_frame(name = 'toxic')
y.loc[y['toxic'] >0,'toxic'] = 1
print(X.head())
print(y.head())


KeyError: 'comment_text'

In [18]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,test_size=0.3)
Encoder = LabelEncoder()
# Make Labels
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

  y = column_or_1d(y, warn=True)


## CountVectorizer

In [19]:
# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)
print("X_train_vectorized: ")
X_train_vectorized


X_train_vectorized: 


<111699x153097 sparse matrix of type '<class 'numpy.int64'>'
	with 4855587 stored elements in Compressed Sparse Row format>

In [20]:
print("X_train shape = {}".format(X_train.shape))
print("Vocabulary length = {}".format(len(vect.vocabulary_)))

X_train shape = (111699,)
Vocabulary length = 153097


In [73]:
sorted(vect.vocabulary_.items(), key=lambda x: x[1])[:20]
print(X_train_vectorized.toarray())


## Logistic Regression

In [21]:
# Train the model
model = LogisticRegression(max_iter=1500)
model.fit(X_train_vectorized, y_train)

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print("AUC = {:.3f}".format(roc_auc_score(y_test, predictions)))

AUC = 0.837


In [22]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model (from lowest to highest values)
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print("Smallest Coefs:\n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}".format(feature_names[sorted_coef_index[:-11:-1]]))



Smallest Coefs:
['weve' 'sikhs' 'nambla' 'catches' 'extensive' 'brighton' 'sniping'
 'letting' 'ground' 'semi']

Largest Coefs: 
['fuck' 'fucking' 'bitch' 'asshole' 'idiot' 'faggot' 'ass' 'shit' 'suck'
 'sucks']


## Term Frequency - Inverse Document Frequency

In [25]:
# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 15
# This means a word should have been used in at least 15 SMS 
vect = TfidfVectorizer(min_df=15).fit(X_train)

# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

# let's look of some of the words gathered with this method
sorted(vect.vocabulary_.items(), key=lambda x: x[1])[10:30]

[('08', 10),
 ('084080', 11),
 ('09', 12),
 ('0px', 13),
 ('10', 14),
 ('100', 15),
 ('1000', 16),
 ('101', 17),
 ('102', 18),
 ('103', 19),
 ('104', 20),
 ('105', 21),
 ('106', 22),
 ('107', 23),
 ('108', 24),
 ('109', 25),
 ('10th', 26),
 ('11', 27),
 ('110', 28),
 ('111', 29)]

In [26]:
# how many words appear in more than 15 text messages
len(sorted(vect.vocabulary_.items(), key=lambda x: x[1]))

14307

In [27]:
# save all feature names == words in an array
feature_names = np.array(vect.get_feature_names())

#sort for the column names according to highest tfidf value in the column
sorted_tfidf_index = X_train_vectorized.toarray().max(0).argsort()

# print words with highest and lowest tfidf values
print("Smallest tfidf:\n{}\n".format(feature_names[sorted_tfidf_index[:10]]))
print("Largest tfidf: \n{}".format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['templatename' 'signified' 'banished' 'conformance' 'seashell' '084080'
 'demean' 'fair_use' 'annually' 'striving']

Largest tfidf: 
['stop' 'mean' 'am' 'faggot' 'wright' 'hey' 'faggots' 'fags' 'singer' 'hi']


In [28]:
# Train the model
model = LogisticRegression(max_iter=1500)
model.fit(X_train_vectorized, y_train)

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print("AUC = {:.3f}".format(roc_auc_score(y_test, predictions)))

AUC = 0.803


In [29]:
# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print("Smallest Coefs:\n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['thank' 'talk' 'thanks' 'please' 'if' 'redirect' 'for' 'help' 'continue'
 'welcome']

Largest Coefs: 
['fuck' 'fucking' 'shit' 'idiot' 'stupid' 'ass' 'bullshit' 'asshole'
 'crap' 'suck']


## Stemming

In [36]:
import sys
print(sys.getrecursionlimit())
sys.setrecursionlimit(5000)


1500


In [37]:
# Initializing stemmer and countvectorizer 
stemmer = nltk.PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

# define ContVectorizer with stemming function 
stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

# Transform X_train
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

In [38]:
# Train the model with stemmed and vectorized dataset
model_stemm = LogisticRegression(max_iter=1500)
model_stemm.fit(X_train_stem_vectorized, y_train)

# Predict the transformed test documents
predictions = model_stemm.predict(stem_vectorizer.transform(X_test))

print("AUC = {:.3f}".format(roc_auc_score(y_test, predictions)))

AUC = 0.843


In [39]:
# get the feature names as numpy array
feature_names = np.array(stem_vectorizer.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model_stemm.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print("Smallest Coefs:\n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['nambla' 'stress' 'mistress' 'brighton' 'mutual' 'papphas' 'belittl'
 'gregalton' 'recommend' 'weve']

Largest Coefs: 
['fuck' 'faggot' 'asshol' 'bitch' 'suck' 'idiot' 'shit' 'ass' 'bullshit'
 'cunt']




## Lemmatization

In [46]:
# Initialization
nltk.download('wordnet')
nltk.download('omw-1.4')
WNlemma = nltk.WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()
# analyzer = TfidfVectorizer(min_df=15).build_analyzer()

def lemmatize_word(doc):
    return (WNlemma.lemmatize(t) for t in analyzer(doc))

#lemm_vectorizer = CountVectorizer(analyzer=lemmatize_word)
lemm_vectorizer = TfidfVectorizer(min_df=15, analyzer=lemmatize_word)

# Transform X_train
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)

[nltk_data] Downloading package wordnet to /Users/kw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [43]:
X_train_lemm_vectorized.shape

(111699, 144313)

In [47]:
# Train the model with stemmed and vectorized dataset
model_lemm = LogisticRegression(max_iter=1500)
model_lemm.fit(X_train_lemm_vectorized, y_train)

# Predict the transformed test documents
predictions = model_lemm.predict(lemm_vectorizer.transform(X_test))

print("AUC = {:.3f}".format(roc_auc_score(y_test, predictions)))

AUC = 0.811


In [48]:
# get the feature names as numpy array
feature_names = np.array(lemm_vectorizer.get_feature_names_out())


# Sort the coefficients from the model
sorted_coef_index = model_lemm.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print("Smallest Coefs:\n{}\n".format(feature_names[sorted_coef_index[:10]]))
print("Largest Coefs: \n{}".format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['thank' 'talk' 'thanks' 'if' 'please' 'for' 'redirect' 'continue' 'help'
 'welcome']

Largest Coefs: 
['fuck' 'fucking' 'idiot' 'shit' 'stupid' 'as' 'suck' 'asshole' 'bullshit'
 'crap']




In [49]:
SVM = svm.SVC(C=1.0, kernel='linear', gamma='scale')
SVM.fit(X_train_lemm_vectorized, y_train)


print("AUC = {:.3f}".format(roc_auc_score(y_test, predictions)))

# predict the labels on validation dataset
predictions_SVM = SVM.predict(lemm_vectorizer.transform(X_test))
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_Y)*100)

AUC = 0.811


NameError: name 'test_Y' is not defined