In [1]:
import pandas as pd, numpy as np
import re

In [2]:
train0 = pd.read_csv("train.csv")

In [3]:
train0.head()

Unnamed: 0,id,comment_text,toxic
0,e617e2489abe9bca,"""\r\n\r\n A barnstar for you! \r\n\r\n The De...",0
1,9250cf637294e09d,"""\r\n\r\nThis seems unbalanced. whatever I ha...",0
2,ce1aa4592d5240ca,"Marya Dzmitruk was born in Minsk, Belarus in M...",0
3,48105766ff7f075b,"""\r\n\r\nTalkback\r\n\r\n Dear Celestia... """,0
4,0543d4f82e5470b6,New Categories \r\n\r\nI honestly think that w...,0


In [4]:
train0.describe()

Unnamed: 0,toxic
count,5000.0
mean,0.0874
std,0.282449
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [5]:
#About 22% records are toxic
train0.describe().sum(axis=1)

count    5000.000000
mean        0.087400
std         0.282449
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
dtype: float64

#### Converting to list for easy manipulation

In [6]:
comments = train0.comment_text.values

In [7]:
len(comments)

5000

### Text clean up 
- Using Regular expressions, remove IP addresses  
- Using Regular expressions, remove URLs  
- Normalize the case  
- Remove stop words  
- Remove punctuations

Removing ip address

In [8]:
re.sub('[\d+\.{3}]\d+',"","My ip is 127.0.0.9, friend")

'My ip is , friend'

In [9]:
comments_noip = [re.sub('[\d+\.{3}]\d+',"",txt) for txt in comments]

Normalizing case

In [10]:
comments_lower = [txt.lower() for txt in comments_noip]

In [11]:
comments_lower[2:4]

['marya dzmitruk was born in minsk, belarus in march , . her mother, olga nikolaevna moroz was born in baranovichi, belarus and her father was born in brest, belarus. she is second child in the family. her parents divorced in  and soon after her father remarried and had two more children. \r\nmarya, at the age of 4 began doing gymnastics, but quit two years later because she was denied a medal in a competition, where her age was incorrectly marked. when she turned 6 years old, she got admitted to music school #4 in minsk, class of violin, and to public school # with piano classes as a main course. at the age of , marya starred in belarusfilm movie called “dunechka”. soon after she started to play in theatre and was featured in television shows. by  her mother decided to move to united states. in september of  marya went to her first american school, ingrid b. lacy middle school. she graduated in spring  and traveled back to belarus for 2 months. in august  she went to oceana high schoo

Remove URLs

In [12]:
re.sub("\w+://\S+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

'@Rahim this course rocks! '

In [13]:
comments_nourl = [re.sub("\w+://\S+","", txt) for txt in comments_lower]

In [14]:
comments_nourl = [txt.replace("\'","") for txt in comments_nourl]

Remove extra line breaks

#### Tokenize

In [15]:
from nltk.tokenize import word_tokenize

In [16]:
print(word_tokenize(comments_nourl[0]))

['``', 'a', 'barnstar', 'for', 'you', '!', 'the', 'defender', 'of', 'the', 'wiki', 'barnstar', 'i', 'like', 'your', 'edit', 'on', 'the', 'kayastha', 'page', '.', 'lets', 'form', 'a', 'solidarity', 'group', 'against', 'those', 'who', 'malign', 'the', 'article', 'and', 'its', 'subject', 'matter', '.', 'i', 'propose', 'the', 'folloing', 'name', 'for', 'the', 'group', '.', 'united', 'intellectuals', 'front', 'of', 'kayastha', 'ethinicty', 'against', 'racist', 'or', 'castist', 'abuse', '(', 'uifkearca', ')', '``']


In [17]:
comment_tokens = [word_tokenize(sent) for sent in comments_nourl]
print(comment_tokens[0])

['``', 'a', 'barnstar', 'for', 'you', '!', 'the', 'defender', 'of', 'the', 'wiki', 'barnstar', 'i', 'like', 'your', 'edit', 'on', 'the', 'kayastha', 'page', '.', 'lets', 'form', 'a', 'solidarity', 'group', 'against', 'those', 'who', 'malign', 'the', 'article', 'and', 'its', 'subject', 'matter', '.', 'i', 'propose', 'the', 'folloing', 'name', 'for', 'the', 'group', '.', 'united', 'intellectuals', 'front', 'of', 'kayastha', 'ethinicty', 'against', 'racist', 'or', 'castist', 'abuse', '(', 'uifkearca', ')', '``']


### Remove stop words and punctuations

In [18]:
from nltk.corpus import stopwords
from string import punctuation

In [19]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [20]:
stop_final = stop_nltk + stop_punct + ["...", "``","''", "====", "must"]

In [21]:
def del_stop(sent):
    return [term for term in sent if term not in stop_final]

In [22]:
del_stop(comment_tokens[1])

['seems',
 'unbalanced',
 'whatever',
 'said',
 'mathsci',
 'said',
 'far',
 'extreme',
 'unpleasant',
 'things',
 'mention',
 'others',
 'much',
 'greater',
 'frequency',
 'im',
 'happy',
 'reign',
 'thats',
 'youd',
 'like',
 'ruth',
 'told',
 'trying',
 'get',
 'mathsci',
 'pay',
 'attention',
 'stop',
 'uncivil',
 'would',
 'expect',
 'issue',
 'request',
 'mathsci',
 'intentionally',
 'unbalanced',
 'whatever',
 'reason',
 'please',
 'let',
 'know',
 'voluntarily',
 'close',
 'account',
 'move',
 'things',
 'like',
 'wikipedia',
 'lot',
 'contribute',
 'way',
 'point',
 'contributing',
 'project',
 'editors',
 'administrative',
 'leave',
 'aggressively',
 'rude',
 'im',
 'good',
 'editor',
 'dont',
 'really',
 'deserve',
 'people',
 'riding',
 'ass',
 'every',
 'time',
 'try',
 'certain',
 'things',
 'ill',
 'happily',
 'leave',
 'hands',
 'drama-prone',
 'thats',
 'think',
 'best',
 'ludwigs2']

In [23]:
comments_clean = [del_stop(sent) for sent in comment_tokens]

### Checking out the top terms in the data

In [24]:
from collections import Counter

In [25]:
term_list = []
for sent in comments_clean:
    term_list.extend(sent)

In [26]:
res = Counter(term_list)
res.most_common(20)

[('article', 1655),
 ('page', 1495),
 ('wikipedia', 1338),
 ('talk', 1171),
 ('please', 1038),
 ('ass', 986),
 ('would', 964),
 ('fuck', 907),
 ('one', 858),
 ('like', 836),
 ('dont', 780),
 ('also', 657),
 ('think', 630),
 ('see', 630),
 ('know', 595),
 ('im', 562),
 ('edit', 560),
 ('use', 549),
 ('articles', 549),
 ('people', 538)]

Contextual stop words - "article", "page", "wikipedia", "talk", "articles", "pages"

In [27]:
stop_context = ["article", "page", "wikipedia", "talk", "articles", "pages"]

In [28]:
stop_final = stop_final + stop_context

In [29]:
comments_clean = [del_stop(sent) for sent in comment_tokens]

In [30]:
comments_clean = [" ".join(sent) for sent in comments_clean]
comments_clean[:2]

['barnstar defender wiki barnstar like edit kayastha lets form solidarity group malign subject matter propose folloing name group united intellectuals front kayastha ethinicty racist castist abuse uifkearca',
 'seems unbalanced whatever said mathsci said far extreme unpleasant things mention others much greater frequency im happy reign thats youd like ruth told trying get mathsci pay attention stop uncivil would expect issue request mathsci intentionally unbalanced whatever reason please let know voluntarily close account move things like lot contribute way point contributing project editors administrative leave aggressively rude im good editor dont really deserve people riding ass every time try certain things ill happily leave hands drama-prone thats think best ludwigs2']

##### We'll apply this function later on the test set

### Separate X and Y and perform train test split, 70-30

In [31]:
len(comments_clean)

5000

In [32]:
X = comments_clean
y = train0.toxic

Train test split

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

### Document term matrix using TfIdf

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
vectorizer = TfidfVectorizer(max_features = 4000)

In [36]:
len(X_train), len(X_test)

(3500, 1500)

In [37]:
X_train_bow = vectorizer.fit_transform(X_train)

In [38]:
X_test_bow = vectorizer.transform(X_test)

In [39]:
X_train_bow.shape, X_test_bow.shape

((3500, 4000), (1500, 4000))

### Model building

In [40]:
from sklearn import svm

In [41]:
classifier_linear = svm.SVC(kernel='linear')

In [42]:
%%time
classifier_linear.fit(X_train_bow, y_train)

Wall time: 681 ms


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [43]:
y_train_preds = classifier_linear.predict(X_train_bow)

In [44]:
y_train_preds[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [45]:
from sklearn.metrics import classification_report

In [46]:
print(classification_report(y_train, y_train_preds))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      3196
          1       0.99      0.64      0.78       304

avg / total       0.97      0.97      0.97      3500



Adjusting the class weights to improve the recall for the label

In [47]:
classifier_linear = svm.SVC(kernel='linear', class_weight="balanced")

In [48]:
%%time
classifier_linear.fit(X_train_bow, y_train)

Wall time: 881 ms


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
y_train_pred = classifier_linear.predict(X_train_bow)

In [51]:
print(classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

          0       1.00      0.99      0.99      3196
          1       0.89      0.99      0.94       304

avg / total       0.99      0.99      0.99      3500



#### Much better!

### Hyper-parameter tuning

"class_weights" was one of the many hyperparameters to tune for the SVM.  

Let's find the best hyper-parameters for the SVM classifier

In [54]:
?svm.SVC

In [55]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [56]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'C': [0.1, 1, 10,1000, 10000, 100000]
}

In [57]:
classifier_svm = svm.SVC(random_state=42, class_weight="balanced", kernel="linear")

In [58]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = classifier_svm, param_grid = param_grid, 
                          cv = StratifiedKFold(5), n_jobs = -1, verbose = 1, scoring = "recall" )

In [59]:
grid_search.fit(X_train_bow, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   31.0s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.1, 1, 10, 1000, 10000, 100000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=1)

In [60]:
grid_search.grid_scores_



[mean: 0.51973, std: 0.01225, params: {'C': 0.1},
 mean: 0.53619, std: 0.02819, params: {'C': 1},
 mean: 0.51324, std: 0.03583, params: {'C': 10},
 mean: 0.62513, std: 0.06265, params: {'C': 1000},
 mean: 0.64480, std: 0.02953, params: {'C': 10000},
 mean: 0.64480, std: 0.02953, params: {'C': 100000}]

In [61]:
grid_search.best_estimator_

SVC(C=10000, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

### Using the best estimator to make predictions on the test set

In [62]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [63]:
print(classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

          0       0.96      0.87      0.91      1367
          1       0.33      0.68      0.44       133

avg / total       0.91      0.85      0.87      1500



### Most prominent terms in the toxic comments

In [64]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [65]:
toxic_comments = pd.Series(X_test)[y_test_pred == 1].values

In [66]:
term_list = []
for comment in toxic_comments:
    term_list.extend(word_tokenize(comment))

In [67]:
cts = Counter(term_list)

In [68]:
cts.most_common(15)

[('nigger', 184),
 ('die', 157),
 ('jim', 157),
 ('wales', 156),
 ('cuntbag', 126),
 ('fucking', 97),
 ('hate', 84),
 ('jews', 80),
 ('niggers', 80),
 ('spics', 79),
 ('minorities', 79),
 ('dont', 34),
 ('people', 21),
 ('like', 19),
 ('go', 19)]