# Reddit Data Classification

### Reading and Analyzing the data

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Data Processing

In [143]:
def clean(comment):
    """
    Normalize one raw comment and return it as a single cleaned string.

    The input is coerced with ``str()``, so non-string values (e.g. NaN) are
    accepted.  Processing order:

    1. newlines are replaced by spaces;
    2. dotted-quad IP addresses are removed;
    3. everything from the first "[[" up to the last "]" is removed
       (the pattern is greedy);
    4. every character other than ASCII letters, ``? ! . , '`` and space is
       replaced by a space;
    5. the text is tokenized, tokens found in ``appos`` are replaced by their
       expansion, and each token is lemmatized with part-of-speech "v";
    6. tokens are re-joined with single spaces and apostrophes are dropped.

    Relies on names defined elsewhere in the notebook: ``re``,
    ``TweetTokenizer``, ``appos`` and ``lem``.

    Returns:
        str: the cleaned, space-separated text (not a list of words).
    """
    comment = str(comment)

    # Lower-casing is currently disabled, so "Hi" and "hi" stay distinct:
    # comment = comment.lower()

    # Raw string literals keep the regex escapes from being parsed as
    # (invalid) Python string escapes.
    comment = re.sub(r"\n", " ", comment)
    # remove leaky elements like ip,user
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # removing usernames
    comment = re.sub(r"\[\[.*\]", "", comment)
    # https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52644
    special_character_removal = re.compile(r'[^a-z\?\!\.\,\' ]', re.IGNORECASE)
    comment = special_character_removal.sub(" ", comment)

    # Split the sentence into tokens
    words = TweetTokenizer().tokenize(comment)

    # Apostrophe replacement, e.g. you're --> you are
    # (basic dictionary lookup; `appos` is defined outside this function)
    words = [appos[word] if word in appos else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    # Stop-word removal is currently disabled:
    # words = [w for w in words if not w in eng_stopwords]

    clean_sent = " ".join(words)
    # Drop any apostrophes that survived the expansion step
    clean_sent = clean_sent.replace("'", "")
    return clean_sent

### Model Preparation

In [217]:
def get_model():
    """
    Build and compile the LSTM classifier used for the six-label prediction.

    Reads the notebook globals `maxlen` (length of each input sequence) and
    `max_features` (size of the embedding vocabulary).  The layer stack is:
    128-d embedding -> LSTM(60, returning the full sequence) -> global max
    pooling -> dropout -> Dense(50, relu) -> dropout -> Dense(6, sigmoid).

    Returns the compiled Keras model (binary cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    embed_size = 128

    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size)(token_ids)
    encoded = LSTM(60, return_sequences=True, name='lstm_layer')(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.1)(pooled)
    hidden = Dense(50, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    # One independent sigmoid per label
    label_scores = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=token_ids, outputs=label_scores)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [134]:
# standard scaler from previous training

def add_features(df):
    """
    Add simple length / capitalisation / vocabulary features to `df`.

    `df` must have a `comment_text` column.  The frame is modified in place
    (and also returned); `comment_text` itself is overwritten with its
    `str()` form.  Columns added:

    - total_length:      number of characters
    - capitals:          number of upper-case characters
    - caps_vs_length:    capitals / (total_length + 0.1); the 0.1 avoids a
                         division by zero for empty comments
    - num_words:         number of runs of non-whitespace characters
    - num_unique_words:  number of distinct whitespace-separated tokens
    - words_vs_unique:   num_unique_words / num_words; NaN when a comment has
                         no words (callers in this notebook fill it with 0)

    The ratio is computed column-wise rather than with a row-wise
    `df.apply(axis=1)`, which gives the same values but also works on an
    empty frame.
    """
    df['comment_text'] = df['comment_text'].apply(str)
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df['capitals'] / (df['total_length'] + 0.1)
    # Raw string: \S is a regex escape, not a Python string escape
    df['num_words'] = df['comment_text'].str.count(r'\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    return df

# `train` / `test` are aliases, not copies: the column assignments and
# add_features() below also modify train_data / test_data.
train = train_data
test = test_data

# Replace the raw text with the cleaned corpora (corpus_train / corpus_test
# are produced outside this cell).
train['comment_text'] = corpus_train
test['comment_text'] = corpus_test

train = add_features(train)
test = add_features(test)

# Only the two ratio features are used; NaN ratios (comments without words) become 0.
features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

# NOTE(review): the scaler is fitted on train and test rows stacked together,
# so test-set statistics influence the scaling of the training features.
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)

# Label matrices; label_cols and test_ground_truth are defined outside this cell.
y_train = train[label_cols].values
y_test = test_ground_truth[label_cols]

In [135]:
# tokenizer from previous training
# max_features caps the vocabulary used when converting texts to sequences;
# maxlen is the fixed length every sequence is padded/truncated to.
max_features = 20000
maxlen = 50

# The tokenizer vocabulary is built from the train and test corpora together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(corpus_train) + list(corpus_test))
X_train_sequence = tokenizer.texts_to_sequences(corpus_train)
X_test_sequence = tokenizer.texts_to_sequences(corpus_test)

x_train = sequence.pad_sequences(X_train_sequence, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequence, maxlen=maxlen)
# Size of the full fitted vocabulary (not limited by max_features)
print(len(tokenizer.word_index))

# Same label assignments as at the end of the feature cell above.
y_train = train[label_cols].values
y_test = test_ground_truth[label_cols]

287433


In [15]:
%%time
# Load the FastText Web Crawl vectors
EMBEDDING_FILE_FASTTEXT="data/ml_models/crawl-300d-2M.vec"
EMBEDDING_FILE_TWITTER="data/ml_models/glove.twitter.27B/glove.twitter.27B.200d.txt"


def get_coefs(word, *arr):
    """
    Turn the fields of one embedding-file line into a (word, vector) pair.

    `word` is the first field; the remaining fields are converted to a
    float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build word -> float32 vector lookups for both embedding files.
# The files are opened with context managers so the handles are closed as soon
# as each dictionary has been built (previously they were never closed).
# The two files are split differently on purpose of fidelity with the original
# code: fastText lines use rstrip() + rsplit(' '), GloVe lines strip() + split().
with open(EMBEDDING_FILE_FASTTEXT, encoding='utf-8') as ft_file:
    embeddings_index_ft = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in ft_file)
with open(EMBEDDING_FILE_TWITTER, encoding='utf-8') as tw_file:
    embeddings_index_tw = dict(get_coefs(*o.strip().split()) for o in tw_file)

CPU times: user 2min 52s, sys: 6.09 s, total: 2min 59s
Wall time: 2min 59s


In [16]:
%%time
# Load the same fastText file again, this time as gensim KeyedVectors; the
# spell-checker cell below reads its vocabulary order via `index2word`.
spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)

CPU times: user 7min 50s, sys: 6.4 s, total: 7min 56s
Wall time: 7min 57s


In [17]:
#  https://www.kaggle.com/cpmpml/spell-checker-using-word2vec

# Vocabulary of the spell model in file order; the position of a word is used
# as its rank (0 = first entry).
# NOTE(review): presumably the file is ordered by descending frequency, which
# is what makes the rank a usable popularity proxy - confirm.
# The list gets its own name so it is not confused with the `words()` helper
# defined further down in this cell.
vocab_by_rank = spell_model.index2word

# word -> rank lookup
w_rank = {word: rank for rank, word in enumerate(vocab_by_rank)}

WORDS = w_rank

# Use fast text as vocabulary
def words(text):
    "Lower-case `text` and return all runs of word characters as a list."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def P(word):
    """Score of `word`; higher means more probable.

    The negated rank in WORDS is used as a proxy for probability, so the
    first vocabulary entry (rank 0) gets the highest score, 0.

    A word that is not in WORDS gets -len(WORDS), i.e. a score strictly below
    every known word.  (Returning 0 for unknown words, as before, made them
    tie with the best-ranked word.)
    """
    return -WORDS.get(word, len(WORDS))

def correction(word):
    "Return the candidate correction of `word` with the highest P score."
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    "Possible corrections for `word`: the first non-empty known set among the word itself, its 1-edit and its 2-edit variants; otherwise `[word]`."
    # Each stage is wrapped in a callable so later (more expensive) edit sets
    # are only generated when the earlier ones produced no known word.
    stages = (
        lambda: [word],
        lambda: edits1(word),
        lambda: edits2(word),
    )
    for make_variants in stages:
        found = known(make_variants())
        if found:
            return found
    return [word]

def known(words):
    "Return, as a set, those entries of `words` that are keys of WORDS."
    return {entry for entry in words if entry in WORDS}

def edits1(word):
    "Set of all strings reachable from `word` by one delete, transpose, replace or insert of a lower-case ASCII letter."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertions are possible at every position, including the end
        for ch in alphabet:
            variants.add(head + ch + tail)
        if not tail:
            continue
        # Delete the character right after the cut
        variants.add(head + tail[1:])
        # Replace that character by every letter
        for ch in alphabet:
            variants.add(head + ch + tail[1:])
        # Swap the two characters after the cut, when there are two
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

def edits2(word):
    "Lazily yield every string that is one edit away from a one-edit variant of `word` (duplicates are not removed)."
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

def singlify(word):
    "Collapse every run of identical consecutive characters in `word` to a single character."
    kept = []
    for position, letter in enumerate(word):
        # The first character is always kept; later ones only when they
        # differ from their predecessor.
        if position and letter == word[position - 1]:
            continue
        kept.append(letter)
    return "".join(kept)

# Extend the study to Reddit data

In [137]:
# Comments from the second toxic-comments export; every row in this file is
# labelled toxic (1).
reddit_toxic = pd.read_csv('data/toxic_comments_2nd.csv')
reddit_toxic['toxic'] = 1
reddit_toxic.head()

Unnamed: 0.1,Unnamed: 0,subreddit,subreddit_id,num_reports,report_reasons,created,body,created_time,toxic
0,0,theydidthefuckyou,t5_32jwi,,,1511360000.0,"Fuck him, fuck the FCC, fuck big corporate int...",2017-11-22 22:14:23,1
1,1,theydidthefuckyou,t5_32jwi,,,1511346000.0,Fuck him!,2017-11-22 18:19:27,1
2,2,theydidthefuckyou,t5_32jwi,,,1511352000.0,Fuck this guy and the high horse he rode in on,2017-11-22 20:04:24,1
3,3,theydidthefuckyou,t5_32jwi,,,1511345000.0,"Fuck You\n\n*I am a bot, and this action was p...",2017-11-22 18:05:05,1
4,4,theydidthefuckyou,t5_32jwi,,,1511367000.0,[deleted],2017-11-23 00:08:22,1


In [138]:
# Comments from the general Reddit export; every row in this file is
# labelled non-toxic (0).
reddit_normal = pd.read_csv('data/reddit_data.csv')
reddit_normal['toxic'] = 0
reddit_normal.head()

Unnamed: 0.1,Unnamed: 0,subreddit,subreddit_id,num_reports,report_reasons,created,body,created_time,toxic
0,0,HistoryMemes,t5_2v2cd,,,1553417000.0,*D I D N O T C O N T A I N A S I N G L E D R O...,2019-03-24 16:49:22,0
1,1,HistoryMemes,t5_2v2cd,,,1553415000.0,r/cursedgifs,2019-03-24 16:13:52,0
2,2,HistoryMemes,t5_2v2cd,,,1553417000.0,This is a masterpiece,2019-03-24 16:43:38,0
3,3,HistoryMemes,t5_2v2cd,,,1553422000.0,HEART THE SIZE OF A PEPPERCORN,2019-03-24 18:08:48,0
4,4,HistoryMemes,t5_2v2cd,,,1553415000.0,⣿⣷⡶⠚⠉⢀⣤⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠋⠠⣴⣿⣿⣿⣿⣶⣤⣤⣤ ⠿⠥⢶⡏⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿...,2019-03-24 16:17:31,0


In [113]:
# Stack the toxic (label 1) and normal (label 0) comments into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# halves keep their own labels starting at 0, and the duplicated labels break
# any later operation that aligns on the index.
reddit_data = pd.concat([reddit_toxic, reddit_normal], axis=0, ignore_index=True)
reddit_data.head()

Unnamed: 0.1,Unnamed: 0,subreddit,subreddit_id,num_reports,report_reasons,created,body,created_time,toxic
0,0,theydidthefuckyou,t5_32jwi,,,1511360000.0,"Fuck him, fuck the FCC, fuck big corporate int...",2017-11-22 22:14:23,1
1,1,theydidthefuckyou,t5_32jwi,,,1511346000.0,Fuck him!,2017-11-22 18:19:27,1
2,2,theydidthefuckyou,t5_32jwi,,,1511352000.0,Fuck this guy and the high horse he rode in on,2017-11-22 20:04:24,1
3,3,theydidthefuckyou,t5_32jwi,,,1511345000.0,"Fuck You\n\n*I am a bot, and this action was p...",2017-11-22 18:05:05,1
4,4,theydidthefuckyou,t5_32jwi,,,1511367000.0,[deleted],2017-11-23 00:08:22,1


In [144]:
%%time
import os

# `test` is an alias of reddit_data, not a copy.
test = reddit_data

# Cleaning is time-consuming, so the cleaned comments are cached on disk, one
# per line.  Set RECLEAN = True to force a fresh clean() pass; otherwise the
# cache is read back when it exists, so corpus_test never depends on whatever
# value happens to be left in the kernel.
RECLEAN = False
CLEAN_CACHE = 'data/reddit_comment_test_with_stopword.txt'

if RECLEAN or not os.path.exists(CLEAN_CACHE):
    corpus_test = [clean(body) for body in test['body']]

    with open(CLEAN_CACHE, 'w', encoding='utf-8') as f:
        for comment in corpus_test:
            f.write(comment + '\n')
else:
    with open(CLEAN_CACHE, encoding='utf-8') as f:
        # Strip only the line terminator so empty comments stay empty strings
        corpus_test = [line.rstrip('\n') for line in f]

# A stale cache (written for a different data set) would silently misalign
# text and labels further down.
assert len(corpus_test) == len(test), "cleaned corpus does not match the number of Reddit comments"

CPU times: user 3.54 s, sys: 7.03 ms, total: 3.55 s
Wall time: 3.55 s


In [141]:
# Rebuild the network and restore the trained weights from disk.
# NOTE(review): get_model() as defined in this notebook takes no parameters,
# and the prediction cell feeds the model two inputs ([x_test, test_features]).
# This call presumably targets a different get_model(features) definition that
# is not shown in the notebook - confirm before re-running from a fresh kernel.
model = get_model(features)
model.load_weights("weights.best.hdf5")

In [145]:
# Attach the cleaned text and derive the hand-made features for the Reddit data.
# `test` aliases reddit_data, so these columns are added to reddit_data as well.
test['comment_text'] = corpus_test
test = add_features(test)

# Reuse the scaler fitted earlier (transform only, no re-fit) so the features
# are scaled the same way as during training.
test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_features = ss.transform(test_features)

In [146]:
# For best score (Public: 9869, Private: 9865), change to max_features = 283759, maxlen = 900
max_features = 20000
maxlen = 50

# The tokenizer fitted in the earlier training cell is reused as-is; the
# fitting/training lines are kept commented out for reference.
# tokenizer = text.Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(corpus_train) + list(corpus_test))
# X_train_sequence = tokenizer.texts_to_sequences(corpus_train)
X_test_sequence = tokenizer.texts_to_sequences(corpus_test)

# x_train = sequence.pad_sequences(X_train_sequence, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequence, maxlen=maxlen)
# Size of the full fitted vocabulary (not limited by max_features)
print(len(tokenizer.word_index))

# y_train = train[label_cols].values
# Ground truth for the Reddit data is the single binary `toxic` column.
y_test = test['toxic']

287433


In [None]:
%%time
word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is reserved), so a vocabulary of N words needs
# N + 1 rows.  Without the +1 the fill loop below would index one row past the
# end whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Row layout: [0:300] fastText vector, [300:500] GloVe Twitter vector,
# [500] all-caps flag.
embedding_matrix = np.zeros((nb_words,501))

# Vector of the word "something", used as the fallback embedding for words
# that cannot be resolved in the fill loop.
something_tw = embeddings_index_tw.get("something")
something_ft = embeddings_index_ft.get("something")

something = np.zeros((501,))
something[:300,] = something_ft
something[300:500,] = something_tw
# The fallback vector never carries the all-caps flag
something[500,] = 0

def all_caps(word):
    """Return True when `word` has at least two characters and `str.isupper()` holds for it."""
    if len(word) <= 1:
        return False
    return word.isupper()

def embed_word(embedding_matrix,i,word):
    """
    Write the embedding of `word` into row `i` of `embedding_matrix` in place.

    Nothing is written when `word` has no fastText vector.  Otherwise:
    columns [0:300] receive the fastText vector, column 500 receives 1 if
    all_caps(word) else 0, and columns [300:500] receive the GloVe Twitter
    vector when one exists (they are left untouched when it does not).

    Reads the notebook globals `embeddings_index_ft` and `embeddings_index_tw`.
    Returns None.
    """
    embedding_vector_ft = embeddings_index_ft.get(word)
    if embedding_vector_ft is None:
        return

    embedding_matrix[i,:300] = embedding_vector_ft
    # Plain scalar flag: assigning a one-element array to a single cell relies
    # on an array-to-scalar conversion that NumPy has deprecated.
    embedding_matrix[i,500] = 1.0 if all_caps(word) else 0.0

    embedding_vector_tw = embeddings_index_tw.get(word)
    if embedding_vector_tw is not None:
        embedding_matrix[i,300:500] = embedding_vector_tw

            
# Fill one embedding row per vocabulary word, falling back step by step:
#   1. the word itself has a fastText vector      -> embed_word()
#   2. the word is longer than 20 characters      -> the shared `something` vector
#   3. its spelling correction has a vector       -> embed_word() on the correction
#   4. the correction of singlify(word) has one   -> embed_word() on that correction
#   5. otherwise                                  -> the shared `something` vector
# Fasttext vector is used by itself if there is no glove vector but not the other way around.
for word, i in word_index.items():
    
    # Ids at or above the vocabulary cap have no row in the matrix
    if i >= max_features: continue
        
    if embeddings_index_ft.get(word) is not None:
        embed_word(embedding_matrix,i,word)
    else:
        # change to > 20 for better score. Previously 0
        # Very long unknown tokens skip spell correction altogether
        if len(word) > 20:
            embedding_matrix[i] = something
        else:
            word2 = correction(word)
            if embeddings_index_ft.get(word2) is not None:
                embed_word(embedding_matrix,i,word2)
            else:
                # Second attempt: collapse repeated letters before correcting
                word2 = correction(singlify(word))
                if embeddings_index_ft.get(word2) is not None:
                    embed_word(embedding_matrix,i,word2)
                else:
                    embedding_matrix[i] = something 

In [147]:
# Score the padded sequences together with the scaled hand-made features.
# `batch_size` and `label_cols` are defined outside this cell.
pred_prob = model.predict([x_test,test_features], batch_size=batch_size,verbose=1)

# Threshold every label at 0.5 in one vectorized step (1.0 = predicted
# positive, 0.0 = negative).  Assigning into the pre-sized array keeps the
# float dtype and fails loudly if the prediction shape does not match `test`.
preds = np.zeros((len(test), len(label_cols)))
preds[:] = pred_prob[:, :len(label_cols)] >= 0.5



In [152]:
# Preview the predicted label probabilities of the first ten comments
pred_prob[:10]

array([[9.91953135e-01, 4.45396185e-01, 9.90529060e-01, 7.89734721e-03,
        7.36517072e-01, 2.72302926e-02],
       [9.85708833e-01, 2.15753585e-01, 9.84757602e-01, 4.60192561e-03,
        6.45035028e-01, 1.32909417e-02],
       [9.55524981e-01, 2.00703889e-01, 9.74683940e-01, 2.10027099e-02,
        4.35233355e-01, 1.30805969e-02],
       [9.14638519e-01, 7.01685548e-02, 9.53739524e-01, 1.91512704e-03,
        4.71423686e-01, 7.42796063e-03],
       [2.05942094e-02, 8.92996788e-04, 6.34765625e-03, 2.63875723e-03,
        7.01850653e-03, 3.68714333e-04],
       [5.30745685e-02, 4.95135784e-04, 1.37698352e-02, 1.66311860e-03,
        5.38703799e-03, 7.26968050e-04],
       [1.56222194e-01, 2.29719281e-03, 3.19269121e-01, 6.45220280e-05,
        2.19537616e-02, 3.96072865e-04],
       [9.98004794e-01, 7.18249917e-01, 9.94820118e-01, 4.38839197e-03,
        8.97747636e-01, 2.72137821e-02],
       [9.94788170e-01, 3.59553039e-01, 9.91478264e-01, 5.12626767e-03,
        8.30015719e-01, 

In [124]:
# Preview the first ten cleaned comments, to read next to the probabilities above
corpus_test[:10]

['Fuck him , fuck the FCC , fuck big corporate interest , fuck Trump for appoint him , and ya know what ? Fuck you , too , for good measure . And fuck me , too . Please ...',
 'Fuck him !',
 'Fuck this guy and the high horse he ride in on',
 'Fuck You I be a bot , and this action be perform automatically . Please contact the moderators of this subreddit message compose ? to r theydidthefuckyou if you have any question or concern .',
 'delete',
 'If that is a not I have live a life of privilege , but I had still like to screw over billions of ordinary folks for my corporate buddies kinda smile , then my butts version of fart the national anthem be head to the top of the country chart .',
 'Congratulations ! Your post reach top five in r all rise . The post be thus x post r Masub comment eq ak fuck this guy rtheydidthefuckyou to r masub . It have point in minutes when the x post be make .',
 'Fuck this fuck cunt and his fuckity fuck agenda . What a fuck fuck , god damn it fuck .',
 'Yeah

In [150]:
# Compare the Reddit `toxic` ground truth with the thresholded predictions of
# the first label column (preds[:, 0]).
# NOTE(review): this assumes label_cols[0] is the "toxic" label - confirm.
print(classification_report(test['toxic'].values, preds[:,0]))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94     22542
           1       0.28      0.81      0.42      1132

   micro avg       0.89      0.89      0.89     23674
   macro avg       0.63      0.85      0.68     23674
weighted avg       0.96      0.89      0.92     23674



In [205]:
# Export the Reddit frame to CSV.
# NOTE(review): the execution counts show this cell was run after the cells
# below that attach the prediction columns; when the notebook is run top to
# bottom the file is written before those columns exist - confirm the intended order.
test.to_csv("reddit_pred.csv")

In [203]:
# Attach the thresholded 0/1 predictions, one column per label.
# The right-hand frame is built on test.index so rows are matched by position;
# with a default 0..n-1 index pandas would align by label instead and
# mis-assign (or fail) whenever test's index is not exactly 0..n-1.
test[label_cols] = pd.DataFrame(preds, index=test.index)

In [187]:
# Replace the index with a fresh 0..n-1 range; the old index is kept as a new
# column.  Not idempotent: every re-run of this cell adds another index column.
test = test.reset_index()

In [189]:
# Export the false negatives: comments labelled toxic (1) that the model
# predicted as non-toxic on the first label column.
# Both conditions are combined into one positional boolean mask, so the
# selection does not depend on the index labels of `test` (the previous
# .iloc[...][mask] chain filtered a subset with a full-length boolean Series
# and relied on pandas re-aligning it by label).
# NOTE(review): the toxic_prob / toxic_pred / len_comment columns are not
# created in any cell shown in this notebook - confirm where they come from.
false_negative_mask = (preds[:, 0] == 0) & (test['toxic'].values == 1)
false_negative_cols = ['comment_text', 'toxic', 'toxic_prob', 'toxic_pred', 'len_comment']
test.loc[false_negative_mask, false_negative_cols].to_csv('reddit_toxic_false_negative.csv')

# .loc[test['toxic']==1].head()

# [test['toxic']==1].to_csv(
#     'reddit_toxic'+'_false_negative.csv')