In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import regex as re
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')


from gensim.models import Word2Vec


import warnings 
warnings.filterwarnings(action = 'ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gravi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [3]:
reddit = pd.read_csv('reddit_comments.csv', index_col=0)

reddit.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [4]:
reddit.shape

(1010714, 2)

In [5]:
# custom tokenizer function

def tokenizer(series):
    # get values from series
    corpus = series.values
    tokens = [] # empty token list
    
    for document in corpus:
        #removing punctuation
        for punc in string.punctuation:
            document = document.replace(punc, '')
        # removing numbers and setting all documents to lowercase    
        document = re.sub("\d+", "", document).lower()
        # splitting documents and appending tokens list
        tokens.append(document.split(' '))
        
    
        

    return tokens

In [6]:
# setting X and y
X = reddit['comment']
y = reddit['label']

In [7]:
# tokenizing our comments
tokenized_data = tokenizer(X)

In [8]:
# reinstantiating more precise word2vec 
model = Word2Vec(tokenized_data, window=5, min_count=25, workers=4, size=300, max_vocab_size=10000 )
model.train(tokenized_data, total_examples=model.corpus_count, epochs=model.epochs)

(32246745, 52702745)

In [9]:
def sentence(document):
    size = model.vector_size
    word_vec_document = np.zeros(size)
    count = 1
    
    for word in document:
        if word in model:
            count +=1
            word_vec_document += model[word]
    
    word_vec_document = word_vec_document / count
    
    return word_vec_document

In [10]:
# setting vectors to variable
document_vectors = [sentence(doc) for doc in tokenized_data]

In [16]:
reddit_samples = pd.read_csv('sample_comments.csv', index_col=0)

In [18]:
X_samp = reddit_samples['comment']
y_samp = reddit_samples['label']

In [25]:
sample_tokens = tokenizer(X_samp)
sample_vectors = [sentence(doc) for doc in sample_tokens]

In [29]:
X_samp = sample_vectors

In [30]:
from sklearn.model_selection import train_test_split
# initial split into rem and test
X_rem, X_test, y_rem, y_test = train_test_split(X_samp, y_samp, test_size=0.25, random_state=42)

In [31]:
# secondary split into train and val
X_train, X_val, y_train, y_val = train_test_split(X_rem, y_rem, test_size=0.25, random_state=42)

In [21]:
from sklearn.neural_network import MLPClassifier

In [22]:
neural_network = MLPClassifier(verbose=True, max_iter=100, random_state=42)

In [23]:
neural_network.fit(X, y)

Iteration 1, loss = 0.59270900
Iteration 2, loss = 0.57508077
Iteration 3, loss = 0.56878644
Iteration 4, loss = 0.56474797
Iteration 5, loss = 0.56199006
Iteration 6, loss = 0.55965116
Iteration 7, loss = 0.55777380
Iteration 8, loss = 0.55630830
Iteration 9, loss = 0.55489859
Iteration 10, loss = 0.55362951
Iteration 11, loss = 0.55265670
Iteration 12, loss = 0.55159636
Iteration 13, loss = 0.55079818
Iteration 14, loss = 0.54998870
Iteration 15, loss = 0.54924019
Iteration 16, loss = 0.54843314
Iteration 17, loss = 0.54782395
Iteration 18, loss = 0.54719014
Iteration 19, loss = 0.54660270
Iteration 20, loss = 0.54616194
Iteration 21, loss = 0.54571542
Iteration 22, loss = 0.54524178
Iteration 23, loss = 0.54491333
Iteration 24, loss = 0.54450961
Iteration 25, loss = 0.54401926
Iteration 26, loss = 0.54374856
Iteration 27, loss = 0.54343988
Iteration 28, loss = 0.54310917
Iteration 29, loss = 0.54287270
Iteration 30, loss = 0.54262736
Iteration 31, loss = 0.54240404
Iteration 32, los

MLPClassifier(max_iter=100, random_state=42, verbose=True)

In [32]:
print(f'Train: {neural_network.score(X_train, y_train)}')
print(f'Val: {neural_network.score(X_val, y_val)}')

Train: 0.735342747183936
Val: 0.7309510743383004


In [35]:
val_probas = neural_network.predict_proba(X_val)

In [36]:
val_probas[:5]

array([[0.03606707, 0.96393293],
       [0.2862299 , 0.7137701 ],
       [0.11738899, 0.88261101],
       [0.08163301, 0.91836699],
       [0.03943854, 0.96056146]])

In [54]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [38]:
val_probas = val_probas[:,0]

In [39]:
val_probas[:5]

array([0.03606707, 0.2862299 , 0.11738899, 0.08163301, 0.03943854])

In [41]:
val_predict = neural_network.predict(X_val)

In [43]:
val_predict[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [57]:
# acc_score = accuracy_score(y_val, val_predict)
# precision = precision_score(y_val, val_predict)
# recall = recall_score(y_val, val_predict)
# f1 = f1_score(y_val, val_predict)

# print('VALIDATION SUMMARY: \n')
# print(f'Accuracy: {round(acc_score, 2)} \n Precision: {round(precision, 2)} \n Recall: {round(recall,2)}  \n F1: {round(f1, 2)}')

In [56]:
report = classification_report(y_val, val_predict, target_names=['Non-Sarcastic', 'Sarcastic'])
print(report)


               precision    recall  f1-score   support

Non-Sarcastic       0.72      0.76      0.74     23647
    Sarcastic       0.75      0.70      0.72     23731

     accuracy                           0.73     47378
    macro avg       0.73      0.73      0.73     47378
 weighted avg       0.73      0.73      0.73     47378

