In [65]:
import pandas as pd

raw_df = pd.read_json('../input/Sarcasm_Headlines_Dataset.json', lines=True)
raw_df.head(2)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0


In [66]:
from nltk.corpus import stopwords
import string
import re

cleaned_df = raw_df

cleaned_df.pop('article_link')
cleaned_df.dropna()

stop = stopwords.words('english') + list(string.punctuation)
cleaned_df['headline'] = cleaned_df['headline'].apply(lambda s: ' '.join([re.sub(r'\W+', '', word.lower()) for word in s.split(' ') if word not in stop]))

cleaned_df.head(2)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues secret black c...,0
1,roseanne revival catches thorny political mood...,0


In [67]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
import sklearn.metrics

train, test = train_test_split(cleaned_df, test_size=0.2)

reg_text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('reg', SGDClassifier(learning_rate='optimal'))
])

reg_text_clf.fit(train['headline'], train['is_sarcastic'])



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [68]:
reg_predicted = reg_text_clf.predict(test['headline'])
sklearn.metrics.f1_score(test['is_sarcastic'], reg_predicted, average='micro')

0.7894047173343317

In [69]:
sentence_to_predict = ['you are very nice thank you so much !','you are very nice dumb ass !']
text_to_predict = reg_text_clf.predict(sentence_to_predict)

for i in range(0, len(text_to_predict)):
    print("%s -> %s" % (sentence_to_predict[i], ('it looks fair', 'sounds like a troll !') [text_to_predict[i]]))


you are very nice thank you so much ! -> it looks fair
you are very nice dumb ass ! -> sounds like a troll !
