In [2]:
import pandas as pd
import math
import re
import json
import nltk
from nltk.corpus import stopwords
dataset_file='Sarcasm_Headlines_Dataset.json'
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 

In [3]:
def load_dataset(file):
    data = []
    with open(file) as f:
        for line in f.readlines():
            j = json.loads(line)
            url, headline, sarcastic = j['article_link'], j['headline'], j['is_sarcastic']
            data.append([url, headline, sarcastic])
    df=pd.DataFrame(data, columns=['article_link', 'headline', 'is_sarcastic'])
    return df

In [4]:
def Select_Features(data):
    features_to_keep=['headline','is_sarcastic']
    cols_to_drop=[]   
    for col in data.columns:
        if col not in features_to_keep:
            cols_to_drop.append(col)
            data=data.drop([col],axis=1)
    return data

In [5]:
def remove_symbols(comment):
    comment = re.sub("[']", '', comment)
    comment = re.sub("[^\w]", ' ', comment)
    return comment

In [6]:
def remove_stop_words(comment):
    splited_comment=comment.split()
    final_comment=""
    for word in splited_comment:
        if word not in stop_words:
            word=lemmitization(word)
            final_comment+=word
            final_comment+=" "
    return final_comment

In [7]:
def Pre_Processing(data):
    for i in range(0,len(data)):
        comment=remove_symbols(data.iloc[i,0])
        comment=remove_stop_words(comment)
        data.iloc[i,0]=comment.lower()
    return data
    
    

In [8]:
def lemmitization(word):
    return lemmatizer.lemmatize(word)

In [9]:
def Build_TF_Vector(d):
    tf_words={}
    checked_list=[]
    final_words=[]
    for i in d['headline'].index:
        line=d['headline'][i].split()
        for word in line:
            if word not in checked_list:
                tf_words[word]=1
                checked_list.append(word)
            else:
                tf_words[word]+=1
    return tf_words

In [10]:
def prior_probabilty(arr_data):
    probabities={}
    probabities[1]=arr_data['is_sarcastic'].value_counts()[1]
    probabities[0]=arr_data['is_sarcastic'].value_counts()[0]
    return probabities

In [11]:
def Remove_Low_Freq(tf_vector):
    final_tf={}
    for word in tf_vector:
        if(tf_vector[word]>=5):
            final_tf[word]=(tf_vector[word])
    return final_tf

In [12]:
def get_unique_words(train,vocab):
    words=[]
    for line in train['headline']:
        line=line.split()
        for word in line:
            if word in vocab and word not in words:
                words.append(word)
            
    return words            

In [13]:
def Words_Frequencies_Class(train,words):
    
    list_w=[]
    total_words_class={}
    total_words_class[0]=0
    total_words_class[1]=0
    count_words={}
    for i in range(0,len(train)):
        row=train.iloc[i]
        lines=row[0].split()
        label=row[1]
        for word in lines:
            if label==1:
                total_words_class[1]+=1
            else:
                total_words_class[0]+=1
            if word in words and word not in list_w:
                list_w.append(word)
                count_words[word]={}
                count_words[word][0]=1
                count_words[word][1]=1
            elif label==1 and word in list_w:
                count_words[word][1]+=1
            elif label==0 and word in list_w:
                count_words[word][0]+=1
                
    return total_words_class,count_words     
    

In [14]:
def Calculate_Probalities(total_count,word_count,total_unique):
    prob={}
    for word in word_count:
        prob[word]={}
        prob[word][0]=((word_count[word][0]+1)/(total_unique+total_count[0]))
        prob[word][1]=((word_count[word][1]+1)/(total_unique+total_count[1]))
    
        
    return prob

# MultiNomial Naive Bayes

Formula = Term_Freq(feature(i)+1)/(Total_Unique_Words)+(Total_Words(each_class))

In [15]:
def NaiveBayes(train,vocab):
    words=get_unique_words(train,vocab)
    total_words=len(words)
    prior_prob=prior_probabilty(train)
    total_count,word_count=Words_Frequencies_Class(train,words)
    prob=Calculate_Probalities(total_count,word_count,total_words)
    
    prob['pr_0']=prior_prob[0]
    prob['pr_1']=prior_prob[1]
    prob['t_words']=total_words
    prob['class0']=total_count[0]
    prob['class1']=total_count[1]
    return prob
        

In [16]:
def test_dataset(trained,test):
    prior_prob=prior_probabilty(train)
    true_predicted=0
    output=[]
    tf=1
    c=0
    true_prob=1
    false_prob=1
    predicted=""
    for i in range(0,len(test)):
        predicted=""
        true_prob=1
        false_prob=1
        row=test.iloc[i]
        line=row[0].split()
        actual=row[1]
        for w in line:
            if w in trained:
                true_prob*=trained[w][1]
                false_prob*=trained[w][0]
            else:
                true_prob*=(1+tf)/((trained['t_words']+trained['class1']))
                false_prob*=(1+tf)/((trained['t_words']+trained['class0']))
        true_prob=(true_prob * trained['pr_1'])
        false_prob=(false_prob * trained['pr_0'])
        if true_prob>false_prob:
            predicted=1
        else:
            predicted=0
        output.append(predicted)
    return output

In [17]:
def Accuracy(actual,predicted):
    true_predicted=0
    for i in range(0,len(predicted)):
        if(actual[i]==predicted[i]):
            true_predicted+=1
    accuracy=(true_predicted/len(test))*100
    return accuracy
    

In [18]:
def check_sentence(trained,sent):
    line=sent.split()
    true_prob=1
    false_prob=1
    tf=1
    for w in line:
        if w in trained:
            true_prob*=trained[w][1]
            false_prob*=trained[w][0]
        else:
            true_prob*=(1+tf)/((trained['t_words']+trained['class1']))
            false_prob*=(1+tf)/((trained['t_words']+trained['class0']))
    true_prob=(true_prob * trained['pr_1'])
    false_prob=(false_prob * trained['pr_0'])
    if(true_prob>false_prob):
        return "Sarcastic"
    else:
        return "Not Sarcastic"
        

In [19]:
data=load_dataset(dataset_file) # Read Data file
data

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab-grown meat,0
7,https://www.huffingtonpost.com/entry/boxed-col...,"this ceo will send your kids to school, if you...",0
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1
9,https://www.huffingtonpost.com/entry/fridays-m...,friday's morning email: inside trump's presser...,0


In [20]:

data=Select_Features(data)

In [21]:
data


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
5,advancing the world's women,0
6,the fascinating case for eating lab-grown meat,0
7,"this ceo will send your kids to school, if you...",0
8,top snake handler leaves sinking huckabee camp...,1
9,friday's morning email: inside trump's presser...,0


In [22]:
data['is_sarcastic'].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [23]:
data=Pre_Processing(data)

In [24]:
data

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sue secret black co...,0
1,roseanne revival catch thorny political mood b...,0
2,mom starting fear son web series closest thing...,1
3,boehner want wife listen come alternative debt...,1
4,j k rowling wish snape happy birthday magical ...,0
5,advancing world woman,0
6,fascinating case eating lab grown meat,0
7,ceo send kid school work company,0
8,top snake handler leaf sinking huckabee campaign,1
9,friday morning email inside trump presser age,0


In [25]:
#Remove Low Freq Words
words_tf=Build_TF_Vector(data)

In [26]:
words_tf=Remove_Low_Freq(words_tf)

In [27]:
words_tf

{'former': 106,
 'store': 64,
 'clerk': 13,
 'sue': 32,
 'secret': 118,
 'black': 276,
 'code': 21,
 'minority': 19,
 'shopper': 9,
 'revival': 11,
 'catch': 33,
 'political': 83,
 'mood': 13,
 'better': 124,
 'worse': 40,
 'mom': 267,
 'starting': 45,
 'fear': 66,
 'son': 147,
 'web': 19,
 'series': 62,
 'closest': 7,
 'thing': 361,
 'grandchild': 5,
 'boehner': 27,
 'want': 305,
 'wife': 100,
 'listen': 24,
 'come': 180,
 'alternative': 15,
 'debt': 31,
 'reduction': 5,
 'idea': 116,
 'j': 49,
 'k': 53,
 'rowling': 7,
 'wish': 76,
 'happy': 64,
 'birthday': 67,
 'magical': 32,
 'way': 392,
 'world': 384,
 'woman': 883,
 'case': 116,
 'eating': 76,
 'lab': 8,
 'grown': 18,
 'meat': 30,
 'ceo': 84,
 'send': 35,
 'kid': 256,
 'school': 302,
 'work': 218,
 'company': 131,
 'top': 141,
 'snake': 13,
 'handler': 7,
 'leaf': 70,
 'sinking': 6,
 'huckabee': 20,
 'campaign': 207,
 'friday': 52,
 'morning': 101,
 'email': 117,
 'inside': 83,
 'trump': 1686,
 'age': 82,
 'airline': 46,
 'passen

In [28]:

from sklearn.model_selection import train_test_split


In [29]:
train, test = train_test_split(data, test_size = 0.3,shuffle=True)
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)


In [30]:
print("Train_Dataset: ",len(train))
print("Test_dataset",len(test))

Train_Dataset:  18696
Test_dataset 8013


In [31]:
train.head()

Unnamed: 0,headline,is_sarcastic
0,ex nfl player lament knowing cte prior footbal...,0
1,bos wax nostalgic sexual harassment suit,1
2,house republican unveil bill repeal obamacare,0
3,3 reason picked wrong doctor,0
4,american expect government official issue marr...,0


In [32]:
test.head()

Unnamed: 0,headline,is_sarcastic
0,doj monitoring investigation fatal police shoo...,0
1,investigation expose ebay user selling fake pu...,1
2,glass new accessory,0
3,middle aged cat cant begin compete adorable ki...,1
4,olympic skier stare icy forbidding slope rest ...,1


In [33]:
train_dataset=NaiveBayes(train,words_tf.keys())

In [34]:
predicted=test_dataset(train_dataset,test)

In [35]:
actual=list(test['is_sarcastic'])
acc=Accuracy(actual,predicted)

In [36]:
print("Accuracy is : ",acc)
print("Rounded-Accurcy",math.ceil(acc))

Accuracy is :  78.85935355048048
Rounded-Accurcy 79


In [37]:
test_output=test.copy()
test_output=test_output.drop(['is_sarcastic'],axis=1)
test_output['is_sarcastic']=predicted

In [43]:
test_output

Unnamed: 0,headline,is_sarcastic
0,doj monitoring investigation fatal police shoo...,0
1,investigation expose ebay user selling fake pu...,0
2,glass new accessory,1
3,middle aged cat cant begin compete adorable ki...,1
4,olympic skier stare icy forbidding slope rest ...,1
5,new u currency expires spent two week,1
6,moving new city solve area man problem,1
7,4 uncommon serious ear infection complication,0
8,stepson absolutely nailing jeopardy category t...,1
9,bush proud u economic woe still depress world ...,1


In [38]:
sent_input=input("Enter The Sentence : ")

Enter The Sentence : Liberals will not buy this book, so their opinion is worthless. Besides most of them cant even read clearly enough to get the joke


In [39]:
sent_input=remove_symbols(sent_input)
sent_input=remove_stop_words(sent_input)

In [40]:
ans=check_sentence(train_dataset,sent_input)

In [41]:
print(ans)

Sarcastic
