# $Question_2$

### Include Necessary libraries

In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk import WordNetLemmatizer, word_tokenize
import string

### Loading Our sarcasm in Python

In [50]:
path = r'sarcasm.json'
df = pd.read_json(path, lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [51]:
df.shape

(28619, 3)

In [52]:
print(df[df['is_sarcastic'] == 1].shape, df[df['is_sarcastic'] == 0].shape)

(13634, 3) (14985, 3)


## Preprocessing 

In [53]:
class preprocess:
    def __init__(self, lematizer):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = lematizer
        self.text = None
    
    def assign_text(self, text):
        self.text = text    

    def normalize_tokenize(self):
        self.text = self._lower_case()
        self.text = self._remove_URLs()
        self.text = self._remove_emalis()
        self.text = self._remove_punctuations()
        self.text = self._remove_non_ascii_chars()
        self.text = self._remove_numbers()
        tokens = self._tokenizing()
        self.text = self._remove_stopwords(tokens)
        tokens = self._lemmatize_tokens(tokens)
        return tokens

    def _lower_case(self):
        return self.text.lower()
    
    def _remove_URLs(self):
        return re.sub(r'http\S+|www\S+|https\S+', '', self.text, flags=re.MULTILINE)

    def _remove_punctuations(self):
        return self.text.translate(str.maketrans('', '', string.punctuation))

    def _remove_numbers(self):
        return re.sub(r'\b\d+(?:,\d+)*(?:\.\d+)?\b', '', self.text)

    def _remove_non_ascii_chars(self):
        return self.text.encode('ascii', 'ignore').decode('ascii')

    def _tokenizing(self):
        return word_tokenize(self.text)
    
    def _lemmatize_tokens(self, tokens):
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def _remove_emalis(self):
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        return re.sub(email_pattern, '', self.text)
    
    def _remove_stopwords(self, tokens):
        return [word for word in tokens if word not in self.stop_words]

In [54]:
preprocess_cls = preprocess(WordNetLemmatizer())

In [55]:
def preprocess_headline(headline):
    preprocess_cls.assign_text(headline)
    return preprocess_cls.normalize_tokenize()

df['preprocess_headling'] = df['headline'].apply(preprocess_headline)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link,preprocess_headling
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,"[thirtysomething, scientist, unveil, doomsday,..."
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,"[dem, rep, totally, nail, why, congress, is, f..."
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,"[eat, your, veggie, deliciously, different, re..."
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,"[inclement, weather, prevents, liar, from, get..."
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,"[mother, come, pretty, close, to, using, word,..."


### For getting more balanced after splitting

In [56]:
not_sarcastic_df = df[df['is_sarcastic'] == 0]
is_sarcastic_df = df[df['is_sarcastic'] == 1]

In [57]:
X_pos_train, X_pos_test, y_pos_train, y_pos_test = train_test_split(is_sarcastic_df['preprocess_headling'], is_sarcastic_df['is_sarcastic'], test_size=0.2, random_state=2)
X_neg_train, X_neg_test, y_neg_train, y_neg_test = train_test_split(not_sarcastic_df['preprocess_headling'], not_sarcastic_df['is_sarcastic'], test_size=0.2, random_state=2)

In [58]:
X_train = pd.concat([X_pos_train, X_neg_train])
X_test = pd.concat([X_pos_test, X_neg_test])
y_train = pd.concat([y_pos_train, y_neg_train])
y_test = pd.concat([y_pos_test, y_neg_test])

In [59]:
print(y_test[y_test == 0].shape, y_test[y_test == 1].shape)

(2997,) (2727,)


### Loading GLOVE

In [60]:
path_glove_300 = r'Glove/glove.6B.300d.txt'

In [61]:
def load_glove_vectors(path):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as my_file:
        for line in my_file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

In [62]:
embeddings = load_glove_vectors(path_glove_300)

In [63]:
first_key = list(embeddings.keys())[0]
print(first_key)

the


### Create embedding for each headline with centroid words embedding in that headline

In [64]:
def create_glove_matrix(data, embeddings):
    first_key = list(embeddings.keys())[0]
    my_glove_matrix = np.zeros(shape=(len(data), len(embeddings[first_key])))
    real_index = 0
    for _,tokens in data.items():
        counter = 0
        for token in tokens:
            if token in embeddings:
                my_glove_matrix[real_index, :] += embeddings[token]
                counter += 1
        if counter > 0:
            my_glove_matrix[real_index, :] = my_glove_matrix[real_index, :]/counter
        real_index += 1
    return my_glove_matrix

In [65]:
train_glove_matrix = create_glove_matrix(X_train, embeddings)
train_glove_matrix.shape

(22895, 300)

In [66]:
test_glove_matrix = create_glove_matrix(X_test, embeddings)
test_glove_matrix.shape

(5724, 300)

### Train model

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

In [68]:
model = LogisticRegression(max_iter=10000)
model.fit(train_glove_matrix, y_train)

In [69]:
predicitons = model.predict(test_glove_matrix)

In [70]:
print(f'The recall is: {recall_score(y_test, predicitons)}')
print(f'The percision is: {precision_score(y_test, predicitons)}')
print(f'The f1 score: {f1_score(y_test, predicitons)}')
print(f'The accuracy is: {accuracy_score(y_test, predicitons)}')

The recall is: 0.7282728272827282
The percision is: 0.7517032551097653
The f1 score: 0.7398025703110449
The accuracy is: 0.7559399021663172
