In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing basic dependencies and libraries.

- Numpy as we will work with vectors and arrays
- Pandas to build the dataframes
- Matplotlib for some visualizations
- Metrics for evaluation

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, classification_report

# Loading and viewing dataset

__we have 2 datasets: train and test (.csv)__

Since this will be a supervised machine learning workflow, the train dataset must have the labels:
   - 1 = is sarcastic
   - 0 = not sarcastic

In [4]:
# Read the labelled training CSV into a DataFrame.
train_csv_path = '../input/headlines-dataset/Assignment_Train_Dataset.csv'
df = pd.read_csv(train_csv_path)

# Report (rows, columns); the author's note records 44262 rows and 2 columns.
print(df.shape)

# Preview the first rows as the cell output.
df.head()

# Observing dataframe

We have a balance of 54% to 46% between "is_sarcastic" and "not sarcastic" (represented by 1 and 0, respectively)

In [5]:
# Count how many rows carry each value of the 'is_sarcastic' label column.
label_column = df['is_sarcastic']
label_column.value_counts()

In [6]:
print("Visualizing ratio is sacarstic vs not sarcastic:\n")

# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
count_Class = df['is_sarcastic'].value_counts(sort=True)

# value_counts orders rows by frequency, so a positional label list would be attached to
# the wrong wedge whenever class 1 outnumbers class 0. Naming each wedge from its own
# class value (1 = sarcastic, 0 = not sarcastic) keeps the labels right for any balance.
pie_counts = count_Class.rename(index={0: 'not_sarc', 1: 'sarc'})

# The percentage format no longer has the stray trailing comma that was drawn on each wedge.
pie_counts.plot(kind='pie', autopct='%1.0f%%')


# Splitting data into train and test datasets.

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
features = df[['headline']]
target = df['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)

# Show the shape of each feature split as the cell output.
X_train.shape, X_test.shape

In [8]:
# Checking for balance in label train/test sets.
from collections import Counter
# Tally the labels of each split separately, then show both tallies side by side.
train_label_counts = Counter(y_train)
test_label_counts = Counter(y_test)
train_label_counts, test_label_counts

In [9]:
#Some numberization in order to apply logistic regression

import string

def _add_text_features(frame):
    """Return a copy of ``frame`` with numeric descriptors of its 'headline' column.

    Added columns:
      - char_count: number of characters in the headline
      - word_count: number of whitespace-separated words
      - word_density: char_count / (word_count + 1)
      - punctuation_count: characters that belong to ``string.punctuation``
      - title_word_count: words for which ``str.istitle()`` is true
      - upper_case_word_count: words for which ``str.isupper()`` is true

    Working on a copy leaves the caller's frame untouched and avoids the chained-assignment
    warning pandas can raise when columns are written onto a frame derived from another one.
    """
    enriched = frame.copy()
    headlines = enriched['headline']

    enriched['char_count'] = headlines.apply(len)
    enriched['word_count'] = headlines.apply(lambda text: len(text.split()))
    # The +1 in the denominator guards against division by zero for empty headlines.
    enriched['word_density'] = enriched['char_count'] / (enriched['word_count'] + 1)
    enriched['punctuation_count'] = headlines.apply(
        lambda text: sum(1 for char in text if char in string.punctuation)
    )
    enriched['title_word_count'] = headlines.apply(
        lambda text: sum(1 for word in text.split() if word.istitle())
    )
    enriched['upper_case_word_count'] = headlines.apply(
        lambda text: sum(1 for word in text.split() if word.isupper())
    )
    return enriched


# Apply the same feature recipe to both splits so they cannot drift apart.
X_train = _add_text_features(X_train)
X_test = _add_text_features(X_test)

In [10]:
# Preview the first five rows of the training features.
X_train.iloc[:5]

In [11]:
# Remove the three count features from both splits (per the author's note they contain only zeros).
unused_features = ['punctuation_count', 'title_word_count', 'upper_case_word_count']
X_train = X_train.drop(columns=unused_features)
X_test = X_test.drop(columns=unused_features)

In [12]:
# importing and loading logistic regression model
from sklearn.linear_model import LogisticRegression

# Logistic regression configured with C=1, the liblinear solver and a fixed random_state.
lr_settings = {'C': 1, 'random_state': 42, 'solver': 'liblinear'}
lr = LogisticRegression(**lr_settings)

In [13]:
# Train and predict on the numeric columns only: the raw 'headline' text is removed first.
train_numeric = X_train.drop(columns=['headline'])
test_numeric = X_test.drop(columns=['headline'])
lr.fit(train_numeric, y_train)
predictions = lr.predict(test_numeric)

# Results with logistic regression

- Precision for the "is sarcastic" class is 57%. Precision means that, of all the headlines our model identified as sarcastic, 57% really were sarcastic.
- Recall (true positive rate) is 37%. This means that, of all the headlines that really were sarcastic, our
    model correctly identified 37% of them.
    
## with these results we see we can improve our model.

In [14]:
# printing results
import seaborn as sns

# Text summary of the per-class metrics.
print(classification_report(y_test, predictions))

# Confusion matrix of the hold-out predictions, annotated cell by cell with a name and a count.
cm = confusion_matrix(y_test, predictions)
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = [format(count, "0.0f") for count in cm.ravel()]

labels = np.array(
    [name + "\n" + count for name, count in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cm, annot=labels, fmt='')

# Improving the model using BOW and ensemble algorithms

- This approach aims to use bag of words technique using Count Vectorizer from scikit and applying ensemble algorithms to improve precision, recall and F1-Score of our model.


In [15]:
# some cleaning on text

import nltk
import re

# Take 'no', 'not' and 'but' out of the English stop-word list so that the cleaning step
# keeps them (the aim is to capture negation in n-grams where possible).
kept_words = ('no', 'not', 'but')
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in kept_words:
    stop_words.remove(kept_word)


def simple_text_preprocessor(document, stopwords=None):
    """Normalize one headline: lower-case it, keep letters only and drop stop words.

    Parameters
    ----------
    document : object
        Text to clean; it is converted with ``str()`` first.
    stopwords : iterable of str, optional
        Words to remove from the result. When omitted, the module-level
        ``stop_words`` collection is used.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' when nothing is left).
    """
    if stopwords is None:
        stopwords = stop_words
    # A set gives constant-time membership tests instead of scanning a list per word.
    excluded = frozenset(stopwords)

    # lower case
    document = str(document).lower()

    # remove unnecessary characters: every non-letter becomes a space, the residue of
    # '&nbsp;' entities is deleted, and runs of spaces are collapsed
    document = re.sub(r'[^a-zA-Z]', r' ', document)
    document = re.sub(r'nbsp', r'', document)
    document = re.sub(' +', ' ', document)

    # stopwords removal
    return ' '.join(word for word in document.split() if word not in excluded)

# Element-wise version of the cleaner for whole arrays of headlines.
# otypes=[object] fixes the output type up front: numpy then skips the extra probing call
# on the first element, returns Python strings instead of a fixed-width unicode array,
# and accepts empty input instead of raising.
stp = np.vectorize(simple_text_preprocessor, otypes=[object])

In [16]:
# Store the cleaned text next to the raw headline, in both splits.
for split_frame in (X_train, X_test):
    split_frame['clean headline'] = stp(split_frame['headline'].values)

In [17]:
# Display the cleaned training headlines.
X_train.loc[:, 'clean headline']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words; the min_df / max_df bounds keep every term.
cv = CountVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1, 1))

# The vectorizer returns a scipy sparse matrix. Converting it with .toarray() materializes
# a dense (rows x vocabulary) table that is almost entirely zeros and can take several GB.
# Sparse-backed DataFrames keep the same column names and the same DataFrame interface
# while storing only the non-zero counts.
train_counts = cv.fit_transform(X_train['clean headline'])
vocabulary = cv.get_feature_names_out()
X_traincv = pd.DataFrame.sparse.from_spmatrix(train_counts, columns=vocabulary)

# The test split is encoded with the vocabulary learned from the training split only.
test_counts = cv.transform(X_test['clean headline'])
X_testcv = pd.DataFrame.sparse.from_spmatrix(test_counts, columns=vocabulary)
X_traincv.head()

In [19]:
# Refit the logistic regression on the bag-of-words features and predict the hold-out split.
lr.fit(X_traincv, y_train)
predictions = lr.predict(X_testcv)

print(classification_report(y_test, predictions))

# Confusion matrix of THIS model; each cell is annotated with its name and count.
cm_lr = confusion_matrix(y_test, predictions)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cm_lr.flatten()]

labels = [f"{v1}\n{v2}" for v1, v2, in zip(group_names,group_counts)]
labels = np.asarray(labels).reshape(2,2)
# Plot cm_lr (not the matrix of the earlier numeric-feature model) so that the colours
# agree with the annotated counts.
sns.heatmap(cm_lr, annot=labels, fmt='')

# Using Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Build the multinomial Naive Bayes classifier and fit it on the bag-of-words training features.
clf_MultiNB = MultinomialNB()
fitted_nb = clf_MultiNB.fit(X_traincv, y_train)

# Show the value returned by fit() as the cell output.
fitted_nb

In [21]:
# Predict the hold-out split with the fitted Naive Bayes classifier.
MultiNB_predictions = clf_MultiNB.predict(X=X_testcv)

In [22]:
# Evaluate the Naive Bayes classifier on the hold-out split with its score() method and print the value.
MultiNB_score = clf_MultiNB.score(X=X_testcv, y=y_test)
print(MultiNB_score)

In [23]:
print(classification_report(y_test, MultiNB_predictions))

# Confusion matrix of the Naive Bayes predictions; each cell is annotated with its name and count.
cm_NB = confusion_matrix(y_test, MultiNB_predictions)

group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cm_NB.flatten()]

labels = [f"{v1}\n{v2}" for v1, v2, in zip(group_names,group_counts)]
labels = np.asarray(labels).reshape(2,2)
# Plot cm_NB (not the matrix of an earlier model) so that the colours agree with the
# annotated counts.
sns.heatmap(cm_NB, annot=labels, fmt='')

# Predicting on the test dataset with the logistic regression algorithm

    Between logistic regression and Naive Bayes, we found that logistic regression (lr) performed a bit better.

In [24]:
# Read the test CSV that will be scored by the model.
test_csv_path = '../input/headlines-dataset/Assignment_Test_Dataset.csv'
df_test = pd.read_csv(test_csv_path)

# Report (rows, columns), then preview the first rows as the cell output.
print(df_test.shape)
df_test.head()

In [25]:
# Run the same text cleaner over the test headlines and keep the result in a new column.
raw_test_headlines = df_test['headline'].values
df_test['clean headline'] = stp(raw_test_headlines)
df_test.head()

In [26]:
# (rows, columns) of the test frame after the cleaned column was added.
(len(df_test.index), len(df_test.columns))

In [27]:

# Encode the cleaned test headlines with the already-fitted vectorizer and wrap the
# counts in a DataFrame whose columns are the vectorizer's feature names.
final_test_counts = cv.transform(df_test['clean headline'])
XX_testcv = pd.DataFrame(final_test_counts.toarray(), columns=cv.get_feature_names_out())

XX_testcv

In [28]:
# Predict a label for every test headline with the fitted logistic regression.
labels = lr.predict(X=XX_testcv)


In [68]:
# Display the predicted labels.
predicted_labels = labels
predicted_labels

In [76]:
# Save the predictions as a one-column CSV ('prediction' header, no index column).
predictions_frame = pd.DataFrame({'prediction': labels})
predictions_frame.to_csv("/kaggle/working/predictions.csv", index=False)


In [77]:
# Read the saved predictions file back in.
saved_predictions_path = '/kaggle/working/predictions.csv'
df_predicoes = pd.read_csv(saved_predictions_path)

In [78]:
# Preview the first five saved predictions.
df_predicoes.iloc[:5]