In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

import numpy as np
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
train_data = pd.read_csv("Train_Data.csv")
train_data.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [3]:
train_data.isnull().sum()

headline        0
is_sarcastic    0
dtype: int64

In [4]:
train_data['is_sarcastic'].value_counts()

0    23958
1    20304
Name: is_sarcastic, dtype: int64

In [5]:
test_data = pd.read_csv("Test_Data.csv")
test_data.head()

Unnamed: 0,headline
0,area stand-up comedian questions the deal with...
1,dozens of glowing exit signs mercilessly taunt...
2,perfect response to heckler somewhere in prop ...
3,gop prays for ossoff lossoff
4,trevor noah says the scary truth about trump's...


In [6]:
#Removing all the rows with no headline
train_data = train_data[train_data['headline'] != '']
test_data = test_data[test_data['headline'] != '']

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44262 entries, 0 to 44261
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      44262 non-null  object
 1   is_sarcastic  44262 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [8]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11066 entries, 0 to 11065
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  11066 non-null  object
dtypes: object(1)
memory usage: 172.9+ KB


In [9]:
train_data_X = train_data[train_data.columns[train_data.columns != 'is_sarcastic']]
train_data_Y = train_data['is_sarcastic']

In [10]:
train_data_X.head()

Unnamed: 0,headline
0,supreme court votes 7-2 to legalize all worldl...
1,hungover man horrified to learn he made dozens...
2,emily's list founder: women are the 'problem s...
3,send your kids back to school with confidence
4,watch: experts talk pesticides and health


In [11]:
#Basic NLP Count Based Features
import string

#For training data
train_data_X['char_count'] = train_data_X['headline'].apply(len)
train_data_X['word_count'] = train_data_X['headline'].apply(lambda x: len(x.split()))
train_data_X['word_density'] = train_data_X['char_count'] / (train_data_X['word_count']+1)
train_data_X['punctuation_count'] = train_data_X['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
train_data_X['title_word_count'] = train_data_X['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
train_data_X['upper_case_word_count'] = train_data_X['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

#For Test data
test_data['char_count'] = test_data['headline'].apply(len)
test_data['word_count'] = test_data['headline'].apply(lambda x: len(x.split()))
test_data['word_density'] = test_data['char_count'] / (test_data['word_count']+1)
test_data['punctuation_count'] = test_data['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
test_data['title_word_count'] = test_data['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
test_data['upper_case_word_count'] = test_data['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [12]:
train_data_X.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
0,supreme court votes 7-2 to legalize all worldl...,53,9,5.3,1,0,0
1,hungover man horrified to learn he made dozens...,66,12,5.076923,0,0,0
2,emily's list founder: women are the 'problem s...,65,10,5.909091,4,0,0
3,send your kids back to school with confidence,45,8,5.0,0,0,0
4,watch: experts talk pesticides and health,41,6,5.857143,1,0,0


In [13]:
#Adding Sentiment Analysis

import textblob

train_data_snt = train_data_X['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
train_data_X['Polarity'] = [obj.polarity for obj in train_data_snt.values]
train_data_X['Subjectivity'] = [obj.subjectivity for obj in train_data_snt.values]

test_data_snt = test_data['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
test_data['Polarity'] = [obj.polarity for obj in test_data_snt.values]
test_data['Subjectivity'] = [obj.subjectivity for obj in test_data_snt.values]


In [14]:
train_data_X.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,supreme court votes 7-2 to legalize all worldl...,53,9,5.3,1,0,0,0.0,0.0
1,hungover man horrified to learn he made dozens...,66,12,5.076923,0,0,0,0.0,0.066667
2,emily's list founder: women are the 'problem s...,65,10,5.909091,4,0,0,0.0,0.0
3,send your kids back to school with confidence,45,8,5.0,0,0,0,0.0,0.0
4,watch: experts talk pesticides and health,41,6,5.857143,1,0,0,0.0,0.0


In [15]:
test_data.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,area stand-up comedian questions the deal with...,65,9,6.5,2,0,0,0.0,0.0
1,dozens of glowing exit signs mercilessly taunt...,65,9,6.5,0,0,0,-0.7,1.0
2,perfect response to heckler somewhere in prop ...,62,9,6.2,1,0,0,1.0,1.0
3,gop prays for ossoff lossoff,28,5,4.666667,0,0,0,0.0,0.0
4,trevor noah says the scary truth about trump's...,65,11,5.416667,1,0,0,0.0,0.8


In [16]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1, random_state=42, solver='liblinear')
lr.fit(train_data_X.drop(['headline'], axis=1), train_data_Y)

prediction = lr.predict(test_data.drop(['headline'], axis=1))
prediction_df = pd.DataFrame(prediction, columns = ['prediction'])
prediction_df

Unnamed: 0,prediction
0,0
1,1
2,1
3,0
4,1
...,...
11061,1
11062,0
11063,1
11064,0


In [17]:
prediction_df.to_csv('prediction.csv', index = False)