In [1]:
%matplotlib inline

# Import libraries
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout

from sklearn.metrics import f1_score, confusion_matrix

Using TensorFlow backend.


In [2]:
# Set Path
path = os.path.abspath('..')

## Read data

In [3]:
df = pd.read_csv(os.path.join(path, 'data', 'raw', 'task1.train.txt'), delimiter='\t', names=['article', 'id', 'label'])

In [4]:
df

Unnamed: 0,article,id,label
0,"Et tu, Rhody? A recent editorial in the Provi...",727600136,non-propaganda
1,A recent post in The Farmington Mirror — our t...,731714618,non-propaganda
2,"President Donald Trump, as he often does while...",731714635,non-propaganda
3,"February is Black History Month, and nothing l...",728627182,non-propaganda
4,"The snow was so heavy, whipped up by gusting w...",728627443,non-propaganda
5,Four months after the Sandy Hook School shooti...,732126660,non-propaganda
6,The first major newspaper article about Donald...,728144791,non-propaganda
7,"For three years, starting in 2008, New York ar...",728605281,non-propaganda
8,President Donald Trump's tumultuous administra...,731383701,non-propaganda
9,With Hartford on edge about the future of Aetn...,734075146,non-propaganda


In [5]:
df.shape

(35986, 3)

In [6]:
df['article'][0]

'Et tu, Rhody?  A recent editorial in the Providence Journal cataloged everything it could find wrong with Connecticut and ended with this suggestion: “Gov. Gina Raimondo should see if at least some of those jobs could come to Rhode Island. It is certainly less risky than the Nutmeg State.”  We beg your pardon.  The state with world-famous pension problems and persistent economic issues of its own is “less risky”?  The Journal itself reported just a few weeks ago on Rhode Island’s own significant economic problems, which in many ways reflect Connecticut’s.  Rhode Island enjoys a legacy of corruption that not even Connecticut can match. The ProJo won a Pulitzer Prize in 1994 for uncovering widespread corruption within its own court system.  What, exactly, is to be gained from moving to Rhode Island?  Like Connecticut, Rhode Island has an income tax and an estate tax with comparable rates. (Forbes magazine listed it as one of the states “Where Not To Die.” Connecticut made the list, too.

In [7]:
df['label'].value_counts()

non-propaganda    31965
propaganda         4021
Name: label, dtype: int64

## Process data

### Recode the label

In [8]:
df['target'] = df['label'].map({'propaganda': 1, 'non-propaganda': 0})

### Clean the text

In [9]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [10]:
cvec = CountVectorizer()
tokenizer = cvec.build_tokenizer()

In [11]:
def clean_text(text):
    
    # Remove special chars and punctuation
    text = " ".join(tokenizer(text))
    
    # lowcase
    text = text.lower()
    
    # Lematize
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    
    # Lematize
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    
    # Remove stopwords
    text = [word for word in text if not word in stop_words]
    
    text = " ".join(text)
    
    return text

In [12]:
df['article_prep'] = df['article'].apply(clean_text)

In [13]:
df

Unnamed: 0,article,id,label,target,article_prep
0,"Et tu, Rhody? A recent editorial in the Provi...",727600136,non-propaganda,0,et tu rhody recent editorial providence journa...
1,A recent post in The Farmington Mirror — our t...,731714618,non-propaganda,0,recent post farmington mirror town version oni...
2,"President Donald Trump, as he often does while...",731714635,non-propaganda,0,president donald trump often doe respond natur...
3,"February is Black History Month, and nothing l...",728627182,non-propaganda,0,february black history month nothing loom larg...
4,"The snow was so heavy, whipped up by gusting w...",728627443,non-propaganda,0,snow wa heavy whip gusting wind travel wa near...
5,Four months after the Sandy Hook School shooti...,732126660,non-propaganda,0,four month sandy hook school shoot connecticut...
6,The first major newspaper article about Donald...,728144791,non-propaganda,0,first major newspaper article donald trump 197...
7,"For three years, starting in 2008, New York ar...",728605281,non-propaganda,0,three year start 2008 new york art dealer robe...
8,President Donald Trump's tumultuous administra...,731383701,non-propaganda,0,president donald trump tumultuous administrati...
9,With Hartford on edge about the future of Aetn...,734075146,non-propaganda,0,hartford edge future aetna cv health corp chie...


## Make the splits

In [14]:
# The whole sample is split on 3 parts - dev, val, test
df_dev, df_val = train_test_split(df, test_size = 0.25, random_state = 42, stratify=df['target'])

In [15]:
df_dev['sample'] = 'dev'
df_val['sample'] = 'val'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
# Sample sizes
print(df_dev.shape)
print(df_val.shape)

(26989, 6)
(8997, 6)


In [17]:
df.head()

Unnamed: 0,article,id,label,target,article_prep
0,"Et tu, Rhody? A recent editorial in the Provi...",727600136,non-propaganda,0,et tu rhody recent editorial providence journa...
1,A recent post in The Farmington Mirror — our t...,731714618,non-propaganda,0,recent post farmington mirror town version oni...
2,"President Donald Trump, as he often does while...",731714635,non-propaganda,0,president donald trump often doe respond natur...
3,"February is Black History Month, and nothing l...",728627182,non-propaganda,0,february black history month nothing loom larg...
4,"The snow was so heavy, whipped up by gusting w...",728627443,non-propaganda,0,snow wa heavy whip gusting wind travel wa near...


In [18]:
# Check the lengths
df_dev['article_prep'].apply(lambda x: len(x.split(" "))).mean()

347.5890177479714

In [19]:
df_dev['article_prep'].apply(lambda x: len(x.split(" "))).describe()

count    26989.000000
mean       347.589018
std        291.868117
min          4.000000
25%        171.000000
50%        281.000000
75%        449.000000
max      12122.000000
Name: article_prep, dtype: float64

## Prepare for modelling

In [20]:
features = 'article_prep'

In [21]:
# Dev

# Prepare the X
df_dev_x = df_dev[features]

# Prepare the y
df_dev_y = df_dev['target'].ravel()

In [22]:
# Val

# Prepare the X
df_val_x = df_val[features]

# Prepare the y
df_val_y = df_val['target'].ravel()

### Tokenization

In [23]:
max_features = 6000
tokenizer = Tokenizer(num_words = max_features)

In [24]:
tokenizer.fit_on_texts(df_dev_x)

In [25]:
df_dev_x_tokens = tokenizer.texts_to_sequences(df_dev_x)
df_val_x_tokens = tokenizer.texts_to_sequences(df_val_x)

### Padding

In [26]:
max_sequence_length = 348

In [27]:
df_dev_x_pad = pad_sequences(df_dev_x_tokens, maxlen=max_sequence_length)
df_val_x_pad = pad_sequences(df_val_x_tokens, maxlen=max_sequence_length)

## Modelling

In [12]:
embed_size = 200

In [13]:
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(1, activation="sigmoid"))

In [14]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         1200000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 64)          59648     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 21        
Total params: 1,260,969
Trainable params: 1,260,969
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(df_dev_x_pad, df_dev_y, batch_size=128, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x242ffc11940>

### Check the performance

In [16]:
# dev
df_dev_y_pred = model.predict(df_dev_x_pad, verbose=1).round()



In [17]:
print('F1-score: {0}'.format(f1_score(df_dev_y_pred, df_dev_y)))
confusion_matrix(df_dev_y_pred, df_dev_y)

F1-score: 0.933853459972863


array([[23846,   263],
       [  127,  2753]], dtype=int64)

In [18]:
# val
df_val_y_pred = model.predict(df_val_x_pad, verbose=1).round()



In [19]:
print('F1-score: {0}'.format(f1_score(df_val_y_pred, df_val_y)))
confusion_matrix(df_val_y_pred, df_val_y)

F1-score: 0.8184679958027282


array([[7871,  225],
       [ 121,  780]], dtype=int64)