In [1]:
%matplotlib inline

# Import libraries
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout

from sklearn.metrics import f1_score, confusion_matrix

Using TensorFlow backend.


In [2]:
# Set Path
path = os.path.abspath('..')

## Read data

In [None]:
df = pd.read_csv(os.path.join(path, 'data', 'raw', 'task1.train.txt'), delimiter='\t', names=['article', 'id', 'label'])

In [None]:
df

In [None]:
df.shape

In [None]:
df['article'][0]

In [None]:
df['label'].value_counts()

## Process data

### Recode the label

In [None]:
df['target'] = df['label'].map({'propaganda': 1, 'non-propaganda': 0})

### Clean the text

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [None]:
cvec = CountVectorizer()
tokenizer = cvec.build_tokenizer()

In [None]:
def clean_text(text):
    
    # Remove special chars and punctuation
    text = " ".join(tokenizer(text))
    
    # lowcase
    text = text.lower()
    
    # Lematize
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    
    # Lematize
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    
    # Remove stopwords
    text = [word for word in text if not word in stop_words]
    
    text = " ".join(text)
    
    return text

In [None]:
df['article_prep'] = df['article'].apply(clean_text)

In [None]:
df

## Make the splits - dev, val, test

In [None]:
# The whole sample is split on 3 parts - dev, val, test
df_dev, df_val = train_test_split(df, test_size = 0.25, random_state = 42, stratify=df['target'])

In [None]:
df_dev['sample'] = 'dev'
df_val['sample'] = 'val'

In [None]:
# Sample sizes
print(df_dev.shape)
print(df_val.shape)

In [None]:
df.head()

In [None]:
# Check the length of the longest text
df_dev['article_prep'].apply(lambda x: len(x.split(" "))).mean()

In [None]:
df_dev['article_prep'].apply(lambda x: len(x.split(" "))).describe()

In [None]:
df_dev.to_pickle(os.path.join(path, 'data', 'processed', 'df_dev_v2.pkl'))
df_val.to_pickle(os.path.join(path, 'data', 'processed', 'df_val_v2.pkl'))

In [3]:
df_dev = pd.read_pickle(os.path.join(path, 'data', 'processed', 'df_dev_v2.pkl'))
df_val = pd.read_pickle(os.path.join(path, 'data', 'processed', 'df_val_v2.pkl'))

## Prepare for modelling

In [4]:
features = 'article_prep'

In [5]:
# Dev

# Prepare the X
df_dev_x = df_dev[features]

# Prepare the y
df_dev_y = df_dev['target'].ravel()

In [6]:
# Val

# Prepare the X
df_val_x = df_val[features]

# Prepare the y
df_val_y = df_val['target'].ravel()

### Tokenization

In [7]:
max_features = 6000
tokenizer = Tokenizer(num_words = max_features)

In [8]:
tokenizer.fit_on_texts(df_dev_x)

In [9]:
df_dev_x_tokens = tokenizer.texts_to_sequences(df_dev_x)
df_val_x_tokens = tokenizer.texts_to_sequences(df_val_x)

### Padding

In [10]:
max_sequence_length = 348

In [11]:
df_dev_x_pad = pad_sequences(df_dev_x_tokens, maxlen=max_sequence_length)
df_val_x_pad = pad_sequences(df_val_x_tokens, maxlen=max_sequence_length)

## Modelling

In [12]:
embed_size = 200

In [13]:
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(1, activation="sigmoid"))

In [14]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         1200000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 64)          59648     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 21        
Total params: 1,260,969
Trainable params: 1,260,969
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(df_dev_x_pad, df_dev_y, batch_size=128, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x242ffc11940>

### Check the performance

In [16]:
# dev
df_dev_y_pred = model.predict(df_dev_x_pad, verbose=1).round()



In [17]:
print('F1-score: {0}'.format(f1_score(df_dev_y_pred, df_dev_y)))
confusion_matrix(df_dev_y_pred, df_dev_y)

F1-score: 0.933853459972863


array([[23846,   263],
       [  127,  2753]], dtype=int64)

In [18]:
# val
df_val_y_pred = model.predict(df_val_x_pad, verbose=1).round()



In [19]:
print('F1-score: {0}'.format(f1_score(df_val_y_pred, df_val_y)))
confusion_matrix(df_val_y_pred, df_val_y)

F1-score: 0.8184679958027282


array([[7871,  225],
       [ 121,  780]], dtype=int64)

In [20]:
model.save('LSTM v2 epo2.h5')