In [1]:
%matplotlib inline

# Import libraries
import pandas as pd
import numpy as np
import os
import re
import glob
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout

from sklearn.metrics import f1_score, confusion_matrix

import glob

Using TensorFlow backend.


In [2]:
# Set Path
path = os.path.abspath('..')

## Read data

In [3]:
def dataprep_task2(path):
    """Dataprep for Task2 It will return the new data
    :param path: Path to the article's taks3 labels file.
    Example:
    >>> dataprep_task2("datasets-v5/tasks-2-3/train/article111111112.task2.labels")
    """
    dir_name = os.path.dirname(path)
    article_id = os.path.basename(path).split('.')[0]
    article_name = os.path.join(dir_name, f'{article_id}.txt')

    with open(article_name, 'r', encoding='utf8') as f:
        records = f.readlines()

    df = pd.DataFrame(records, columns=['sentences'])

    another_df = pd.read_csv(path, sep='\t', names = ['article', 'N_sentence', 'is_propaganda'], encoding='utf8')
    
    result_df = pd.concat([df, another_df], axis=1)
    
    return result_df.loc[result_df['sentences'] != '\n', :]

In [4]:
fileNames = glob.glob(os.path.join(path, 'data', 'raw', 'tasks-2-3', 'train') + "/*.task2.labels")

In [5]:
res_list = list()

for f in fileNames:
    res_list.append(dataprep_task2(f))

In [6]:
df = pd.concat(res_list)

In [7]:
df.iloc[2]['sentences']

'Pamela Geller and Robert Spencer co-founded anti-Muslim group Stop Islamization of America.\n'

In [8]:
df['sentences'] = df['sentences'].str.replace('\n', '')

In [9]:
df.shape

(14263, 4)

In [10]:
df

Unnamed: 0,sentences,article,N_sentence,is_propaganda
0,US bloggers banned from entering UK,111111112,1,non-propaganda
2,Two prominent US bloggers have been banned fro...,111111112,3,non-propaganda
4,Pamela Geller and Robert Spencer co-founded an...,111111112,5,propaganda
6,They were due to speak at an English Defence L...,111111112,7,non-propaganda
8,A government spokesman said individuals whose ...,111111112,9,non-propaganda
10,"He added: ""We condemn all those whose behaviou...",111111112,11,propaganda
12,'Right decision',111111112,13,non-propaganda
13,"Ms Geller, of the Atlas Shrugs blog, and Mr Sp...",111111112,14,propaganda
14,On both of their blogs the pair called their b...,111111112,15,propaganda
16,They were due to attend a march planned by the...,111111112,17,non-propaganda


## Process data

### Recode the label

In [11]:
df['target'] = df['is_propaganda'].map({'propaganda': 1, 'non-propaganda': 0})

In [12]:
df

Unnamed: 0,sentences,article,N_sentence,is_propaganda,target
0,US bloggers banned from entering UK,111111112,1,non-propaganda,0
2,Two prominent US bloggers have been banned fro...,111111112,3,non-propaganda,0
4,Pamela Geller and Robert Spencer co-founded an...,111111112,5,propaganda,1
6,They were due to speak at an English Defence L...,111111112,7,non-propaganda,0
8,A government spokesman said individuals whose ...,111111112,9,non-propaganda,0
10,"He added: ""We condemn all those whose behaviou...",111111112,11,propaganda,1
12,'Right decision',111111112,13,non-propaganda,0
13,"Ms Geller, of the Atlas Shrugs blog, and Mr Sp...",111111112,14,propaganda,1
14,On both of their blogs the pair called their b...,111111112,15,propaganda,1
16,They were due to attend a march planned by the...,111111112,17,non-propaganda,0


### Clean the text

In [73]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [156]:
cvec = CountVectorizer()
tokenizer = cvec.build_tokenizer()

In [107]:
def clean_text(text):
    
    # Remove special chars and punctuation
    text = " ".join(tokenizer(text))
    
    # lowcase
    text = text.lower()
    
    # Lematize
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    
    # Lematize
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    
    # Remove stopwords
    text = [word for word in text if not word in stop_words]
    
    text = " ".join(text)
    
    return text

In [16]:
df['sentences_prep'] = df['sentences'].apply(clean_text)

In [18]:
df['len'] = df['sentences_prep'].apply(lambda x: len(x.split()))

In [19]:
df = df[df['len']>3]

## Make the splits

In [22]:
# The whole sample is split on 3 parts - dev, val, test
art_id_dev, art_id_val = train_test_split(df['article'].unique(), test_size = 0.25, random_state = 42)

In [23]:
print(art_id_dev.size)
print(art_id_val.size)

219
74


In [24]:
df_dev = df[df['article'].isin(art_id_dev)]
df_val = df[df['article'].isin(art_id_val)]

In [25]:
df_dev['sample'] = 'dev'
df_val['sample'] = 'val'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
# Sample sizes
print(df_dev.shape)
print(df_val.shape)

(9342, 8)
(2804, 8)


In [27]:
# Check the length of the longest text
df_dev['sentences_prep'].apply(lambda x: len(x.split(" "))).mean()

13.400449582530507

In [28]:
df_dev['sentences_prep'].apply(lambda x: len(x.split(" "))).describe()

count    9342.000000
mean       13.400450
std         8.032138
min         4.000000
25%         7.000000
50%        12.000000
75%        17.000000
max        74.000000
Name: sentences_prep, dtype: float64

## Prepare for modelling

In [30]:
features = 'sentences_prep'

In [31]:
# Dev

# Prepare the X
df_dev_x = df_dev[features]

# Prepare the y
df_dev_y = df_dev['target'].ravel()

In [32]:
# Val

# Prepare the X
df_val_x = df_val[features]

# Prepare the y
df_val_y = df_val['target'].ravel()

### Tokenization

In [161]:
max_features = 6000
tokenizer = Tokenizer(num_words = max_features)

In [162]:
tokenizer.fit_on_texts(df_dev_x)

In [35]:
df_dev_x_tokens = tokenizer.texts_to_sequences(df_dev_x)
df_val_x_tokens = tokenizer.texts_to_sequences(df_val_x)

### Padding

In [36]:
max_sequence_length = 14

In [37]:
df_dev_x_pad = pad_sequences(df_dev_x_tokens, maxlen=max_sequence_length)
df_val_x_pad = pad_sequences(df_val_x_tokens, maxlen=max_sequence_length)

## Modelling

In [38]:
embed_size = 100

In [39]:
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(1, activation="sigmoid"))

In [40]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         600000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 64)          34048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 21        
Total params: 635,369
Trainable params: 635,369
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.fit(df_dev_x_pad, df_dev_y, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15bc6198>

### Check the performance

In [42]:
# dev
df_dev_y_pred = model.predict(df_dev_x_pad, verbose=1).round()



In [43]:
print('F1-score: {0}'.format(f1_score(df_dev_y_pred, df_dev_y)))
confusion_matrix(df_dev_y_pred, df_dev_y)

F1-score: 0.9837685250529287


array([[6462,   25],
       [  67, 2788]], dtype=int64)

In [44]:
# val
df_val_y_pred = model.predict(df_val_x_pad, verbose=1).round()



In [45]:
print('F1-score: {0}'.format(f1_score(df_val_y_pred, df_val_y)))
confusion_matrix(df_val_y_pred, df_val_y)

F1-score: 0.413968253968254


array([[1555,  415],
       [ 508,  326]], dtype=int64)