In [1]:
%matplotlib inline

# Import libraries
import pandas as pd
import numpy as np
import os
import re
import glob
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout

from sklearn.metrics import f1_score, confusion_matrix

import glob

Using TensorFlow backend.


In [45]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit, StratifiedKFold

In [4]:
# Set Path
path = os.path.abspath('..')

## Read data

In [5]:
def dataprep_task2(path):
    """Dataprep for Task2 It will return the new data
    :param path: Path to the article's taks3 labels file.
    Example:
    >>> dataprep_task2("datasets-v5/tasks-2-3/train/article111111112.task2.labels")
    """
    dir_name = os.path.dirname(path)
    article_id = os.path.basename(path).split('.')[0]
    article_name = os.path.join(dir_name, f'{article_id}.txt')

    with open(article_name, 'r', encoding='utf8') as f:
        records = f.readlines()

    df = pd.DataFrame(records, columns=['sentences'])

    another_df = pd.read_csv(path, sep='\t', names = ['article', 'N_sentence', 'is_propaganda'], encoding='utf8')
    
    result_df = pd.concat([df, another_df], axis=1)
    
    return result_df.loc[result_df['sentences'] != '\n', :]

In [6]:
fileNames = glob.glob(os.path.join(path, 'data', 'raw', 'tasks-2-3', 'train') + "/*.task2.labels")

In [7]:
res_list = list()

for f in fileNames:
    res_list.append(dataprep_task2(f))

In [8]:
df = pd.concat(res_list)

In [9]:
df.iloc[2]['sentences']

'Pamela Geller and Robert Spencer co-founded anti-Muslim group Stop Islamization of America.\n'

In [10]:
df['sentences'] = df['sentences'].str.replace('\n', '')

In [11]:
df.shape

(14263, 4)

In [15]:
df.head()

Unnamed: 0,sentences,article,N_sentence,is_propaganda,target
0,US bloggers banned from entering UK,111111112,1,non-propaganda,0
2,Two prominent US bloggers have been banned fro...,111111112,3,non-propaganda,0
4,Pamela Geller and Robert Spencer co-founded an...,111111112,5,propaganda,1
6,They were due to speak at an English Defence L...,111111112,7,non-propaganda,0
8,A government spokesman said individuals whose ...,111111112,9,non-propaganda,0


## Process data

### Recode the label

In [16]:
df['target'] = df['is_propaganda'].map({'propaganda': 1, 'non-propaganda': 0})

### Clean the text

In [18]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [19]:
cvec = CountVectorizer()
tokenizer = cvec.build_tokenizer()

In [20]:
def clean_text(text):
    
    # Remove special chars and punctuation
    text = " ".join(tokenizer(text))
    
    # lowcase
    text = text.lower()
    
    # Lematize
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    
    # Lematize
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    
    # Remove stopwords
    text = [word for word in text if not word in stop_words]
    
    text = " ".join(text)
    
    return text

In [21]:
df['sentences_prep'] = df['sentences'].apply(clean_text)

In [22]:
df['len'] = df['sentences_prep'].apply(lambda x: len(x.split()))

In [23]:
df = df[df['len']>3]

## Make the splits

In [25]:
# The whole sample is split on 3 parts - dev, val, test
art_id_dev, art_id_val = train_test_split(df['article'].unique(), test_size = 0.25, random_state = 42)

In [26]:
print(art_id_dev.size)
print(art_id_val.size)

219
74


In [27]:
df_dev = df[df['article'].isin(art_id_dev)]
df_val = df[df['article'].isin(art_id_val)]

In [28]:
df_dev['sample'] = 'dev'
df_val['sample'] = 'val'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
# Sample sizes
print(df_dev.shape)
print(df_val.shape)

(9342, 8)
(2804, 8)


In [30]:
# Check the length
df_dev['sentences_prep'].apply(lambda x: len(x.split(" "))).mean()

13.400449582530507

In [31]:
df_dev['sentences_prep'].apply(lambda x: len(x.split(" "))).describe()

count    9342.000000
mean       13.400450
std         8.032138
min         4.000000
25%         7.000000
50%        12.000000
75%        17.000000
max        74.000000
Name: sentences_prep, dtype: float64

## Load word2vec and take the avg vector

In [32]:
# word2vec
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format(os.path.join('path', 'data', 'raw', 'GoogleNews-vectors-negative300.bin'), binary=True)
vector_dim = 300



In [35]:
df_dev['sentences_vector'] = df_dev['sentences_prep'].apply(avg_vector)
df_val['sentences_vector'] = df_val['sentences_prep'].apply(avg_vector)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [36]:
df_dev_vector_mean_list = list()
for el in df_dev['sentences_vector']:
    df_dev_vector_mean_list.append(el.tolist())
    
df_val_vector_mean_list = list()
for el in df_val['sentences_vector']:
    df_val_vector_mean_list.append(el.tolist())

In [37]:
df_dev_vector_mean_df = pd.DataFrame(df_dev_vector_mean_list, index=df_dev.index, columns=["vector_mean_"+str(i) for i in range(300)])
df_val_vector_mean_df = pd.DataFrame(df_val_vector_mean_list, index=df_val.index, columns=["vector_mean_"+str(i) for i in range(300)])

In [38]:
# Merge all together
df_dev = pd.concat([df_dev, df_dev_vector_mean_df], axis=1)
df_val = pd.concat([df_val, df_val_vector_mean_df], axis=1)

## Prepare for modelling

In [39]:
# Dev

# Prepare the X
df_dev_x = df_dev.loc[:, 'vector_mean_0':'vector_mean_299']

# Prepare the y
df_dev_y = df_dev['target'].ravel()

In [40]:
df_val = df_val[df_val['vector_mean_0'].notna()]

In [41]:
# Val

# Prepare the X
df_val_x = df_val.loc[:, 'vector_mean_0':'vector_mean_299']

# Prepare the y
df_val_y = df_val['target'].ravel()

## Modelling

In [46]:
opt_params = {'subsample': 1, 'reg_alpha': 0.05, 'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.03, 'gamma': 10, 'colsample_bytree': 1}


xgb_cl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, max_delta_step=0, min_child_weight=20,
                           missing=None, n_jobs=-1, nthread=-1, objective='binary:logistic', random_state=42, reg_lambda=1,
                           scale_pos_weight=2.3210095982936365, seed=42, silent=True, subsample = 1, reg_alpha = 0.05, n_estimators = 100, 
                           max_depth = 8, learning_rate = 0.03, gamma = 10, colsample_bytree = 1
                          )

In [47]:
xgb_cl.fit(df_dev_x, df_dev_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=10, learning_rate=0.03, max_delta_step=0,
       max_depth=8, min_child_weight=20, missing=None, n_estimators=100,
       n_jobs=-1, nthread=-1, objective='binary:logistic', random_state=42,
       reg_alpha=0.05, reg_lambda=1, scale_pos_weight=2.3210095982936365,
       seed=42, silent=True, subsample=1)

In [48]:
# dev
df_dev_y_pred = xgb_cl.predict(df_dev_x)

In [49]:
print('F1-score: {0}'.format(f1_score(df_dev_y_pred, df_dev_y)))
confusion_matrix(df_dev_y_pred, df_dev_y)

F1-score: 0.8581818181818182


array([[5731,   99],
       [ 798, 2714]], dtype=int64)

In [50]:
# val
df_val_y_pred = xgb_cl.predict(df_val_x)

In [51]:
print('F1-score: {0}'.format(f1_score(df_val_y_pred, df_val_y)))
confusion_matrix(df_val_y_pred, df_val_y)

F1-score: 0.5352622061482821


array([[1588,  297],
       [ 474,  444]], dtype=int64)