# <p style="background-color:#F8C1EE; font-family:newtimeroman; font-size:250%; text-align:center; border-radius: 15px 50px;">Coleridge Initiative EDA 🔍 and LSTM Model 📈</p>

# <p style="background-color:#F8C1EE; font-family:newtimeroman; font-size:100%; text-align:center; border-radius: 15px 50px;">Please <u>upvote</u> if you find this notebook useful or interesting, I really appreciate the encouragement. Thanks!</p>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import json
from tqdm import tqdm
tqdm.pandas()

from nltk.corpus import stopwords
from unidecode import unidecode

import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud

STOPWORDS = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input

import re

import seaborn as sns

In [None]:
# Load the training label table and the sample submission, and record the
# directories that hold the train and test files.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

# Tabular Exploration

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
train_df.info()

Here we can see that we have no null values to remove as part of the preprocessing.

In [None]:
# Report how many distinct values each column of the training table holds.
for col in train_df.columns:
    print(f'We have {len(set(train_df[col].values))} unique values in {col}')

The above analysis shows us that:
* We have some duplicate ID values
* We have some duplicate pub_title values
* We can also see that we have 130 unique dataset labels

In [None]:
# This method is taken from https://www.kaggle.com/prashansdixit/coleridge-initiative-eda-baseline-model#2.-Data-Exploration%F0%9F%94%8D
# So please give them an upvote if you find this part useful!

def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read one publication's JSON file and return its text as a single string.

    Args:
        filename: Name of the file without the ``.json`` extension.
        train_files_path: Directory that contains the JSON file.
        output: ``'text'`` returns the section bodies joined by spaces,
            ``'head'`` returns the section titles joined by spaces, and any
            other value returns titles and bodies interleaved (title first)
            and joined by ``'. '``.

    Returns:
        The requested string for the publication.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A section may lack either key (or hold null); substitute '' so that
    # str.join never receives None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)
    
# Read each row's JSON file (named by its Id) and store the joined section
# bodies (the default output='text') in a new 'text' column.
train_df['text'] = train_df['Id'].apply(read_append_return)

### Target Distribution

We will look at the distribution of the target field's values.

In [None]:
# Show the per-label row counts as a table, then as a histogram.
display(train_df['cleaned_label'].value_counts())
plt.figure(figsize=(40, 30))
sns.set(font_scale = 2)
sns.histplot(train_df['cleaned_label'])
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation = 90)
plt.show();

We can see from the above that there is a significant imbalance in the classes, which we should address before running any models.

# Text Preprocessing and Exploration

We will now preprocess the text and explore the results.

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` (coerced to str) and replace each run of non-alphanumeric characters with one space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def preprocess_text(text):
    """
    Normalise a string for modelling.

    Lowercases the text, replaces bracket/pipe/@/comma/semicolon characters
    with spaces, pads '/' with spaces, transliterates to ASCII with
    ``unidecode`` (lowercasing again afterwards) and drops every
    whitespace-separated token found in ``STOPWORDS``.
    """
    separators = re.compile(r'[(){}\[\]\|@,;]')

    lowered = text.lower()
    spaced = separators.sub(' ', lowered).replace('/', ' / ')
    ascii_text = unidecode(spaced).lower()

    kept_tokens = [token for token in ascii_text.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [None]:
# Normalise the publication titles with clean_text (progress bar via tqdm).
train_df['clean_pub_title'] = train_df['pub_title'].progress_apply(clean_text)
# Note: preprocess_text is not applied here because it was slowing the
# notebook down; the body text is only passed through clean_text.
train_df['clean_text'] = train_df['text'].progress_apply(clean_text)

In [None]:
# train_df.to_csv('clean_train.csv', index=False)

In [None]:
# Get word counts and unique word counts.
# str.split() with no argument splits on whitespace runs and drops empty
# strings, so leading/trailing spaces in the cleaned text are not counted as
# words (split(' ') would count each of them as an extra '' token).
train_df['length'] = train_df['clean_text'].apply(lambda x: len(x.split()))
train_df['unique_word_count'] = train_df['clean_text'].apply(lambda x: len(set(x.split())))

In [None]:
# Distribution plots.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement and is already used elsewhere in this notebook.
display(train_df['length'].describe())
plt.figure(figsize=(40, 30))
ax = sns.histplot(train_df['length'].values, kde=True, stat='density')
plt.show()

display(train_df['unique_word_count'].describe())
plt.figure(figsize=(40, 30))
ax = sns.histplot(train_df['unique_word_count'].values, kde=True, stat='density')
plt.show()

The above cells show, as we'd expect, that there is a large imbalance in the distribution of the words, where some words appear in almost every single paper. This suggests that some use of TF-IDF might help with the classification.

## Word Clouds

In [None]:
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` as an image on a new 40x30 figure with the axes hidden."""
    plt.figure(figsize=(40, 30))
    plt.imshow(wordcloud)
    plt.axis("off")

### "Pub Title" Field Word Cloud

In [None]:
# Generate a word cloud from all cleaned publication titles joined into one
# string; STOPWORDS are excluded and collocations (word pairs) are disabled.
wordcloud = WordCloud(width = 3000, 
                      height = 2000, 
                      random_state=1, 
                      # background_color='salmon', 
                      colormap='Pastel1', 
                      collocations=False, 
                      stopwords = STOPWORDS).generate(' '.join(train_df['clean_pub_title'].values))
# Plot
plot_cloud(wordcloud)


### "Text" Field Word Cloud

In [None]:
# Generate a word cloud from a sample of the cleaned body text.
# The sample size is capped at the number of rows available (sampling more
# rows than exist without replacement raises ValueError) and the draw is
# seeded so the same cloud is produced on every run.
wordcloud = WordCloud(width = 3000,
                      height = 2000,
                      random_state=1,
                      # background_color='salmon',
                      colormap='Pastel1',
                      collocations=False,
                      stopwords = STOPWORDS).generate(
    ' '.join(train_df['clean_text'].sample(n=min(4000, len(train_df)), random_state=1).values))
# Plot
plot_cloud(wordcloud)

The above word clouds show us that there is still some cleaning required for the two fields but, in general, there are some words appearing in these clouds that could be good indicators.

# LSTM Model

Normally, I wouldn't dive straight into an LSTM model, but I have experience building these and I know they can perform well for these types of tasks.

In [None]:
def tokenize(text, MAX_NB_WORDS=1000, MAX_SEQUENCE_LENGTH=25, tokenizer=None):
    """
    Convert a pandas Series of strings into padded integer sequences.

    Inputs:
        text: Series whose ``.values`` are the documents to encode
        MAX_NB_WORDS: ``num_words`` given to a newly created Tokenizer
        MAX_SEQUENCE_LENGTH: ``maxlen`` passed to ``pad_sequences``
        tokenizer: An already fitted Tokenizer to reuse; when None a new one
            is created and fitted on ``text``
    Outputs:
        X: The padded sequences
        tokenizer: The Tokenizer that produced them
    """
    documents = text.values

    if tokenizer is None:
        tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-.:;<=>?@[\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(documents)

    sequences = tokenizer.texts_to_sequences(documents)
    padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return padded, tokenizer


def encode_labels(data, target_columns):
    """
        One-hot encode each target column with ``pd.get_dummies``.

        Inputs:
            data: DataFrame that contains the target columns
            target_columns: List of column names the model should predict
        Outputs:
            y: Dict mapping each target column to its one-hot array
            y_dummy_columns: Dict mapping each target column to the label
                names of the one-hot columns, in column order
    """
    encoded = {col: pd.get_dummies(data[col]) for col in target_columns}

    y = {col: frame.values for col, frame in encoded.items()}
    y_dummy_columns = {col: frame.columns for col, frame in encoded.items()}

    return y, y_dummy_columns


def prepare_data(df, feature_column, target_columns, MAX_SEQUENCE_LENGTH=25, MAX_NB_WORDS=1000):
    """
        Inputs:
            df: DataFrame holding the feature and target columns
            feature_column: Name of the text column to tokenize
            target_columns: List of the columns the model should predict
            MAX_SEQUENCE_LENGTH: Length every token sequence is padded to
            MAX_NB_WORDS: Vocabulary limit given to the tokenizer; defaults
                to 1000, the value that used to be hard-coded here
        Outputs:
            X: Padded token sequences for feature_column
            Y: Dict of one-hot encoded targets keyed by target column
            Y_dummy_columns: Dict of the label names behind each one-hot array
            tokenizer: The fitted tokenizer
    """
    X, tokenizer = tokenize(df[feature_column], MAX_NB_WORDS=MAX_NB_WORDS, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)

    Y, Y_dummy_columns = encode_labels(df, target_columns)

    return X, Y, Y_dummy_columns, tokenizer


### Train

In [None]:
def train_test_split(X, Y, test_size=0.1):
    """
    Randomly split the inputs and every target array into train and test parts.

    The global NumPy seed is set to 123 before drawing, so the same number of
    rows always yields the same boolean mask. Rows where the mask is True go
    to the train part, the rest to the test part.

    Returns X_train, X_test, y_train, y_test and the mask itself.
    """
    np.random.seed(123)
    keep_fraction = 1 - test_size
    mask = np.random.rand(X.shape[0]) < keep_fraction
    held_out = ~mask

    X_train, X_test = X[mask], X[held_out]

    y_train = {name: labels[mask] for name, labels in Y.items()}
    y_test = {name: labels[held_out] for name, labels in Y.items()}

    return X_train, X_test, y_train, y_test, mask

In [None]:
MAX_NB_WORDS = 3000  # The maximum number of words to be used. (most frequent)
MAX_SEQUENCE_LENGTH = 25    # Max number of words in each piece of text
EMBEDDING_DIM = 300

# Randomly order the samples. The shuffle is seeded so that, together with
# the seeded mask in train_test_split, the same rows land in the same split
# on every run.
train_df = train_df.sample(frac=1, random_state=123).reset_index(drop=True)

target_columns = ['cleaned_label']

# Get our inputs and outputs from the data. tokenize and encode_labels are
# called directly so the tokenizer vocabulary is capped at MAX_NB_WORDS, the
# same size the Embedding layer below is built with; going through
# prepare_data as before would cap the tokenizer at 1000 words instead.
X, tokenizer = tokenize(train_df['clean_text'], MAX_NB_WORDS=MAX_NB_WORDS, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
Y, Y_dummy_columns = encode_labels(train_df, target_columns)

# Split the inputs and outputs into our train and test sets
X_train, X_test, y_train, y_test, _ = train_test_split(X, Y, test_size=0.2)

input_length = X_train.shape[1]
output_length = y_train[next(iter(y_train))].shape[1]
output_name = list(y_train.keys())[0]

# Embedding -> dropout -> LSTM -> softmax over the one-hot label columns.
inp = Input(shape=(input_length,))
x = Embedding(MAX_NB_WORDS, EMBEDDING_DIM)(inp)
x = SpatialDropout1D(0.2)(x)
x = LSTM(100, dropout=0.2, recurrent_dropout=0.2)(x)
output = Dense(output_length, activation='softmax', name=output_name)(x)

model = Model(inp, output)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once the validation loss has not improved by at least 0.0001 for 3 epochs.
callbacks = [EarlyStopping(
    monitor='val_loss', patience=3, min_delta=0.0001)]

model.fit(X_train, y_train[output_name], epochs=100,
          batch_size=500,
          validation_split=0.1, callbacks=callbacks)

### Evaluate

In [None]:
# Evaluate it using the metric that they use in this dataset
def jaccard(str1, str2):
    """
    Return the Jaccard similarity of the lowercase word sets of two strings.

    Each string is lowercased and split on whitespace; the score is the size
    of the intersection of the two word sets divided by the size of their
    union. When neither string contains a word the union is empty; the two
    (empty) sets are identical, so 1.0 is returned instead of dividing by
    zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)

In [None]:
preds = model.predict(X_test)

# Build the label list once instead of on every loop iteration, and take the
# arg-max of every row in a single vectorised call.
label_names = Y_dummy_columns['cleaned_label'].tolist()
pred_indices = np.argmax(preds, axis=1)
true_indices = np.argmax(y_test['cleaned_label'], axis=1)

# Score each predicted label string against the true label string.
scores = [
    jaccard(label_names[pred_idx], label_names[true_idx])
    for pred_idx, true_idx in zip(pred_indices, true_indices)
]

print(f'Score: {np.mean(scores)}')

The above score is obviously low, but I believe that with some simple tuning, and by bringing in the additional features, we should be able to increase this quite easily.

# Closing Remarks

This is a work in progress and I will be making regular updates to the models to improve their performance.

My next steps will be:
* Extract words unique to the papers and feed this into the model
* Train the LSTM on the text field.
* Hyperparameter tuning of the LSTM model