As usual we start loading the packages that we will use in our notebook

In [38]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 
from nltk.corpus import stopwords #provides list of english stopwords
stop = stopwords.words('english')

In [39]:
#LET'S PRINT THE VERSION!!
tf.__version__

'2.0.0'

In [40]:
#PLEASE DOWNLOAD THE FILE HERE: https://www.kaggle.com/aashita/nyt-comments
train_df = pd.read_csv('CommentsApril2017.csv').sample(n=50000)

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
len(train_df)

50000

In [42]:
classes = len(np.unique(train_df['newDesk'])) #we print the length, not a big one but sufficient

A key step is to label encode the target variable from text to number


In [43]:
Y = train_df['newDesk']
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)
Y = tf.keras.utils.to_categorical(
    Y,
    num_classes=classes#equals to the number of languages
    
)

As we mentioned in the slides we will perform the previous text processing steps except for stopword removal.

In [44]:
train_df['sentence_lower'] = train_df["commentBody"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_lower'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")
train_df['sentence_no_punctuation'] = train_df['sentence_no_punctuation'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [45]:
max_features=5000 #we set maximum number of words to 5000
maxlen=100 #we set maximum sequence length to 400

In [46]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step

In [47]:
tok.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text

In [48]:
print(len(tok.word_index))
vocab_size = len(tok.word_index)+1
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

110867


In [49]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step

In [50]:

from sklearn.model_selection import train_test_split #divide into train and test set

In [51]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [52]:
embedding_dim = 50 #this is the final dimension of the embedding space.


Let's write down the model

In [53]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=embedding_dim,#embedding output
                           input_length=maxlen), #maximum length of an input sequence
  tf.keras.layers.Flatten(), #flatten layer

  tf.keras.layers.Dense(classes, activation=tf.nn.softmax) #ouput layer a Dense layer with 4 probabilities
  #we also define our final activation function which is the softmax function typical for multiclass
  #classifiction problems

])

In [54]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy', #we recommend this loss function you
              metrics=['accuracy'])



In [55]:
model.summary() #here we show the architecture 

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           5543400   
_________________________________________________________________
flatten_2 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 28)                140028    
Total params: 5,683,428
Trainable params: 5,683,428
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.fit(np.array(X_train), np.array(y_train), epochs=3) #let's fit the model we are not very interested in fitting 
#well the model but rather use the target variable to understand better the corpus

Train on 45000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1f577946278>

In [57]:
data = np.array(model.layers[0].get_weights())

In [58]:
data.shape

(1, 110868, 50)

In [59]:
data = data.reshape(data.shape[0]*data.shape[1], data.shape[2])

In [60]:
data = pd.DataFrame(data)

In [61]:
len(data)
data[1:].to_csv("data_pietro.csv", sep='\t', index=False, header=False)

In [62]:
meta_data = pd.DataFrame.from_dict(list(tok.word_index))
print(len(meta_data))
print(len(tok.word_index))

110867
110867


In [63]:
meta_data.columns = ['word']
#meta_data['word'] = meta_data['word'].astype(str)

In [64]:
len(meta_data)

110867

In [65]:
meta_data['word'].to_csv("meta_data_pietro.csv", index=False, header=False)

In [66]:
meta_data = pd.read_csv("meta_data_pietro.csv")
print(len(meta_data))

110866
