In [None]:
!pip install gradio > /dev/null

In [None]:
! python -m spacy download en 
! pip install wordcloud > /dev/null
! wget https://gitlab.com/andras.simonyi/10_days_AI_training_data/raw/master/sentiment.tsv?inline=false -O sentiment.tsv

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 6.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
--2021-08-11 15:28:12--  https://gitlab.com/andras.simonyi/10_days_AI_training_data/raw/master/sentiment.tsv?inline=false
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘sentiment.tsv’

sentiment.tsv           [ <=>                ] 437.05K  --.-KB/s    in 0.04s   


# Sentiment classification

The task is to classify one-sentence long movie reviews/opinions according to the sentiment they express. There are only two categories: positive and negative sentiment.


> "Data source: [UMICH SI650 - Sentiment Classification](https://www.kaggle.com/c/si650winter11/data)

> Training data: 7086 lines. 
  
> Format: 1|0 (tab) sentence

> Test data: 33052 lines, each contains one sentence. 

> The data was originally collected from opinmind.com (which is no longer active)."

The data is in the file "sentiment.tsv".

# Loading the data

In [None]:
import pandas as pd

# The file has no header row; every line is "<label><TAB><sentence>".
# Columns are named at read time instead of being renamed in place afterwards.
df = pd.read_csv(
    'sentiment.tsv',
    sep='\t',
    quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary text
    header=None,
    names=["sentiment", "text"],
)

# Put the text first so the frame reads as (input, label).
df = df[["text", "sentiment"]]

df.head()

Unnamed: 0,text,sentiment
0,The Da Vinci Code book is just awesome.,1
1,this was the first clive cussler i've ever rea...,1
2,i liked the Da Vinci Code a lot.,1
3,i liked the Da Vinci Code a lot.,1
4,I liked the Da Vinci Code but it ultimatly did...,1


# Splitting into train, validation and test

Before doing anything else (!), we divide our data into train, validation and test parts.

In [None]:
from sklearn.model_selection import train_test_split

# One seed for both splits, so the complete train/valid/test partition is
# the same on every run (the second split used to be unseeded).
SPLIT_SEED = 13

# 80% of the rows for training, 20% held out.
df_train, df_test_valid = train_test_split(df, test_size=0.2, shuffle=True,
                                           random_state=SPLIT_SEED)

# The held-out rows are halved into a test and a validation set.
df_test, df_valid = train_test_split(df_test_valid, test_size=0.5,
                                     random_state=SPLIT_SEED)

print(len(df_train), len(df_valid), len(df_test))

5668 709 709


# Inspecting the data

In [None]:
df_train.describe()

Unnamed: 0,sentiment
count,5668.0
mean,0.559104
std,0.496538
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


We can examine the lengths of sentences as well.

In [None]:
# Character count of every training sentence.
n_chars = df_train["text"].map(len)

# Summary statistics of the sentence lengths.
n_chars.describe()

count    5668.000000
mean       60.100565
std        37.931478
min        18.000000
25%        32.000000
50%        48.000000
75%        77.000000
max       203.000000
Name: text, dtype: float64

The first sentence with the maximal length:

In [None]:
long_sentence = df_train.loc[n_chars.idxmax(), "text"]
long_sentence

'A mother in Georgia wants her local school board to take Harry Potter out of the schools and libraries because, in her opinion, reading Harry Potter leads to witchcraft, which according to her is evil...'

# Bag of words (BoW) representation of the texts

We will represent each text as a (sparse) vector of lemma (word root) counts for frequent lemmas in the training data. 

For tokenization and lemmatization we use [spaCy](https://spacy.io/), an open source Python NLP library, which can produce a list of unique lemma ids from the text.

In [None]:
import spacy

# Load the pipeline by its package name: the download step reports that
# 'en_core_web_sm' is loadable, and the "en" shortcut link does not exist in
# spaCy 3. Parser and NER are disabled because only tokenization and
# lemmatization are used below.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

From a text, spaCy produces a `Doc` object that contains the linguistic analysis of the text, including the lemma of every token and the lemma's unique spaCy string id.

In [None]:
# Run the spaCy pipeline on the longest training sentence found above.
doc = nlp(long_sentence)
type(doc)  # not the last expression of the cell, so its value is not displayed

print([token.lemma_ for token in doc ]) # Lemmas

print([token.lemma for token in doc]) # Corresponding unique ids

['a', 'mother', 'in', 'Georgia', 'want', '-PRON-', 'local', 'school', 'board', 'to', 'take', 'Harry', 'Potter', 'out', 'of', 'the', 'school', 'and', 'library', 'because', ',', 'in', '-PRON-', 'opinion', ',', 'read', 'Harry', 'Potter', 'lead', 'to', 'witchcraft', ',', 'which', 'accord', 'to', '-PRON-', 'be', 'evil', '...']
[11901859001352538922, 7963322251145911254, 3002984154512732771, 309210702643012516, 7597692042947428029, 561228191312463089, 16319852998319793599, 13293160603192985325, 14899812206273857344, 3791531372978436496, 6789454535283781228, 5164779919001708464, 2416965663249996073, 1696981056005371314, 886050111519832510, 7425985699627899538, 13293160603192985325, 2283656566040971221, 1785747669126016609, 16950148841647037698, 2593208677638477497, 3002984154512732771, 561228191312463089, 14536103007527724270, 2593208677638477497, 11792590063656742891, 5164779919001708464, 2416965663249996073, 82546335403996757, 3791531372978436496, 17905374590688478165, 2593208677638477497, 

Now we have to convert these lists into BoW vectors. We could "roll our own", but, fortunately, scikit-learn has a feature extractor doing exactly that: the [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer). For the sake of simplicity, we will use it together with spaCy.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def _lemma_ids(text):
    """Return the spaCy lemma ids of the tokens in `text`."""
    return [token.lemma for token in nlp(text)]


cv = CountVectorizer(
    analyzer=_lemma_ids,  # use spaCy for analysis
    min_df=0.001,         # Ignore lemmas with lower document frequency
)
cv

CountVectorizer(analyzer=<function <lambda> at 0x7f78386365f0>, binary=False,
                decode_error='strict', dtype=<class 'numpy.int64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=1.0,
                max_features=None, min_df=0.001, ngram_range=(1, 1),
                preprocessor=None, stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)

In [None]:
sents = ["I hate this movie.", "The movie is the worst I've seen."]
bows = cv.fit_transform(sents).toarray() # CountVectorizer produces a sparse matrix so we convert to ndarray
bows

array([[1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 2, 0, 1, 1, 1, 1, 1, 1]])

Using the CountVectorizer, we convert the text columns of our train, validation and test data into three sparse matrices.

In [None]:
# Fit the vectorizer on the training texts only and build their BoW matrix.
# This refits `cv`, replacing whatever it was fitted on before.
bows_train = cv.fit_transform(df_train.text)
bows_train.sort_indices() # comes from TF2.0 sparse implementation, obscure requirement
# Number of columns of the BoW matrix; used later as the model's input size.
bow_length = bows_train.shape[1]
print("BoW length:", bow_length)
bows_train

BoW length: 374


<5668x374 sparse matrix of type '<class 'numpy.int64'>'
	with 63241 stored elements in Compressed Sparse Row format>

In [None]:
# Show the stored (row, column) count entries of the first training vector.
print(bows_train[0,:])

# Validation and test texts are only transformed (not fitted), so they use
# the vectorizer as it was fitted on the training texts.
bows_valid = cv.transform(df_valid.text)
bows_valid.sort_indices() # comes from TF2.0 sparse implementation, obscure requirement
bows_test = cv.transform(df_test.text)
bows_test.sort_indices() # comes from TF2.0 sparse implementation, obscure requirement

  (0, 14)	1
  (0, 49)	1
  (0, 54)	2
  (0, 100)	1
  (0, 186)	1
  (0, 257)	1
  (0, 365)	1


# The model

We build a feed-forward neural network for our binary classification task, which will be trained with cross-entropy loss and minibatch SGD.

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

# Parameters
############

hidden_size = 100

# Model
#######

# One input unit per column of the BoW matrix.
inputs = Input(shape=(bow_length,))

# Hidden layer

hidden_output = Dense(hidden_size, activation='relu')(inputs)

# Softmax over the two classes (output 0: negative, output 1: positive)

predictions = Dense(2, activation='softmax')(hidden_output)


# Full model

model = Model(inputs=inputs, outputs=predictions)

# Optimizer
####################

# `learning_rate` replaces the deprecated `lr` argument.
optimizer = SGD(learning_rate=0.1)


# Compilation and fitting
#########################

model.compile(optimizer=optimizer,
              # The "sparse" cross-entropy variant is used because the labels
              # are integer class ids (0/1), not one-hot vectors.
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Training

model.fit(x=bows_train,
          y=df_train.sentiment.values,
          validation_data=(bows_valid, df_valid.sentiment.values),
          epochs=10,
          batch_size=200)

  "The `lr` argument is deprecated, use `learning_rate` instead.")
  "shape. This may consume a large amount of memory." % value)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f77f72c7190>

In [None]:
# Implement a function, that 
# assumes model and CountVectorizer to be available in the global namespace
# receives a string input
# processes it with CountVectorizer
# takes the processed BOW representation
# feeds it to the model for prediction
# formats the prediction as a dict with the keys "Negative" and "Positive"
# 0th prediction corresponds to Negative probability, 1st to Positive
# cast the probabilities to simple Python floats, Numpy floats do NOT work.
# return this dict
def predict_sentiment(input_string):
    """Classify one sentence with the trained sentiment model.

    Reads the notebook-level globals `cv` (the fitted CountVectorizer) and
    `model` (the trained classifier).

    Args:
        input_string: The text to classify.

    Returns:
        A dict with the keys "Negative" and "Positive", mapped to the model's
        outputs for class 0 and class 1 respectively, as built-in floats.
    """
    labels = ["Negative", "Positive"]
    # A one-element list yields a one-row BoW matrix; it is densified so the
    # model receives an ordinary array instead of a SciPy sparse matrix.
    bow = cv.transform([input_string]).toarray()
    probabilities = model.predict(bow)[0]
    # NumPy scalars are cast to built-in floats, as the task description requires.
    return {label: float(prob) for label, prob in zip(labels, probabilities)}

In [None]:
# Test the function!
predict_sentiment("This film was pretty amazing.")

{'Negative': 0.4342990517616272, 'Positive': 0.5657010078430176}

# Task1

In [None]:
# Import Gradio, and build an interface
# Input is a textbox, outputs are "label"-s, and interpretation is set to default.

import #import gradio
iface = #create an interface

# Launch the interface, possibly use debug=True to make your life easier!
iface #try to launch it

Additional tasks:

- Use the model interpretation tool of Gradio to find some counterexamples on which the model does not work well!
- Set the flagging folder and use the flagging capability of Gradio to collect 15 badly behaving examples in a CSV file.

# Task2

In [None]:
from urllib.request import urlretrieve
# Download the model file into the working directory as "mnist-model.h5".
urlretrieve("https://gr-models.s3-us-west-2.amazonaws.com/mnist-model.h5", "mnist-model.h5")
# NOTE(review): this rebinds the global `model`, which predict_sentiment()
# reads at call time, so after this cell the sentiment function will call the
# model loaded here. Re-run the sentiment training cell to go back to Task 1.
model = tf.keras.models.load_model("mnist-model.h5")

In [None]:
def recognize_digit(image):
    image = #reshape
    prediction = #predict the model 
    return {str(i): prediction[i] for i in range(10)}

In [None]:
im = gr.inputs.Image(shape=..., image_mode=..., invert_colors=False, source="canvas")

In [None]:
iface = #create an interface for handwritten digits
iface #try to launch it