In [1]:
# Install pyprind, the progress-bar package imported by the next cell.
!pip install pyprind



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import os
import pyprind 
import seaborn as sns
from pylab import rcParams
from collections import defaultdict
from scipy.optimize import curve_fit
from collections import Counter
import sys
import warnings
# Silence all warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Reinstall tqdm from the "colab" archive of the chengs/tqdm GitHub repository
# (--force replaces the copy that is already installed), then import the
# notebook progress bar under the short name `tqdm`.
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip
from tqdm import tqdm_notebook as tqdm

Collecting https://github.com/chengs/tqdm/archive/colab.zip
  Downloading https://github.com/chengs/tqdm/archive/colab.zip
[K     | 481kB 4.2MB/s
Building wheels for collected packages: tqdm
  Building wheel for tqdm (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-7o0w6fh3/wheels/41/18/ee/d5dd158441b27965855b1bbae03fa2d8a91fe645c01b419896
Successfully built tqdm
Installing collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.28.1


In [3]:
# Mount Google Drive at /content/drive/ so the dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# Seed NumPy's and TensorFlow's global random number generators with a fixed value.
# NOTE(review): the model cell later calls np.random.seed(0), which re-seeds NumPy.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

In [0]:
def read_caps(fname):
    """Load the video captions into a dataframe.

    Each non-blank line of `fname` is expected to hold a video name followed
    by whitespace and then the caption.

    Parameters
    ----------
    fname : str
        Path to the captions text file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with columns 'video' (first token of the
        line) and 'caption' (the remainder of the line, '' when absent).
    """
    videos = []
    captions = []
    with open(fname) as f:
        for line in f:
            # Split on the first run of whitespace only, so a caption that
            # itself contains spaces is kept whole rather than cut to one word.
            parts = line.strip().split(maxsplit=1)
            if not parts:
                # Blank line: nothing to record.
                continue
            videos.append(parts[0])
            # Keep the row even when the caption is missing so rows stay
            # aligned with any per-video table that is loaded separately.
            captions.append(parts[1] if len(parts) > 1 else '')
    return pd.DataFrame({'video': videos, 'caption': captions})

In [0]:
# Load the dev-set captions (one row per video) from the mounted Drive folder.
caption_file = '/content/drive/My Drive/CA684_Assignment_Data/dev-set/dev-set_video-captions.txt'
df_cap = read_caps(caption_file)


In [0]:
# Load the ground-truth table for the dev set; later cells read its 'video',
# 'short-term_memorability' and 'long-term_memorability' columns.
ground_truth_file = '/content/drive/My Drive/CA684_Assignment_Data/dev-set/dev-set_ground-truth.csv'
ground_truth = pd.read_csv(ground_truth_file)

# Print the loaded table for a quick visual check.
print(ground_truth)


In [8]:
from string import punctuation

# Translation table that maps every ASCII punctuation character to a space.
punct_to_space = str.maketrans(punctuation, ' ' * len(punctuation))

counts = Counter()
cleaned_captions = []
# set up the progress tracker
pbar = pyprind.ProgBar(len(df_cap['caption']), title='Counting word occurrences')
for cap in df_cap['caption']:
    # replace punctuation with spaces and convert the words to lower case
    text = cap.translate(punct_to_space).lower()
    cleaned_captions.append(text)
    counts.update(text.split())
    pbar.update()

# Write the cleaned captions back in a single positional assignment rather than
# one label-based .loc write per row (faster, and independent of the index labels).
df_cap['caption'] = cleaned_captions

Counting word occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


In [9]:
from keras.preprocessing.text import Tokenizer
# build the word index
# len_token is the number of distinct words counted in the cleaned captions.
len_token = len(counts)
# NOTE(review): per the Keras docs, Tokenizer(num_words=N) keeps only the N-1 most
# frequent words when converting texts, so with N equal to the vocabulary size the
# rarest word is dropped from the sequences/matrix. Raising N to len_token + 1 would
# keep every word, but anything sized from the vocabulary (e.g. the Embedding
# input_dim) must then grow with it — confirm before changing.
tokenizer = Tokenizer(num_words=len_token)
print(len_token)


5191


Using TensorFlow backend.


In [10]:
# Fit the tokenizer on the cleaned captions; this builds tokenizer.word_index.
tokenizer.fit_on_texts(list(df_cap.caption.values)) # fit a list of captions to the tokenizer
# The tokenizer vectorizes a text corpus by turning each text into either a sequence
# of integers or a fixed-size vector (both forms are produced in the next cell).

# Number of distinct words the tokenizer indexed.
print(len(tokenizer.word_index))


5191


In [0]:
# Binary matrix form: one row per caption, produced with mode='binary'.
# NOTE(review): one_hot_res is not read by any later cell visible here.
one_hot_res = tokenizer.texts_to_matrix(list(df_cap.caption.values),mode='binary')
# Sequence form: each caption as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(list(df_cap.caption.values))

In [12]:
# Quick sanity checks on `sequences` and `counts`
print(sequences[0]) # word indices for caption 0, 'blonde woman is massaged tilt down'
print(counts['blonde']) # number of occurrences of 'blonde'
n=3
print('Least Common: ', counts.most_common()[:-n-1:-1])       # n least common elements
print('Most Common: ',counts.most_common(n))       # n most common elements

[724, 7, 35, 884, 384, 91]
10
Least Common:  [('unknown', 1), ('handheldshowing', 1), ('longof', 1)]
Most Common:  [('in', 2105), ('the', 1835), ('on', 1724)]


In [0]:
# Fixed length every caption sequence is padded to (hard-coded, not computed from
# the data); the Embedding layer's input_length must match it.
max_len = 50

In [14]:
print(sequences[0]) # word-index sequence of the 1st caption (the sequence itself, not its length)

[724, 7, 35, 884, 384, 91]


In [0]:
# Left-pad every caption's word-index sequence with zeros to a fixed width of
# max_len, giving one row per caption.
X_seq = np.zeros((len(sequences),max_len))
for i, seq in enumerate(sequences):
    if not seq:
        # Empty sequence: report its row index and leave the row all zeros.
        print(i)
        continue
    # Keep only the last max_len tokens so an over-long caption is truncated
    # instead of raising a shape-mismatch error on assignment.
    seq = seq[-max_len:]
    X_seq[i,-len(seq):] = seq

# Regression targets: column 0 is short-term, column 1 is long-term memorability.
Y = ground_truth[['short-term_memorability','long-term_memorability']].values
X = X_seq # sequences

# Rows of X and Y are paired purely by position, so the counts must agree.
# NOTE(review): this assumes the captions file and the ground-truth file list the
# videos in the same order — confirm.
assert len(X) == len(Y), 'captions and ground truth have different numbers of rows'

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test split; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

# Display the shape of the training inputs.
X_train.shape

(4800, 50)

In [17]:
# List the installed packages whose name contains "keras" (case-insensitive), with versions.
!pip list | grep -i keras


Keras                    2.2.4                
Keras-Applications       1.0.7                
Keras-Preprocessing      1.0.9                
keras-vis                0.4.1                


In [18]:
pip install --upgrade keras

Requirement already up-to-date: keras in /usr/local/lib/python3.6/dist-packages (2.2.4)


In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import GRU
from keras.layers import Embedding
from keras.layers import Masking
from keras.layers import LSTM
from keras.layers import Conv2D
from keras.layers import Dropout

np.random.seed(0)

# Caption-to-memorability regressor: word embeddings -> LSTM -> small dense head.
model = Sequential()
# Embedding layer. The tokenizer only emits indices below its num_words cap and
# every caption is padded to max_len, so size the layer from those values
# instead of repeating them as literals.
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=20,
                    input_length=max_len, embeddings_initializer='uniform'))

model.add(LSTM(200, activation='relu', recurrent_initializer='orthogonal'))
# Fully connected layer
model.add(Dense(10, activation='relu'))
# Dropout for regularization
model.add(Dropout(0.5))
# Output layer: two sigmoid units, one per target column (short-term, long-term)
model.add(Dense(2, activation='sigmoid'))
# Compile the model. The task is regression, so track mean absolute error;
# classification accuracy is not meaningful for continuous targets.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


Instructions for updating:
Colocations handled automatically by placer.


  


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [20]:
# Train for 5 epochs, passing the held-out split as validation data.
model.fit(X_train,Y_train,epochs=5,validation_data=(X_test,Y_test))



Instructions for updating:
Use tf.cast instead.
Train on 4800 samples, validate on 1200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd5500844e0>

In [0]:
# Predict both memorability scores for the training rows and for the held-out rows.
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

# Disabled: evaluation of each split with model.evaluate.
# training_error = model.evaluate(X_train, Y_train, verbose = 1)
# print('training error = ' + str(training_error))
# testing_error = model.evaluate(X_test, Y_test, verbose = 1)
# print('testing error = ' + str(testing_error))


In [22]:
def Get_score(Y_pred,Y_true):
    '''Calculate and print Spearman's rank correlation coefficient.

    Both inputs are squeezed first. 1-D inputs are scored directly; for
    inputs with more dimensions each column is scored separately.

    Parameters
    ----------
    Y_pred : array-like
        Predicted values.
    Y_true : array-like
        Reference values, same shape as `Y_pred` after squeezing.

    Returns
    -------
    float, list or None
        The coefficient for 1-D inputs, a list with one entry per column
        otherwise, or None (after printing a message) when the shapes differ.
    '''
    Y_pred = np.squeeze(Y_pred)
    Y_true = np.squeeze(Y_true)
    if Y_pred.shape != Y_true.shape:
        print('Input shapes don\'t match!')
        return None
    if len(Y_pred.shape) == 1:
        Res = pd.DataFrame({'Y_true':Y_true,'Y_pred':Y_pred})
        score_mat = Res[['Y_true','Y_pred']].corr(method='spearman',min_periods=1)
        # Off-diagonal cell of the 2x2 matrix, read with a single positional
        # lookup (avoids integer-key indexing on a label-indexed row).
        score = score_mat.iloc[1, 0]
        print('The Spearman\'s correlation coefficient is: %.3f' % score)
        return score
    # One coefficient per column, in column order.
    return [Get_score(Y_pred[:,ii],Y_true[:,ii]) for ii in range(Y_pred.shape[1])]

# Score the held-out predictions: the first line printed is for column 0
# (short-term), the second for column 1 (long-term).
Get_score(test_predict, Y_test)

The Spearman's correlation coefficient is: 0.413
The Spearman's correlation coefficient is: 0.199


In [23]:
# Predict on the held-out rows (the same call that produced test_predict earlier).
predictions = model.predict(X_test)
# Show the shape of the prediction array.
print(predictions.shape)

(1200, 2)


In [0]:
# train_test_split shuffles the rows, so the held-out rows are NOT the last 1200
# rows of ground_truth. Recover the original row positions of the test split by
# repeating the split (same row count, test_size and random_state) on a plain
# index array.
_, test_idx = train_test_split(np.arange(len(ground_truth)),
                               test_size=0.2, random_state=42)
test_rows = ground_truth.iloc[test_idx]

# Guard: the recovered rows must carry exactly the targets held in Y_test.
assert np.allclose(
    test_rows[['short-term_memorability','long-term_memorability']].values.astype(float),
    np.asarray(Y_test, dtype=float), equal_nan=True), \
    'recovered test rows do not match Y_test'

# One row per held-out video: its name, both predictions and both true scores.
final_results = pd.DataFrame({'video':test_rows['video'].values,
                              'Short-term-pred':predictions[:,0],'Long-term-pred':predictions[:,1],
                              'Short-term-true':Y_test[:,0],
                              'Long-term-true':Y_test[:,1]})

final_results = final_results.reset_index(drop=True)

In [0]:
# Save the results table to Drive as CSV (the row index is written as the first column by default).
final_results.to_csv('/content/drive/My Drive/RNN_results.csv')