In [1]:
#pip install tensorflow-gpu
import pandas as pd
import nltk
import numpy as np
import gensim
import tensorflow as tf
from keras import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils
from keras import regularizers
from keras import optimizers
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
from keras import backend

#backend.tensorflow_backend._get_available_gpus()


def load_data(path):
    """
    Loading the data into a dataframe
    
    Input
    path: path to the test data(String)
    
    Output
    train_data: return a pandas Dataframe
    """
    train_data=pd.read_csv(path)
    print(train_data.head())
    return train_data

#referenced from https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python
def show_wordcloud(data, title = None):
    """
    depicting wordclouds of the input data
    
    Input
    data: input pandas Dataframe
    """
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

def preprocessor(text):
    """
    Tokenizing the sentences using regular expressions and NLTK library
    
    Input
    text: list of descriptions
    
    Output:
    alphabet_tokens: list of tokens
    """
    __tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

    ## call it using tokenizer.tokenize
    tokenizer = nltk.tokenize.regexp.RegexpTokenizer(__tokenization_pattern)
    tokens = tokenizer.tokenize(text)
    tokens=[token.lower() for token in tokens if token.isalpha()]
    alphabet_tokens = [token for token in tokens if token.isalpha()]
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    #non_stopwords = [word for word in alphabet_tokens if not word in en_stopwords]
    #stemmer = nltk.stem.snowball.SnowballStemmer("english")
    #stems = [str(stemmer.stem(word)) for word in non_stopwords]

    return list(alphabet_tokens)



def transform_data(data):
    """
    Factorizing the simplified lithologies into numerical equivalents
    
    Input
    data: input pandas dataframe
    
    Output
    tuple containing the transformed data
    """
    train_data['EssayText']=train_data['EssayText'].replace(np.nan,'',regex=True)
    train_data['EssayText'] = train_data['EssayText'].apply(preprocessor)
    list_of_descriptions=train_data['EssayText'].tolist()
    return list_of_descriptions


def generate_embeddings(list_of_descriptions):
    """
    Generating word2vec(vectorized version of each word) model from the vocabulary in the data
    
    Input
    list_of_descriptions: transformed descriptions
    list_of_simple_lithology: transformed simple lithologies
    
    Output
    model: Gensim word2vec model

    """
    data=[]
    for x in list_of_descriptions:
        temp=[]
        if(isinstance(x,list)):
            for y in x:
                temp.append(y.lower())
            data.append(temp)
    model=gensim.models.FastText(data,min_count=1,size=100,window=3)
    return model

def split_data(train_data):
    """
    Splitting the data into train and test
    
    Input
    train_data: Pandas dataframe
    
    Output
    tuple containing train and test data 
    """
    msk = np.random.rand(len(train_data)) < 0.75
    train_X = train_data.EssayText[msk]
    test_X = train_data.EssayText[~msk]
    y=train_data['avg_score']
    train_y = y[msk]
    test_y = y[~msk]

    return (train_X,train_y,test_X,test_y)


def tokenize_input_data(train_X,test_X):
    """
    Indexing each token in the descriptions
    
    Input
    train_X: list of input descriptions
    test_X : list of input descriptions
    
    Output
    Tuple containing indexed versions of the inputs
    """
    tokenizer_x=Tokenizer(num_words=3000)    
    tokenizer_x.fit_on_texts(train_X)
    train_X_transformed=tokenizer_x.texts_to_sequences(train_X)
    test_X_transformed=tokenizer_x.texts_to_sequences(test_X)
    return (train_X_transformed,test_X_transformed,tokenizer_x)

def label_to_id(train_y,test_y):
    """
    Indexing each label in the target(simplified lithology)
    
    Input
    train_y: list of labels
    test_y: list of labels
    
    Output
    tuple containing indexed versions of the input
    """
    train_y_transformed=utils.to_categorical(train_y.tolist(),14,dtype='int')
    test_y_transformed=utils.to_categorical(test_y.tolist(),14,dtype='int')
    return (train_y_transformed,test_y_transformed)


def pad_sentences(train_X,test_X,maxlen):
    """
    Adding padding to the descriptions so that each description is of the same length(maxlen)
    
    Input
    train_X: list of descriptions
    test_X: list of descriptions
    maxlen: int (maximum length of the descriptions)
    
    Output
    Tuple containing transformed versions of the input
    """
    train_X_transformed= pad_sequences(train_X, padding='post', maxlen=maxlen)
    test_X_transformed= pad_sequences(test_X, padding='post', maxlen=maxlen)
    return (train_X_transformed,test_X_transformed)
    

def create_embedding_matrix(model,tokenizer):
    """
    Creating an embedding matrix to be fed into the neural network
    
    Input
    model: gensim word2vec model
    
    embedding_matrix: matrix depicting the embeddings
    """
    embedding_matrix=np.zeros((len(model.wv.vocab),100))
    for x,y in model.wv.vocab.items():
        if x in tokenizer.word_counts.keys():
            embedding_matrix[tokenizer.word_index[x]]=np.array(model.wv[x], dtype=np.float32)[:100]
    
    return embedding_matrix


def define_learning_model(model,embedding_matrix,maxlen):
    """
    Describing the deep learning model using Keras
    
    Input
    model:gensim word2vec model
    embedding_matrix: matrix of embeddings
    maxlen: maximum length of sentences
    
    Output
    lstm_model: deep learning model
    """
    lstm_model=Sequential()
    lstm_model.add(layers.Embedding(len(model.wv.vocab), 100, 
                               weights=[embedding_matrix],
                               input_length=maxlen,
                               trainable=False))
    lstm_model.add(layers.LSTM(100))
    #model.add(layers.Dropout(0.3))
    #model.add(layers.LSTM(100,activation='tanh',recurrent_activation='sigmoid'))
    lstm_model.add(layers.Dropout(0.3))

    #model.add(layers.GlobalAveragePooling1D())
    lstm_model.add(layers.Dense(1,activation='linear'))
    #model.add(layers.Flatten())
    adam=optimizers.Adam(lr=0.001)
    lstm_model.compile(optimizer=adam,
                  loss='mse',
                  metrics=['accuracy'])
    lstm_model.summary()
    return lstm_model

def calculate_accuracy(train_X,train_y,model):
    """
    Calculating the accuracy of the model.
    
    Input
    train_X: list of descriptions
    train_y: list of labels
    
    Output:
    history: model after fitting the data
    
    """
    msk=np.random.randn(len(train_X))<0.75
    validation_data_X=train_X[~msk]
    validation_data_Y=train_y[~msk]
    history = model.fit(train_X[msk],train_y[msk],
                        epochs=10,
                        verbose=2,
                       validation_data=(validation_data_X,validation_data_Y))
    loss, accuracy = model.evaluate(train_X, train_y, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(test_X, test_y, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    
    return history

#used as reference from https://www.tensorflow.org/tutorials/keras/basic_text_classification
def plot_loss(model):
    """
    Plot the training and validation loss w.r.t epochs
    
    Input
    model: deep learning model
    """
    history_dict = history.history
    history_dict.keys()
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']

    epochs = range(1, len(loss) + 1)

    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    # b is for "solid blue line"
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_data=load_data('../Q2/train_dataset.csv')

   ID  Essayset  min_score  max_score  score_1  score_2  score_3  score_4  \
0   1       1.0          0          3        1        1      1.0      1.0   
1   2       1.0          0          3        1        1      NaN      1.5   
2   3       1.0          0          3        1        1      1.0      1.0   
3   4       1.0          0          3        0        0      0.0      0.0   
4   5       1.0          0          3        2        2      2.0      2.5   

   score_5        clarity       coherent  \
0      1.0        average          worst   
1      1.0      excellent          worst   
2      1.5          worst  above_average   
3      1.0          worst          worst   
4      1.0  above_average          worst   

                                           EssayText  
0  Some additional information that we would need...  
1  After reading the expirement, I realized that ...  
2  What you need is more trials, a control set up...  
3  The student should list what rock is better an...

In [3]:
train_data['avg_score']=train_data[['score_1','score_2','score_3','score_4','score_5']].mean(axis=1)

In [4]:
train_data['clarity']=pd.Categorical(train_data['clarity'])
train_data['clarity']=train_data['clarity'].cat.codes

In [5]:
train_data['coherent']=pd.Categorical(train_data['coherent'])
train_data['coherent']=train_data['coherent'].cat.codes

In [6]:
from sklearn import svm

In [7]:
# SVM for using features apart from the essay to calculate the final score
msk = np.random.rand(len(train_data)) < 0.75
train_X_svm = train_data[['clarity','coherent']][msk]
test_X_svm = train_data[['clarity','coherent']][~msk]
y=train_data['avg_score']
train_y_svm = y[msk]
test_y_svm = y[~msk]


In [8]:
clf=svm.SVR(C=10)

In [9]:
clf.fit(train_X_svm,train_y_svm)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
from sklearn.metrics import mean_squared_error

In [11]:
y_pred=clf.predict(test_X_svm)
print(mean_squared_error(test_y_svm,y_pred))

0.19544847382324498


In [12]:
list_of_essays=transform_data(train_data)

In [13]:
embedding_model=generate_embeddings(list_of_essays)

In [14]:
train_X,train_y,test_X,test_y=split_data(train_data)

In [15]:
train_X,test_X,tokenizer=tokenize_input_data(train_X,test_X)

In [16]:
train_X,test_X=pad_sentences(train_X,test_X,100)

In [17]:
embedding_matrix=create_embedding_matrix(embedding_model,tokenizer)

In [18]:
ml_model=define_learning_model(embedding_model,embedding_matrix,100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1536600   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,617,101
Trainable params: 80,501
Non-trainable params: 1,536,600
_________________________________________________________________


In [19]:
history=calculate_accuracy(train_X,train_y,ml_model)

Train on 9926 samples, validate on 2864 samples
Epoch 1/10
 - 69s - loss: 0.5149 - acc: 0.1573 - val_loss: 0.4197 - val_acc: 0.1432
Epoch 2/10
 - 83s - loss: 0.3884 - acc: 0.1905 - val_loss: 0.3423 - val_acc: 0.1878
Epoch 3/10
 - 62s - loss: 0.3557 - acc: 0.1968 - val_loss: 0.3421 - val_acc: 0.1920
Epoch 4/10
 - 66s - loss: 0.3337 - acc: 0.2011 - val_loss: 0.3114 - val_acc: 0.1997
Epoch 5/10
 - 64s - loss: 0.3175 - acc: 0.2089 - val_loss: 0.3233 - val_acc: 0.1858
Epoch 6/10
 - 65s - loss: 0.3057 - acc: 0.2104 - val_loss: 0.3282 - val_acc: 0.2025
Epoch 7/10
 - 63s - loss: 0.2925 - acc: 0.2169 - val_loss: 0.3361 - val_acc: 0.1910
Epoch 8/10
 - 66s - loss: 0.2841 - acc: 0.2206 - val_loss: 0.2807 - val_acc: 0.2109
Epoch 9/10
 - 62s - loss: 0.2670 - acc: 0.2257 - val_loss: 0.2918 - val_acc: 0.1994
Epoch 10/10
 - 63s - loss: 0.2599 - acc: 0.2273 - val_loss: 0.2703 - val_acc: 0.2088
Training Accuracy: 0.2292
Testing Accuracy:  0.2062


In [24]:
test_data=load_data('../Q2/test_dataset.csv')

     ID  Essayset  min_score  max_score        clarity       coherent  \
0  1673         1          0          3        average          worst   
1  1674         1          0          3        average          worst   
2  1675         1          0          3  above_average  above_average   
3  1676         1          0          3          worst          worst   
4  1677         1          0          3          worst          worst   

                                           EssayText  
0  The procedures I think they should have includ...  
1  In order to replicate this experiment, you wou...  
2  In order to replicate their experiment, you wo...  
3  Pleace a simple of one material into one conta...  
4  Determin the mass of four different samples ma...  


In [26]:
test_data['EssayText']=test_data['EssayText'].replace(np.nan,'',regex=True)
test_data['EssayText']=test_data['EssayText'].apply(preprocessor)

In [27]:
test_data['clarity']=pd.Categorical(test_data['clarity'])
test_data['clarity']=test_data['clarity'].cat.codes
test_data['coherent']=pd.Categorical(test_data['coherent'])
test_data['coherent']=test_data['coherent'].cat.codes

In [28]:
score_pred_svm=clf.predict(test_data[['clarity','coherent']])

In [29]:
test=tokenizer.texts_to_sequences(test_data['EssayText'])

In [37]:
test=pad_sequences(test,padding='post',maxlen=100)
print(test.shape)
print(train_X.shape)

(5224, 100)
(12790, 100)


In [38]:
score_pred_lstm=ml_model.predict(test)

In [56]:
final_score=[]
i=0
for (x,y),value in np.ndenumerate(score_pred_lstm):
    final_score.append(0.3*value+0.7*score_pred_svm[i])
    i+=1


In [57]:
test_data['essay_score']=pd.Series(final_score)
test_data['essay_score']=test_data['essay_score'].apply(np.round).astype(int)

In [58]:
print(test_data.head())

     ID  Essayset  min_score  max_score  clarity  coherent  \
0  1673         1          0          3        1         3   
1  1674         1          0          3        1         3   
2  1675         1          0          3        0         0   
3  1676         1          0          3        3         3   
4  1677         1          0          3        3         3   

                                           EssayText  essay_score  
0  [the, procedures, i, think, they, should, have...            0  
1  [in, order, to, replicate, this, experiment, y...            1  
2  [in, order, to, replicate, their, experiment, ...            2  
3  [pleace, a, simple, of, one, material, into, o...            0  
4  [determin, the, mass, of, four, different, sam...            0  


In [59]:
test_data[['ID','Essayset','essay_score']].to_csv('output.csv',index=False,header=['id','essay_set','essay_score'])