In [1]:
#pip install tensorflow-gpu
import pandas as pd
import nltk
import numpy as np
import gensim
import tensorflow as tf
from keras import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils
from keras import regularizers
from keras import optimizers
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
from keras import backend

#backend.tensorflow_backend._get_available_gpus()


def load_data(path):
    """
    Loading the data into a dataframe
    
    Input
    path: path to the test data(String)
    
    Output
    train_data: return a pandas Dataframe
    """
    train_data=pd.read_csv(path)
    print(train_data.head())
    return train_data

#referenced from https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python
def show_wordcloud(data, title = None):
    """
    depicting wordclouds of the input data
    
    Input
    data: input pandas Dataframe
    """
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

def preprocessor(text):
    """
    Tokenizing the sentences using regular expressions and NLTK library
    
    Input
    text: list of descriptions
    
    Output:
    alphabet_tokens: list of tokens
    """
    __tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

    ## call it using tokenizer.tokenize
    tokenizer = nltk.tokenize.regexp.RegexpTokenizer(__tokenization_pattern)
    tokens = tokenizer.tokenize(text)
    tokens=[token.lower() for token in tokens if token.isalpha()]
    alphabet_tokens = [token for token in tokens if token.isalpha()]
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    #non_stopwords = [word for word in alphabet_tokens if not word in en_stopwords]
    #stemmer = nltk.stem.snowball.SnowballStemmer("english")
    #stems = [str(stemmer.stem(word)) for word in non_stopwords]

    return list(alphabet_tokens)



def transform_data(train_data):
    """
    Factorizing the simplified lithologies into numerical equivalents
    
    Input
    data: input pandas dataframe
    
    Output
    tuple containing the transformed data
    """
    train_data['Item_Description']=train_data['Item_Description'].replace(np.nan,'',regex=True)
    train_data['Item_Description'] = train_data['Item_Description'].apply(preprocessor)
    train_data['Product_Category'],uniques=pd.factorize(train_data['Product_Category'])
    #train_data['Product_Category']=train_data['Product_Category'].cat.codes
    list_of_descriptions=train_data['Item_Description'].tolist()
    return (list_of_descriptions,uniques)


def generate_embeddings(list_of_descriptions):
    """
    Generating word2vec(vectorized version of each word) model from the vocabulary in the data
    
    Input
    list_of_descriptions: transformed descriptions
    list_of_simple_lithology: transformed simple lithologies
    
    Output
    model: Gensim word2vec model

    """
    data=[]
    for x in list_of_descriptions:
        temp=[]
        if(isinstance(x,list)):
            for y in x:
                temp.append(y.lower())
            data.append(temp)
    model=gensim.models.FastText(data,min_count=1,size=100,window=3)
    return model

def split_data(train_data):
    """
    Splitting the data into train and test
    
    Input
    train_data: Pandas dataframe
    
    Output
    tuple containing train and test data 
    """
    msk = np.random.rand(len(train_data)) < 0.75
    train_X = train_data.Item_Description[msk]
    test_X = train_data.Item_Description[~msk]
    y=train_data['Product_Category']
    train_y = y[msk]
    test_y = y[~msk]

    return (train_X,train_y,test_X,test_y)


def tokenize_input_data(train_X,test_X):
    """
    Indexing each token in the descriptions
    
    Input
    train_X: list of input descriptions
    test_X : list of input descriptions
    
    Output
    Tuple containing indexed versions of the inputs
    """
    tokenizer_x=Tokenizer(num_words=3000)    
    tokenizer_x.fit_on_texts(train_X)
    train_X_transformed=tokenizer_x.texts_to_sequences(train_X)
    test_X_transformed=tokenizer_x.texts_to_sequences(test_X)
    return (train_X_transformed,test_X_transformed,tokenizer_x)

def label_to_id(train_y,test_y):
    """
    Indexing each label in the target(simplified lithology)
    
    Input
    train_y: list of labels
    test_y: list of labels
    
    Output
    tuple containing indexed versions of the input
    """
    train_y_transformed=utils.to_categorical(train_y.tolist(),38,dtype='int')
    test_y_transformed=utils.to_categorical(test_y.tolist(),38,dtype='int')
    return (train_y_transformed,test_y_transformed)


def pad_sentences(train_X,test_X,maxlen):
    """
    Adding padding to the descriptions so that each description is of the same length(maxlen)
    
    Input
    train_X: list of descriptions
    test_X: list of descriptions
    maxlen: int (maximum length of the descriptions)
    
    Output
    Tuple containing transformed versions of the input
    """
    train_X_transformed= pad_sequences(train_X, padding='post', maxlen=maxlen)
    test_X_transformed= pad_sequences(test_X, padding='post', maxlen=maxlen)
    return (train_X_transformed,test_X_transformed)
    

def create_embedding_matrix(model,tokenizer):
    """
    Creating an embedding matrix to be fed into the neural network
    
    Input
    model: gensim word2vec model
    
    embedding_matrix: matrix depicting the embeddings
    """
    embedding_matrix=np.zeros((len(model.wv.vocab),100))
    for x,y in model.wv.vocab.items():
        if x in tokenizer.word_counts.keys():
            embedding_matrix[tokenizer.word_index[x]]=np.array(model.wv[x], dtype=np.float32)[:100]
    
    return embedding_matrix


def define_learning_model(model,embedding_matrix,maxlen):
    """
    Describing the deep learning model using Keras
    
    Input
    model:gensim word2vec model
    embedding_matrix: matrix of embeddings
    maxlen: maximum length of sentences
    
    Output
    lstm_model: deep learning model
    """
    lstm_model=Sequential()
    lstm_model.add(layers.Embedding(len(model.wv.vocab), 100, 
                               weights=[embedding_matrix],
                               input_length=maxlen,
                               trainable=False))
    lstm_model.add(layers.LSTM(100))
    #model.add(layers.Dropout(0.3))
    #model.add(layers.LSTM(100,activation='tanh',recurrent_activation='sigmoid'))
    lstm_model.add(layers.Dropout(0.3))

    #model.add(layers.GlobalAveragePooling1D())
    lstm_model.add(layers.Dense(38,activation='softmax'))
    #model.add(layers.Flatten())
    adam=optimizers.Adam(lr=0.001)
    lstm_model.compile(optimizer=adam,
                  loss='mse',
                  metrics=['accuracy'])
    lstm_model.summary()
    return lstm_model

def calculate_accuracy(train_X,train_y,test_X,test_y,model):
    """
    Calculating the accuracy of the model.
    
    Input
    train_X: list of descriptions
    train_y: list of labels
    
    Output:
    history: model after fitting the data
    
    """
    msk=np.random.randn(len(train_X))<0.75
    validation_data_X=train_X[~msk]
    validation_data_Y=train_y[~msk]
    history = model.fit(train_X[msk],train_y[msk],
                        epochs=10,
                        verbose=2,
                       validation_data=(validation_data_X,validation_data_Y))
    loss, accuracy = model.evaluate(train_X, train_y, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(test_X, test_y, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    
    return (history,accuracy)

#used as reference from https://www.tensorflow.org/tutorials/keras/basic_text_classification
def plot_loss(model):
    """
    Plot the training and validation loss w.r.t epochs
    
    Input
    model: deep learning model
    """
    history_dict = history.history
    history_dict.keys()
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']

    epochs = range(1, len(loss) + 1)

    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    # b is for "solid blue line"
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_data=pd.read_csv('Train.csv')

In [3]:
print(train_data.head())

   Inv_Id Vendor_Code     GL_Code    Inv_Amt  \
0       1   VENDOR-61  GL-6050100   6.973473   
1       2   VENDOR-61  GL-6050100  25.053841   
2       3  VENDOR-449  GL-6050100  53.573737   
3       4  VENDOR-682  GL-6050100  67.388827   
4       5  VENDOR-682  GL-6050100  74.262047   

                                    Item_Description Product_Category  
0  AETNA VARIABLE FUND - Apr-2002 - Store Managem...        CLASS-784  
1  AETNA VARIABLE FUND - Nov-2000 - Store Managem...        CLASS-784  
2  FAIRCHILD CORP - Nov-2001 - Store Management R...        CLASS-784  
3  CALIFORNIA REAL ESTATE INVESTMENT TRUST - Aug-...        CLASS-784  
4  CALIFORNIA REAL ESTATE INVESTMENT TRUST - Mar-...        CLASS-784  


In [4]:
train_data.groupby(('Product_Category')).describe()

Unnamed: 0_level_0,Inv_Amt,Inv_Amt,Inv_Amt,Inv_Amt,Inv_Amt,Inv_Amt,Inv_Amt,Inv_Amt,Inv_Id,Inv_Id,Inv_Id,Inv_Id,Inv_Id,Inv_Id,Inv_Id,Inv_Id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Product_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
CLASS-1042,34.0,60.974592,25.748951,1.248599,41.644535,66.096761,80.621092,98.328181,34.0,697.617647,163.916648,556.0,568.25,581.5,899.75,909.0
CLASS-110,29.0,57.214454,29.463161,1.654231,28.918789,70.081291,78.289335,98.174961,29.0,935.206897,13.243271,914.0,925.0,935.0,948.0,956.0
CLASS-123,26.0,46.571002,26.750018,0.700048,31.596581,48.117384,66.709427,96.044006,26.0,5785.461538,2087.90165,115.0,6512.0,6521.5,6533.75,6544.0
CLASS-177,370.0,48.894782,27.985017,0.080097,24.439381,47.754154,73.085842,99.89031,370.0,395.454054,173.694369,114.0,254.25,379.5,510.75,736.0
CLASS-230,34.0,55.326168,28.631577,1.992731,32.449521,53.825759,76.759353,98.441856,34.0,7952.441176,16.951251,7925.0,7940.25,7950.0,7966.75,7984.0
CLASS-278,50.0,45.298282,24.010621,0.513286,30.048048,45.05226,62.150757,93.941683,50.0,993.32,22.003191,957.0,974.25,991.5,1010.75,1033.0
CLASS-322,107.0,50.695708,27.877681,1.684283,25.408636,50.336449,73.2903,98.886448,107.0,7259.654206,322.693005,6549.0,7068.5,7232.0,7449.0,7790.0
CLASS-323,773.0,51.228272,29.204031,0.14495,25.071323,53.654391,75.803609,99.541335,773.0,7166.063389,363.273634,6545.0,6839.0,7157.0,7488.0,7791.0
CLASS-368,79.0,51.232969,28.352466,2.888632,28.662821,52.136653,74.168341,99.356221,79.0,813.898734,229.573718,33.0,781.5,812.0,839.5,1499.0
CLASS-453,15.0,55.659948,33.572944,4.650694,23.204797,62.504676,84.50738,99.437946,15.0,3796.533333,2990.524748,554.0,748.0,6493.0,6497.5,6503.0


In [5]:
list_of_descriptions,uniques=transform_data(train_data)
print(len(list_of_descriptions))

5719


In [6]:
l=0
for i in list_of_descriptions:
    if len(i)>l:
        l=len(i)
print(l)

28


In [7]:
embedding_model=generate_embeddings(list_of_descriptions)

In [8]:
from sklearn.model_selection import KFold


In [9]:
k=KFold(n_splits=5)
ml_models_cross_valid=dict()
for x,y in k.split(train_data['Item_Description']):
    train_X,test_X=train_data['Item_Description'].values[x],train_data['Item_Description'].values[y]
    train_y,test_y=train_data['Product_Category'].values[x],train_data['Product_Category'].values[y]
    train_X,test_X,tokenizer=tokenize_input_data(train_X,test_X)
    train_X,test_X=pad_sentences(train_X,test_X,28)
    embedding_matrix=create_embedding_matrix(embedding_model,tokenizer)
    ml_model=define_learning_model(embedding_model,embedding_matrix,28)
    train_y,test_y=label_to_id(train_y,test_y)
    history,accuracy=calculate_accuracy(train_X,train_y,test_X,test_y,ml_model)
    ml_models_cross_valid[ml_model]=accuracy
    

    


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 100)           166600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 38)                3838      
Total params: 250,838
Trainable params: 84,238
Non-trainable params: 166,600
_________________________________________________________________
Train on 3495 samples, validate on 1080 samples
Epoch 1/10
 - 8s - loss: 0.0122 - acc: 0.6913 - val_loss: 0.0048 - val_acc: 0.8694
Epoch 2/10
 - 7s - loss: 0.0035 - acc: 0.9093 - val_loss: 0.0031 - val_acc: 0.9352
Epoch 3/10
 - 7s - loss: 0.0026 - acc: 0.9411 - v

In [10]:
test_data=pd.read_csv('Test.csv')

In [11]:
test_data['Item_Description']=test_data['Item_Description'].replace(np.nan,'',regex=True)
test_data['Item_Description'] = test_data['Item_Description'].apply(preprocessor)
    

In [12]:
t_x,t_y,tok_t=tokenize_input_data(test_data['Item_Description'],test_data['Item_Description'])

In [13]:
t_x,t_y=pad_sentences(t_x,t_y,28)
print(t_x.shape)

(2292, 28)


In [15]:
max_acc=0
for x,y in ml_models_cross_valid.items():
    if y>max_acc:
        max_acc=y
        ml_model=x

    

In [16]:
y_pred=ml_model.predict_proba(t_x)

In [17]:
final_op=list()
for x in y_pred:
    final_op.append(uniques[np.argmax(x)])

In [18]:
test_data['Product_Category']=pd.Series(final_op)

In [19]:
print(test_data.head())

   Inv_Id  Vendor_Code     GL_Code    Inv_Amt  \
0       6  VENDOR-1197  GL-6050100  10.916343   
1      12   VENDOR-792  GL-6050100  38.658772   
2      14   VENDOR-792  GL-6050100  46.780476   
3      18   VENDOR-792  GL-6050100   7.058866   
4      19   VENDOR-792  GL-6050100  32.931765   

                                    Item_Description Product_Category  
0  [desoto, inc, store, management, real, estate,...        CLASS-323  
1  [century, realty, trust, store, management, re...        CLASS-323  
2  [century, realty, trust, store, management, re...        CLASS-323  
3  [century, realty, trust, store, management, re...        CLASS-323  
4  [century, realty, trust, store, management, re...        CLASS-323  


In [20]:
test_data[['Inv_Id','Product_Category']].to_csv('output.csv',index=False)