In [53]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import decomposition, ensemble

import pandas
import xgboost
import numpy
import textblob, string

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [174]:
# load the dataset
data = open('corpus',encoding='utf-8').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(content[1:])

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [175]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

valid_y[:10]

array([0, 1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [176]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',tokenizer=lambda doc: doc, lowercase=False)
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [177]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000,tokenizer=lambda doc: doc, lowercase=False)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
xtrain_tfidf

<7500x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 361188 stored elements in Compressed Sparse Row format>

In [173]:


# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000,lowercase=False)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000,tokenizer=lambda doc: doc, lowercase=False)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

TypeError: expected string or bytes-like object

In [189]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec', encoding='utf-8')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [89]:
trainDF['char_count'] = trainDF['text'].apply(lambda x: len("".join(x)))
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x if wrd.istitle()]))
trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x if wrd.isupper()]))

In [90]:
trainDF.head()

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
0,"[Stuning, even, for, the, non-gamer:, This, so...",__label__2,347,80,4.283951,0,10,3
1,"[The, best, soundtrack, ever, to, anything.:, ...",__label__2,413,97,4.214286,0,7,3
2,"[Amazing!:, This, soundtrack, is, my, favorite...",__label__2,632,129,4.861538,1,24,4
3,"[Excellent, Soundtrack:, I, truly, like, this,...",__label__2,626,118,5.260504,2,52,4
4,"[Remember,, Pull, Your, Jaw, Off, The, Floor, ...",__label__2,395,87,4.488636,0,30,0


In [92]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [93]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

In [94]:
# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

In [95]:
trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
c = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

In [101]:
trainDF.head(50)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,"[Stuning, even, for, the, non-gamer:, This, so...",__label__2,347,80,4.283951,0,10,3,0,0,0,0,0
1,"[The, best, soundtrack, ever, to, anything.:, ...",__label__2,413,97,4.214286,0,7,3,0,0,0,0,0
2,"[Amazing!:, This, soundtrack, is, my, favorite...",__label__2,632,129,4.861538,1,24,4,0,0,0,0,0
3,"[Excellent, Soundtrack:, I, truly, like, this,...",__label__2,626,118,5.260504,2,52,4,0,0,0,0,0
4,"[Remember,, Pull, Your, Jaw, Off, The, Floor, ...",__label__2,395,87,4.488636,0,30,0,0,0,0,0,0
5,"[an, absolute, masterpiece:, I, am, quite, sur...",__label__2,684,142,4.783217,0,14,3,0,0,0,0,0
6,"[Buyer, beware:, This, is, a, self-published, ...",__label__1,600,139,4.285714,0,16,4,0,0,0,0,0
7,"[Glorious, story:, I, loved, Whisper, of, the,...",__label__2,418,105,3.943396,0,13,6,0,0,0,0,0
8,"[A, FIVE, STAR, BOOK:, I, just, finished, read...",__label__2,422,103,4.057692,1,15,13,0,0,0,0,0
9,"[Whispers, of, the, Wicked, Saints:, This, was...",__label__2,239,63,3.734375,0,8,2,0,0,0,0,0


In [104]:
x='''function to check and get the part of speech tag count of a words in a given sentence'''
wiki = textblob.TextBlob(x)
wiki

TextBlob("function to check and get the part of speech tag count of a words in a given sentence")

In [105]:
wiki.tags

[('function', 'NN'),
 ('to', 'TO'),
 ('check', 'VB'),
 ('and', 'CC'),
 ('get', 'VB'),
 ('the', 'DT'),
 ('part', 'NN'),
 ('of', 'IN'),
 ('speech', 'NN'),
 ('tag', 'NN'),
 ('count', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('words', 'NNS'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('given', 'VBN'),
 ('sentence', 'NN')]

In [116]:
from textblob.translate import Translator
t = Translator()
t.translate('hello', from_lang='en', to_lang='fr')

'Bonjour'

In [119]:
t.detect("hola")

'es'

In [122]:
#Topic Modelling - #Train a LDA model

lda_model = decomposition.LatentDirichletAllocation(n_components=20,learning_method='online', max_iter=20)

X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

In [123]:
# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
topic_summaries

["memory drivers window inspiring recommended. seeking fate God's Sam Buddy",
 'the and a to I of is this it in',
 'door socks breaking BD Would rule again! presenting individuals happens.',
 'Sony money, Lola enough, taste whom Being Tomb original, Run',
 'DVD: cake mark pulling edges Truth 11 Christmas, Jewish Korean',
 'dialogue herself tree network dialog actors. video, training print. thriller',
 'upon much, Clan Despite sections Auel manual shocked said. directed',
 'installed blade airbed Super leak offered Emma cedar Dear leaks',
 'battery computer charger power laptop adapter cord charge Windows wireless',
 'rice fun. Pretty Catholic character. Classic reason. cooker techno Kate',
 'u Nice instead. World dishes better: collar Jackson selections combat',
 'film young film. of performance John Manson his political cast',
 'Totally cincher right? Small charm. ive praising Metal sea Firewire',
 'beat solid "real" downloading Whitney rock. involving marry Asimov\'s raider',
 'game.

In [124]:
#LDA explanation in python

doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5]
doc_complete

['Sugar is bad to consume. My sister likes to have sugar, but not my father.',
 'My father spends a lot of time driving my sister around to dance practice.',
 'Doctors suggest that driving may cause increased stress and blood pressure.',
 'Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.',
 'Health experts say that Sugar is not good for your lifestyle.']

In [145]:
#Cleaning and Preprocessing
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]
doc_clean

[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'],
 ['father',
  'spends',
  'lot',
  'time',
  'driving',
  'sister',
  'around',
  'dance',
  'practice'],
 ['doctor',
  'suggest',
  'driving',
  'may',
  'cause',
  'increased',
  'stress',
  'blood',
  'pressure'],
 ['sometimes',
  'feel',
  'pressure',
  'perform',
  'well',
  'school',
  'father',
  'never',
  'seems',
  'drive',
  'sister',
  'better'],
 ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']]

In [146]:
# Preparing Document-Term Matrix
import gensim
from gensim import corpora

dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
 [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(2, 1),
  (4, 1),
  (18, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(5, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]]

In [147]:
# Running LDA Model

Lda = gensim.models.ldamodel.LdaModel

ldamodel = Lda(doc_term_matrix,num_topics=3, id2word = dictionary, passes=50)


In [148]:
#result
ldamodel.print_topics(num_topics=3, num_words=3)

[(0, '0.064*"sugar" + 0.064*"sister" + 0.064*"father"'),
 (1, '0.045*"pressure" + 0.045*"father" + 0.045*"sister"'),
 (2, '0.029*"sugar" + 0.029*"father" + 0.029*"sister"')]

In [149]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    
    classifier.fit(feature_vector_train,label)
    
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=1)
        
    return metrics.accuracy_score(predictions,valid_y)

In [155]:
#Naive bayes 

#naive bayes on Count Vectors

accuracy = train_model(naive_bayes.MultinomialNB(),xtrain_count,train_y,xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

NB, Count Vectors:  0.8304
NB, WordLevel TF-IDF:  0.8356


In [158]:
#Linear Classifier - Logistic Regression
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, count vectors",accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

LR, count vectors 0.8496
LR, WordLevel TF-IDF:  0.8396


In [165]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM, N-Gram Vectors: ", accuracy)


SVM, N-Gram Vectors:  0.5004


In [166]:
#Bagging Model 
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.724
RF, WordLevel TF-IDF:  0.724


In [167]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print( "Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)


  if diff:


Xgb, Count Vectors:  0.7632
Xgb, WordLevel TF-IDF:  0.7684


  if diff:


In [168]:
xtrain_count.tocsc()

<7500x78684 sparse matrix of type '<class 'numpy.int64'>'
	with 459720 stored elements in Compressed Sparse Column format>

In [186]:
#shallow Neural Networks

def create_model_architecture(input_size):
    #create input layer
    input_layer = layers.Input((input_size, ), sparse=True)
    
    #create hidden layer
    hidden_layer = layers.Dense(5000, activation="relu")(input_layer)
    
    #create output layer
    output_layer = layers.Dense(1,activation="sigmoid")(hidden_layer)
    
    classifier = models.Model(inputs=input_layer, outputs=output_layer)
    
    classifier.compile(optimizer=optimizers.Adam(),loss='binary_crossentropy')

    return classifier

classifier = create_model_architecture(xtrain_tfidf.shape[1])
accuracy = train_model(classifier,xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True)
print ("NN, Ngram Level TF IDF Vectors",  accuracy)

Epoch 1/1
NN, Ngram Level TF IDF Vectors 0.51


In [193]:
#deep neural network

def create_cnn():
    #Add input layer
    input_layer = layers.Input((70,))
    
    #Add the word embedding layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    #Add the convolutional layer
    conv_layer = layers.Convolution1D(100,3, activation="relu")(embedding_layer)
    
    #Add the pooling layer
    pooling_layer = layers.GlobalMaxPooling1D()(conv_layer)
    
    #Add the output layer
    output_layer1 = layers.Dense(50,activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer_2 = layers.Dense(1, activation="sigmoid")(output_layer1)
    
    #compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer_2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier=create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)


Epoch 1/1
CNN, Word Embeddings 0.51


In [194]:
#Recurrent Neural Network – LSTM
def create_rnn_lstm():
    #add an input layer
    input_layer = layers.Input((70,))
    
    #add the word embedding layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    #Add LStM layer
    lstm_layer  = layers.LSTM(100)(embedding_layer)
    
    #Add the output layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)
    
    #compile the Model
    model = models.Model(inputs=input_layer, outputs = output_layer2)
    model.compile(optimizer=optimizers.Adam(),loss='binary_crossentropy')
    
    return model

classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("RNN-LSTM, Word Embeddings",  accuracy)

Epoch 1/1
RNN-LSTM, Word Embeddings 0.51


In [195]:
def create_rnn_gru():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_gru()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("RNN-GRU, Word Embeddings",  accuracy)

Epoch 1/1
RNN-GRU, Word Embeddings 0.51


In [196]:
def create_bidirectional_rnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_bidirectional_rnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("RNN-Bidirectional, Word Embeddings",  accuracy)

Epoch 1/1
RNN-Bidirectional, Word Embeddings 0.51


In [197]:
def create_rcnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rcnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

Epoch 1/1
CNN, Word Embeddings 0.51
