<a href="https://colab.research.google.com/github/VISHALGGreat/Machine-Learning-and-Deep-Learning/blob/master/corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Importing the libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [0]:
#Uploading the corpus file
from google.colab import files
uploaded = files.upload()

Saving corpus to corpus (1)


In [0]:
#Load the dataset
data = open('corpus').read()

In [0]:
labels, texts = [], []
data1 = data.split('\n')

In [0]:
for i,line in enumerate(data1):
  content = line.split()
  labels.append(content[0])
  texts.append(" ".join(content[1:]))

In [0]:
# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [0]:
trainDF['label']

0       __label__2
1       __label__2
2       __label__2
3       __label__2
4       __label__2
5       __label__2
6       __label__1
7       __label__2
8       __label__2
9       __label__2
10      __label__1
11      __label__2
12      __label__2
13      __label__1
14      __label__1
15      __label__1
16      __label__2
17      __label__2
18      __label__2
19      __label__1
20      __label__1
21      __label__2
22      __label__1
23      __label__2
24      __label__2
25      __label__1
26      __label__1
27      __label__1
28      __label__1
29      __label__2
           ...    
9970    __label__1
9971    __label__1
9972    __label__1
9973    __label__2
9974    __label__1
9975    __label__1
9976    __label__2
9977    __label__2
9978    __label__2
9979    __label__2
9980    __label__2
9981    __label__2
9982    __label__1
9983    __label__2
9984    __label__1
9985    __label__2
9986    __label__2
9987    __label__2
9988    __label__2
9989    __label__2
9990    __label__2
9991    __la

In [0]:
# split the dataset into training and validation datasets     valid_x default=0.25
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

In [0]:
train_x

2414    A Frankenstein for our Generation: This is my ...
3419    Fables from India Well Presented.: Pancatantra...
7850    Is Musiq losing his touch?: This by far was th...
4483    Bone chilling outdated views: I was just about...
5725    Riveting: One of the best suspense books I hav...
8030    Kyra's Story: This series is about the guilt t...
5982    One of the best stories ever: I recomend this ...
4559    By a true scientist!: An exciting science fict...
5342    Couldn't Watch: I have been looking for this m...
5832    AWESOME: i bought for my dad so he doesn't hav...
3952    Doesn't work in anything I put it in.: I have ...
1809    The worst of all the Future sounds of jazz ser...
2783    helps in understanding children: For a long ti...
3982    Not pleased with Express delivery!: My review ...
4672    A significant change from Noble House Saga: A ...
472     Worst English dub of a great TV series: This s...
476     wrong item: I was going to get a book with a I...
9772    Great 

In [0]:
# label encode the target variable 
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [0]:
valid_y

array([1, 0, 0, ..., 1, 0, 1])

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [0]:
xvalid_count

<2500x31666 sparse matrix of type '<class 'numpy.int64'>'
	with 146860 stored elements in Compressed Sparse Row format>

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [0]:
# utility function used to train a model
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [0]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8428
NB, WordLevel TF-IDF:  0.852
NB, N-Gram Vectors:  0.8376
NB, CharLevel Vectors:  0.8244


In [0]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)



LR, Count Vectors:  0.8596
LR, WordLevel TF-IDF:  0.8684
LR, N-Gram Vectors:  0.836
LR, CharLevel Vectors:  0.8504


In [0]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)



SVM, N-Gram Vectors:  0.5284


In [0]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)



RF, Count Vectors:  0.7564




RF, WordLevel TF-IDF:  0.7708


In [0]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.7944
Xgb, WordLevel TF-IDF:  0.794
Xgb, CharLevel Vectors:  0.8052


In [0]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print ("NN, Ngram Level TF IDF Vectors",  accuracy)

Epoch 1/1
NN, Ngram Level TF IDF Vectors 0.5284
