In [15]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.preprocessing import LabelEncoder

import pandas as pd, xgboost, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [16]:
# Read the raw dataset from the working directory.
data = pd.read_csv(filepath_or_buffer="./data.csv")

# Replace the header read from the file with the names used in the rest of
# the notebook (the frame must have exactly five columns for this to succeed).
data.columns = [
    "URL",
    "Text",
    "Start-Indices",
    "End-Indices",
    "Text-Type",
]

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,URL,Text,Start-Indices,End-Indices,Text-Type
0,https://stackoverflow.com/questions/37958781,So does that mean it is better than the defaul...,[],[],0
1,https://stackoverflow.com/questions/37958781,It is fundamentally a heuristic based approach...,[],[],0
2,https://stackoverflow.com/questions/37958781,Calling it a heuristic approach is not meant t...,[],[],0
3,https://stackoverflow.com/questions/37958781,"The text in question was Moby Dick, and the od...",[],[],0
4,https://stackoverflow.com/questions/37958783,A table containing only debit and credit colum...,[],[],0


In [17]:
import csv
import nltk
import re
from nltk.corpus import wordnet
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [18]:
# Two-column working frame: the raw text and its class label, taken from the
# "Text" and "Text-Type" columns of `data` (row index is carried over).
trainDF = pd.DataFrame(
    {
        'text': data["Text"],
        'label': data["Text-Type"],
    }
)

In [19]:
# Hold out a validation split (scikit-learn's default proportion). The seed is
# fixed so that Restart & Run All reproduces the same partition, and therefore
# the same scores, on every run.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], random_state=42
)

In [20]:
# Label-encode the target variable.
# The encoder is fitted on the training labels only and then *applied* to the
# validation labels, so both splits share one label -> integer mapping.
# Re-fitting on the validation labels (fit_transform) could assign different
# integers to the same class whenever the two splits contain different label sets.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# Raises ValueError if the validation split contains a label never seen in training.
valid_y = encoder.transform(valid_y)

In [21]:
# Bag-of-words counts over word tokens (any run of one or more \w characters).
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

# Learn the vocabulary from the training split only, so the feature space is
# not shaped by validation text; validation rows are then projected onto that
# vocabulary (tokens unseen in training are ignored).
xtrain_count = count_vect.fit_transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [22]:
import tensorflow as tf
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training features, passed unchanged to ``classifier.fit``.
    label
        Training targets, passed unchanged to ``classifier.fit``.
    feature_vector_valid
        Validation features, passed unchanged to ``classifier.predict``.
    is_neural_net : bool, default False
        When true, the output of ``predict`` is treated as scores rather than
        class labels and is converted to labels before scoring.
    valid_label : optional
        True labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``valid_y`` is used, which keeps existing calls working.

    Returns
    -------
    The value returned by ``metrics.accuracy_score``.
    """
    if valid_label is None:
        # Backward-compatible fallback to the notebook-level validation labels.
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        scores = np.asarray(predictions)
        if scores.ndim > 1 and scores.shape[-1] > 1:
            # One score per class: pick the highest-scoring class.
            predictions = scores.argmax(axis=-1)
        else:
            # Single sigmoid unit (shape (n, 1) or (n,)): argmax over a
            # length-1 axis is always 0, so threshold the score instead.
            predictions = (scores.reshape(-1) > 0.5).astype(int)

    # accuracy_score expects (y_true, y_pred).
    return metrics.accuracy_score(valid_label, predictions)

In [None]:
# Baseline: logistic regression trained on the count-vector features.
accuracy = train_model(
    classifier=linear_model.LogisticRegression(max_iter=1000),
    feature_vector_train=xtrain_count,
    label=train_y,
    feature_vector_valid=xvalid_count,
)
print(accuracy)

In [None]:
################################################################################################################################
# TF-IDF features at three granularities. Every vectorizer learns its
# vocabulary and IDF weights from the training split only, so no statistics
# from the validation text leak into the features; the validation split is
# then transformed with the already-fitted vectorizer.

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
xtrain_tfidf = tfidf_vect.fit_transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# ngram level tf-idf (word 2-grams and 3-grams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf (character 2-grams and 3-grams)
# token_pattern is only used when analyzer == 'word'; passing it with
# analyzer='char' has no effect on the features, so it is omitted here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

In [None]:
def create_model_architecture(input_size):
    """Build and compile a one-hidden-layer feed-forward binary classifier.

    The network takes a sparse input of width ``input_size``, passes it
    through a 100-unit ReLU layer and ends in a single sigmoid unit. It is
    compiled with a default Adam optimizer and binary cross-entropy loss.

    Returns the compiled Keras model.
    """
    sparse_input = layers.Input(shape=(input_size, ), sparse=True)

    # input -> Dense(100, relu) -> Dense(1, sigmoid)
    activations = layers.Dense(units=100, activation="relu")(sparse_input)
    probability = layers.Dense(units=1, activation="sigmoid")(activations)

    network = models.Model(inputs=sparse_input, outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer=optimizers.Adam())
    return network

# classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])

# accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
# print ("NN, Ngram Level TF IDF Vectors",accuracy)

In [None]:
# k-nearest-neighbours (k=5) on the n-gram TF-IDF features, evaluated on a
# 20% hold-out carved out of the training split.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fixed seed so the hold-out split, and the report printed below, is
# reproducible across kernel restarts.
X_train, X_test, y_train1, y_test = train_test_split(
    xtrain_tfidf_ngram, train_y, test_size=0.20, random_state=42
)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train1)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
###############################################################################################################################
# import sys
# !{sys.executable} -m pip install gensim

In [None]:
# import gensim

# sentence_corpus=[]
# for sentence in trainDF["text"]:
#     sentence_corpus.append(nltk.word_tokenize(sentence))
# print(sentence_corpus[0])

# model = gensim.models.Word2Vec(sentence_corpus, min_count=1,vector_size=300,workers=4)