In [33]:
from sklearn.linear_model import LogisticRegression
import importlib
import ClassifierDataPrepper
importlib.reload(ClassifierDataPrepper)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics, preprocessing
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
import pandas as pd

dataPath = "./"
trainingDataPath = dataPath + "train/"
positiveTrainingDataPath = trainingDataPath + "pos/"
negativeTrainingDataPath = trainingDataPath + "neg/"
# testDataPath = dataPath + "test/"
# positiveTrainingDataPath = trainingDataPath + "pos_small/"
# negativeTrainingDataPath = trainingDataPath + "neg_small/"
testDataPath = None  # set to a directory to also load and tokenize the unlabeled test set


def tokenize_comments(comments, clean_html):
    """Return one list of lower-cased tokens per comment.

    Args:
        comments: iterable of raw comment strings.
        clean_html: callable applied to each comment before tokenizing
            (here ``cdp.cleanhtml``; presumably strips HTML markup -- verify
            against ClassifierDataPrepper).

    Returns:
        A list with one token list per input comment, in input order.
    """
    # str.split() without a separator splits on any run of whitespace, so
    # repeated spaces, tabs and newlines never yield empty-string tokens or
    # tokens glued together by a line break (which split(" ") would produce).
    return [clean_html(comment).lower().split() for comment in comments]


print("Opening training and test files...")
cdp = ClassifierDataPrepper.ClassifierDataPrepper(positiveTrainingDataPath, negativeTrainingDataPath, testDataPath)

print("Preparing data frames...")
X, Y = cdp.getXYlabeled()
if testDataPath is not None:
    Z = cdp.getXtest()

print("Splitting comments into words and removing html tags...")
X_words = tokenize_comments(X, cdp.cleanhtml)

# Z (and therefore Z_words) only exists when a test directory was configured.
if testDataPath is not None:
    Z_words = tokenize_comments(Z, cdp.cleanhtml)

# Materialize the labels as a plain list so they can be indexed alongside X_words.
Y_words = list(Y)

# Later cells pair X_words[i] with Y_words[i]; fail early if they are misaligned.
assert len(X_words) == len(Y_words), "comments and labels must have the same length"

print(len(X_words))
print(len(Y_words))


Opening training and test files...




Preparing data frames...




Splitting comments into words and removing html tags...




20




20




In [34]:
print("training doc2vec...")
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# The documents are built by pairing each token list with its label, so the two
# sequences must line up one-to-one.
if len(X_words) != len(Y_words):
    raise ValueError(
        "X_words and Y_words differ in length: {} vs {}".format(len(X_words), len(Y_words))
    )

# Each comment is tagged with its class label. `tags` has to be a *list* of
# tags: a bare string is iterated character by character, so a label such as
# "10" or "1.0" would silently become several unrelated single-character tags.
documents = [
    TaggedDocument(words=comment, tags=[str(label)])
    for comment, label in zip(X_words, Y_words)
]

# print(documents)
model = Doc2Vec(vector_size=400, min_count=2, epochs=40, workers=8)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# Because the tags are labels rather than per-comment ids, the per-comment
# vectors are obtained by inference from the trained model.
vectorizedComments = [model.infer_vector(comment) for comment in X_words]


training doc2vec...




In [35]:
print("Extracting features from data frames...")
# Load the spaCy 'en' pipeline with the parser and NER components disabled
nlp_spacy = spacy.load('en', disable=['parser', 'ner'])


def lemmatize_comment(raw_comment):
    """Return `raw_comment` as a single string of space-separated lemmas.

    The comment is passed through ``cdp.cleanhtml``, split into sentences with
    NLTK, and each sentence is run through the spaCy pipeline; the lemmas of
    every sentence are joined with spaces, and the sentences are joined with
    spaces in turn.
    """
    cleaned = cdp.cleanhtml(raw_comment)
    lemmatized_sentences = (
        " ".join(token.lemma_ for token in nlp_spacy(sentence))
        for sentence in nltk.sent_tokenize(cleaned)
    )
    return " ".join(lemmatized_sentences)


X_lem = []
idx = 0
for idx, raw_comment in enumerate(X, start=1):
    X_lem.append(lemmatize_comment(raw_comment))
    # Progress report every 2000 comments
    if idx % 2000 == 0:
        print("processed {} comments".format(idx))


Extracting features from data frames...




In [42]:
# Alternative vectorizer configurations kept for reference:
# vect = CountVectorizer(min_df=1, ngram_range=(1, 2), binary=False)
# vect = CountVectorizer(ngram_range=(1, 2))
# vect = TfidfVectorizer(min_df=1, ngram_range=(1, 2))

# Fit the TF-IDF vocabulary (1- to 4-grams seen in at least 5 comments) on the
# lemmatized training comments.
vect = TfidfVectorizer(min_df=5, ngram_range=(1, 4)).fit(X_lem)

# Vectorize the same comments, then rescale every row to unit Euclidean (L2)
# length. TfidfVectorizer already L2-normalizes rows by default (norm='l2'),
# so this extra step is expected to leave the matrix essentially unchanged.
X_lem_token = preprocessing.normalize(vect.transform(X_lem), norm='l2')

# model = MLPClassifier(solver='sgd', alpha=1e-5,
#                       hidden_layer_sizes=(5, 2), random_state=1)

print(X_lem_token.shape)

(20, 87)




In [37]:
# One dense feature row per comment: the doc2vec vector followed by the
# comment's TF-IDF row (converted from sparse to a plain Python list).
lem_doc2vec = [
    doc_vector.tolist() + X_lem_token[row].todense().tolist()[0]
    for row, doc_vector in enumerate(vectorizedComments)
]

print("Finished merging docs")

Finished merging docs




In [38]:
# Sanity check of the merged feature matrix: number of rows, then row width.
# NOTE(review): the width should equal the doc2vec vector_size plus the number
# of TF-IDF columns. The saved output below (481) does not match the TF-IDF
# shape saved with the vectorizer cell (87 columns), which suggests the cells
# were executed out of order -- re-run top to bottom to confirm.
sample_count = len(lem_doc2vec)
print(sample_count)
feature_count = len(lem_doc2vec[0])
print(feature_count)

20




481




In [39]:
from sklearn.model_selection import GridSearchCV

# Candidate values for LogisticRegression's C parameter
param_grid = {'C': [.1, 1, 10, 100]}

# 5-fold cross-validated grid search over the merged doc2vec + TF-IDF features.
# NOTE(review): the doc2vec model (trained with label tags) and the TF-IDF
# vocabulary were both fit on the full dataset before this split, so the
# cross-validation score is likely optimistic -- confirm on held-out data.
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5).fit(
    lem_doc2vec, Y_words
)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)











































Best cross-validation score: 0.70




Best parameters: 

 

{'C': 1}




In [40]:
# print("Vectorizing test set")
# vectorizedComments_Z = []
# for comment in Z_words:
#     vectorizedComments_Z.append(model.infer_vector(comment))


In [41]:
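# import time
# print("Running Model on new data")
# NOTE: `grid` was fit on the merged doc2vec + TF-IDF rows (lem_doc2vec), while
# vectorizedComments_Z holds doc2vec vectors only. Before re-enabling this cell,
# lemmatize the test comments, run them through vect.transform(...), and merge
# them the same way; otherwise the feature counts will not match.
# predictions = grid.best_estimator_.predict(vectorizedComments_Z)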
# 
# f = open("predictions_" + str(int(time.time())) + ".csv", "w")
# f.write("Id,Category\n")
# i = 0
# for prediction in predictions:
#     f.write(str(i) + "," + str(int(prediction)))
#     f.write("\n")
#     i += 1
# f.close()
