In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dill as model_file
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import nltk

from aifeel.util.preprocess import preprocess_text
from aifeel.util.feature_extraction import extract_features, feature_to_vector
from aifeel.util import gen_dataframe, read_corpus
from aifeel.model.nn import NNClassifier
from scipy.sparse import hstack
from sklearn.metrics import classification_report




In [3]:
# Load the trained models and their vectorizers from disk.
# NOTE: dill (like pickle) can execute arbitrary code while deserializing,
# so only load artifacts that come from a trusted source.
with open("export/model/NNClassifier/nn_model_for_challenge.dill", "rb") as f:
    loaded_model = model_file.load(f)

with open("export/model/SVM/svm_model.dill", "rb") as f:
    svm_model = model_file.load(f)

with open("export/model/TFIDFModelClassifier/multinomial_nb_model.dill", "rb") as f:
    multinomial_nb_model = model_file.load(f)


# Vectorizer that belongs to the NN classifier.
with open("export/model/NNClassifier/vectorizer.dill", "rb") as f:
    cv = model_file.load(f)

# Vectorizer that belongs to the multinomial NB classifier.
# Loaded inside a context manager, like the others, so the file handle is
# closed as soon as deserialization finishes.
with open("export/model/TFIDFModelClassifier/vectorizer.dill", "rb") as f:
    vectorizer_for_multi = model_file.load(f)





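Security note: the model files above are deserialized with dill, which (like pickle) can execute arbitrary code on load, so only load files from a trusted source. See:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations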


In [4]:
# Sentiment lexicons, stored as sets for membership tests.
negative_words = set(read_corpus("negative-words"))
positive_words = set(read_corpus("positive-words"))

In [5]:
def vectorizer(review):
    """Transform one review with the loaded vectorizer ``cv``.

    The review is wrapped in a one-element list for ``cv.transform``; the
    result is densified with ``toarray()`` and its first (only) row is
    returned as a plain Python list.
    """
    dense_rows = cv.transform([review]).toarray()
    return dense_rows[0].tolist()

def count_sentiment_words(review, sentiment_words):
    """Count the whitespace-separated tokens of ``review`` found in ``sentiment_words``.

    Every occurrence counts, so a repeated word contributes once per
    appearance. Tokens are compared exactly as split: no case folding and no
    punctuation stripping.
    """
    hits = 0
    for token in review.split():
        if token in sentiment_words:
            hits += 1
    return hits

In [6]:
def save_result_challenge(arr, filename):
    """Write predictions to ``filename`` as one unbroken string of labels.

    Each element of ``arr`` is converted with ``str`` and the pieces are
    concatenated with no separator (e.g. ``[0, 1, 1]`` is written as
    ``011``). An existing file is overwritten.

    Any missing parent directory of ``filename`` is created first, so the
    call does not raise ``FileNotFoundError`` when the output folder has not
    been created yet.

    Args:
        arr: Iterable of labels to write.
        filename: Destination path of the text file.
    """
    arr_string = "".join(map(str, arr))

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when the file goes into the current directory
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(arr_string)

In [7]:
def convet_to_vector(clean_corpus):
    """Convert every text in ``clean_corpus`` into a feature vector.

    Each text goes through ``extract_features`` (given the positive and
    negative lexicons and the notebook's ``vectorizer`` helper), and the
    result is handed to ``feature_to_vector``. Returns a list with one
    entry per input text, in input order.
    """
    return [
        feature_to_vector(
            extract_features(
                text, positive_words, negative_words, vectorizer=vectorizer
            ),
            vectorizer=True,
        )
        for text in clean_corpus
    ]

In [8]:
# Load the challenge texts that the models below will classify.
reviews = read_corpus("challenge_data.txt")

In [9]:
# Check which container type read_corpus returned.
type(reviews)

list

In [18]:
def predict_review_nn(model, reviews):
    """Predict labels for raw reviews using the vectorized feature pipeline.

    Stages, each applied to the whole batch before the next one starts:
    ``preprocess_text``, then ``extract_features`` (with the sentiment
    lexicons and the notebook's ``vectorizer`` helper), then
    ``feature_to_vector``. The resulting vectors are stacked into a NumPy
    array and passed to ``model.predict``.

    Args:
        model: Object exposing ``predict``.
        reviews: Iterable of review texts.

    Returns:
        Whatever ``model.predict`` returns for the feature array.
    """
    cleaned = [preprocess_text(text) for text in reviews]

    features = []
    for text in cleaned:
        features.append(
            extract_features(
                text, positive_words, negative_words, vectorizer=vectorizer
            )
        )

    vectors = [feature_to_vector(feat, vectorizer=True) for feat in features]
    return model.predict(np.array(vectors))

# Classify the challenge reviews with the loaded NN model and show the predictions.
result_nn = predict_review_nn(loaded_model, reviews)
result_nn



array([0, 1, 1, ..., 1, 1, 0])

In [23]:
# Preview the first ten NN predictions (the save call below is left disabled).
result_nn[0:10]
#save_result_challenge(result_nn, "challenge_result/result_nn.txt")

In [13]:
def predict_review_svm(model, reviews):
    """Predict labels for raw reviews using tabular (DataFrame) features.

    Each review is run through ``preprocess_text`` and then
    ``extract_features`` with the sentiment lexicons (no vectorizer is
    passed here). The extracted features are assembled into a
    ``pandas.DataFrame``, which is what ``model.predict`` receives.

    Args:
        model: Object exposing ``predict``.
        reviews: Iterable of review texts.

    Returns:
        Whatever ``model.predict`` returns for the feature frame.
    """
    cleaned = [preprocess_text(text) for text in reviews]
    feature_rows = [
        extract_features(text, positive_words, negative_words) for text in cleaned
    ]
    return model.predict(pd.DataFrame(feature_rows))

# Classify the challenge reviews with the loaded SVM model.
# The reviews are passed as-is; the array conversion below is left disabled.
# review_numpy = np.array(reviews)
results_svm = predict_review_svm(svm_model, reviews)

In [15]:
# Sanity check: how many SVM predictions were produced.
results_svm.shape

(5868,)

In [34]:
def predict_review(model, reviews):
    """Predict labels and class probabilities from vectorizer + lexicon-count features.

    The reviews are transformed by the loaded ``vectorizer_for_multi`` and
    two extra columns are appended to that matrix: the per-review count of
    positive-lexicon tokens and of negative-lexicon tokens, in that order.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``.
        reviews: Sequence of review texts (iterated more than once).

    Returns:
        tuple: ``(model.predict(X), model.predict_proba(X))`` for the
        combined feature matrix ``X``.
    """
    tfidf_matrix = vectorizer_for_multi.transform(reviews)

    def lexicon_counts(lexicon):
        # One count per review, in input order.
        return [count_sentiment_words(text, lexicon) for text in reviews]

    # Two rows (positive, negative) transposed into two columns per review.
    sentiment_counts = np.array(
        [lexicon_counts(positive_words), lexicon_counts(negative_words)]
    ).T

    features = hstack([tfidf_matrix, sentiment_counts])
    return model.predict(features), model.predict_proba(features)

# Classify the challenge reviews with the loaded multinomial NB model.
# predict_review returns (labels, probabilities); element 0 holds the labels.
result_multiNB = predict_review(multinomial_nb_model, reviews)
result_multiNB[0]

array(['0', '1', '1', ..., '1', '1', '0'], dtype='<U1')

In [None]:
# Save each model's predicted labels to its own text file.
# For the multinomial NB result only element 0 (the labels) is written,
# not the probabilities.
save_result_challenge(result_nn, "challenge_result/result_nn.txt")
save_result_challenge(result_multiNB[0], "challenge_result/result_multiNb.txt")
save_result_challenge(results_svm, "challenge_result/result_svm.txt")

In [44]:
# Compare the NN predictions with the multinomial NB predictions.
# Both inputs are model outputs, so this report shows how closely the two
# models agree (with the NN output in the y_true position); it is not an
# accuracy measure against ground-truth labels.
# The NB labels are strings ('0'/'1'), hence the cast to int before comparing.
print(result_nn)
print(result_multiNB[0].astype(int))
report = classification_report(result_nn, result_multiNB[0].astype(int))
print(report)

In [46]:
# Sanity check: how many multinomial NB predictions were produced.
print(result_multiNB[0].shape)