In [29]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import matplotlib.pyplot as plt
import numpy as np
from rich import print
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from aifeel.model.nn import NNClassifier
from aifeel.util import gen_dataframe, read_corpus
from aifeel.util.feature_extraction import extract_features, feature_to_vector
from aifeel.util.preprocess import preprocess_text
from sklearn.linear_model import LogisticRegression

In [31]:
# Read the two review corpora and the two sentiment word lists.
negative_corpus = read_corpus("negative-reviews")
positive_corpus = read_corpus("positive-reviews")
negative_words = set(read_corpus("negative-words"))
positive_words = set(read_corpus("positive-words"))

# Combine both corpora into one frame (seeded for reproducibility) and keep a
# preprocessed copy of every review next to the raw text.
df = gen_dataframe(positive_corpus, negative_corpus, random_state=42)
df["clean_review"] = df["review"].apply(preprocess_text)

# Learn the TF-IDF vocabulary on the cleaned reviews and vectorize them;
# the "tag" column is the classification target.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["clean_review"])
y = df["tag"]

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Fit the logistic-regression classifier on the training split.
model = LogisticRegression(C=0.01)
model.fit(X_train, y_train)

In [32]:
# Persist the fitted artifacts to disk.
from joblib import dump

# `model` is the logistic-regression classifier trained above and `vectorizer`
# is the TF-IDF vectorizer fitted on the same data; both are saved so that
# new text can be transformed and classified later.
dump(model, filename='lr.joblib')
dump(vectorizer, filename='tf-idf.joblib')

['tf-idf.joblib']

In [33]:
from joblib import load

# Restore the persisted classifier and its TF-IDF vectorizer from disk.
# NOTE: the vectorizer is bound to the name `vectorize` (not `vectorizer`).
loaded_model = load(filename='lr.joblib')
vectorize = load(filename='tf-idf.joblib')

In [36]:
def classify_reviews(
    file_name,
    output_file_name='result_testing_challenge.txt',
    model=None,
    vectorizer=None,
):
    """Classify every review in a corpus file and write one prediction per line.

    Args:
        file_name: Corpus identifier passed to ``read_corpus``.
        output_file_name: Path of the text file that receives the predictions,
            one per line, in the same order as the reviews.
        model: Fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``loaded_model``.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``vectorize``.

    Returns:
        The list of preprocessed review texts (not the predictions; those are
        printed and written to ``output_file_name``).
    """
    # Fall back to the artifacts loaded earlier in the notebook so existing
    # calls keep working unchanged.
    if model is None:
        model = loaded_model
    if vectorizer is None:
        vectorizer = vectorize

    # Read the challenge file and preprocess each review.
    reviews = read_corpus(file_name)
    clean_reviews = [preprocess_text(review) for review in reviews]

    if clean_reviews:
        # Turn the cleaned text into feature vectors, then predict.
        features = vectorizer.transform(clean_reviews)
        predictions = model.predict(features)
    else:
        # scikit-learn estimators reject zero-sample input, so an empty corpus
        # simply yields no predictions (and an empty output file).
        predictions = []
    print(predictions)

    # Explicit encoding keeps the output identical across platforms.
    with open(output_file_name, 'w', encoding='utf-8') as output_file:
        output_file.writelines(f"{prediction}\n" for prediction in predictions)

    return clean_reviews


classify_reviews('testing-challeng')

['absolutely wonderful silky sexy comfortable',
 'love dress ! sooo pretty i happened find store i glad i bc i never would ordered online bc petite i bought petite 5 8 i love length me hit little knee would definitely true midi someone truly petite',
 'i high hope dress really wanted work me i initially ordered petite small my usual size i found outrageously small small fact i could NOT_zip NOT_it NOT_up ! i reordered petite medium ok overall top half comfortable fit nicely bottom half tight layer several somewhat cheap net layer imo major design flaw net layer sewn directly zipper c',
 'i love love love jumpsuit fun flirty fabulous ! every time i wear i get nothing great compliment !',
 'shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt ! ! !',
 'i love tracy reese dress one NOT_for NOT_the NOT_very NOT_petite NOT_i NOT_am NOT_just NOT_under NOT_5 NOT_feet NOT_tall NOT_and NOT_usually NOT_wear NOT_a NOT_0p NOT_in NOT_this NO