In [1]:
from aifeel.util import gen_dataframe, read_corpus
from aifeel.util.preprocess import preprocess_text

# Read the corpora and word lists
negative_corpus, positive_corpus = read_corpus("negative-reviews"), read_corpus("positive-reviews")
negative_words, positive_words = set(read_corpus("negative-words")), set(read_corpus("positive-words"))

# Generate the dataframe and preprocess the reviews
df = gen_dataframe(positive_corpus, negative_corpus, random_state=42)
df["clean_review"] = df["review"].apply(preprocess_text)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit and transform the cleaned reviews
X = vectorizer.fit_transform(df['clean_review'])
y = df['tag']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)





In [3]:
# # Define the hyperparameters
# hyperparameters = {
#     "LogisticRegression": {"C": [0.1, 1, 10, 100]},
#     "DecisionTreeClassifier": {"max_depth": [5, 10, 15, 20]},
#     "RandomForestClassifier": {"n_estimators": [100, 200, 300], "max_depth": [10, 20, 30]},
#     "MultinomialNB": {"alpha": [0.1, 0.5, 1]}
# }

# # Define the models
# models = {
#     "LogisticRegression": LogisticRegression(),
#     "DecisionTreeClassifier": DecisionTreeClassifier(),
#     "RandomForestClassifier": RandomForestClassifier(),
#     "MultinomialNB": MultinomialNB()
# }

# # Perform hyperparameter tuning for each model
# for model_name, model in models.items():
#     clf = GridSearchCV(model, hyperparameters[model_name], cv=5)
#     clf.fit(X_train, y_train)
#     print(f"Best parameters for {model_name}: {clf.best_params_}")
#     y_pred = clf.predict(X_test)
#     print(f"Classification report for {model_name}:\n")
#     print(classification_report(y_test, y_pred))


best paramter: 
Best parameters for LogisticRegression: {'C': 10} <br>
Best parameters for DecisionTreeClassifier: {'max_depth': 20} <br>
Best parameters for RandomForestClassifier: {'max_depth': 30, 'n_estimators': 300} <br>


In [4]:
models = {
    "LogisticRegression": LogisticRegression(C=10),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=20),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=300, max_depth=30),
    "MultinomialNB": MultinomialNB(alpha=1)  # Assuming alpha=1 is the best parameter for MultinomialNB
}

# Train the models and generate a classification report for each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification report for LogisticRegression:

              precision    recall  f1-score   support

           0       0.90      0.91      0.91      4003
           1       0.91      0.90      0.91      3997

    accuracy                           0.91      8000
   macro avg       0.91      0.91      0.91      8000
weighted avg       0.91      0.91      0.91      8000

Classification report for DecisionTreeClassifier:

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      4003
           1       0.88      0.78      0.82      3997

    accuracy                           0.83      8000
   macro avg       0.84      0.83      0.83      8000
weighted avg       0.84      0.83      0.83      8000

Classification report for RandomForestClassifier:

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      4003
           1       0.92      0.82      0.87      3997

    accuracy                     

In [6]:


def predict_review(model, reviews):
    # Transform the reviews to tf-idf vectors
    X = vectorizer.transform(reviews)
    # Predict the sentiment
    y_pred = model.predict(X)
    # Get the probabilities
    y_prob = model.predict_proba(X)
    # Print the predictions and probabilities
    for review, pred, prob in zip(reviews, y_pred, y_prob):
        sentiment = 'Positive' if pred == "1" else 'Negative'
        print(f"Review: {review} ({sentiment})\nProbability (Negative, Positive): {prob}\n")

In [12]:
reviews = ["lol bro that's awesome, where can i buy that shoes, tell me where bro ?"]
# Predict the sentiment and probabilities for each model
for model_name, model in models.items():
    print(f"{model_name} Predictions:")
    predict_review(model, reviews)

LogisticRegression Predictions:
Review: lol bro that's awesome, where can i buy that shoes, tell me where bro ? (Negative)
Probability (Negative, Positive): [0.89802956 0.10197044]

DecisionTreeClassifier Predictions:
Review: lol bro that's awesome, where can i buy that shoes, tell me where bro ? (Negative)
Probability (Negative, Positive): [0.76603483 0.23396517]

RandomForestClassifier Predictions:
Review: lol bro that's awesome, where can i buy that shoes, tell me where bro ? (Negative)
Probability (Negative, Positive): [0.52054229 0.47945771]

MultinomialNB Predictions:
Review: lol bro that's awesome, where can i buy that shoes, tell me where bro ? (Negative)
Probability (Negative, Positive): [0.74515047 0.25484953]

