In [1]:
# from aifeel.util import gen_dataframe, read_corpus
# from aifeel.util.preprocess import preprocess_text
# from aifeel.util.feature_extraction import extract_features

# negative_corpus, positive_corpus = read_corpus("negative-reviews"), read_corpus(
#     "positive-reviews"
# )
# negative_words, positive_words = read_corpus("negative-words"), read_corpus(
#     "positive-words"
# )

# df = gen_dataframe(positive_corpus, negative_corpus, random_state=42)
# df["clean_review"] = df["review"].apply(preprocess_text)

# # Convert lists to sets
# positive_words = set(read_corpus("positive-words"))
# negative_words = set(read_corpus("negative-words"))

# # Apply feature extraction to each review
# df['features'] = df['clean_review'].apply(lambda review: extract_features(review, positive_words, negative_words))

# print(df["features"].head(5))



In [2]:
from aifeel.util import gen_dataframe, read_corpus
from aifeel.util.preprocess import preprocess_text
from aifeel.util.feature_extraction import extract_features
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Single seed shared by every stochastic step in this cell, so that
# "Restart Kernel -> Run All" reproduces the same split and the same
# tuned hyper-parameters.
SEED = 42

# Load the review corpora and the sentiment word lists. The word lists are
# converted to sets before being handed to extract_features.
negative_corpus = read_corpus("negative-reviews")
positive_corpus = read_corpus("positive-reviews")
negative_words = set(read_corpus("negative-words"))
positive_words = set(read_corpus("positive-words"))

# Build the labelled dataframe and add a preprocessed copy of every review.
df = gen_dataframe(positive_corpus, negative_corpus, random_state=SEED)
df["clean_review"] = df["review"].apply(preprocess_text)

# Extract one feature record per cleaned review.
# NOTE(review): presumably each record is a dict of feature name -> value,
# since the records are fed to DictVectorizer below - confirm against
# extract_features.
df['features'] = df['clean_review'].apply(
    lambda review: extract_features(review, positive_words, negative_words)
)

# Vectorise the feature records. The matrix is densified with .toarray(),
# as in the original cell; the estimators below also accept sparse input,
# so verify why the dense form was needed before removing the conversion.
vec = DictVectorizer()
X = vec.fit_transform(df['features'].tolist()).toarray()
y = df['tag']

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Candidate estimators, each paired (by position) with its parameter grid.
# The tree-based estimators draw random numbers while fitting; without a
# fixed random_state the "best" parameters printed below can differ from
# run to run even though the data split itself is seeded.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
]
param_grids = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
    {'max_depth': [10, 20, 30, 40, 50]},
    {'n_estimators': [10, 50, 100, 200], 'max_depth': [10, 20, 30, 40, 50]},
]

# Tune every estimator with a cross-validated grid search (GridSearchCV
# defaults) on the training split only, and report the winning parameters.
for model, param_grid in zip(models, param_grids):
    clf = GridSearchCV(model, param_grid)
    clf.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {clf.best_params_}")


Best parameters for LogisticRegression: {'C': 0.01}
Best parameters for DecisionTreeClassifier: {'max_depth': 10}
Best parameters for RandomForestClassifier: {'max_depth': 10, 'n_estimators': 50}


In [3]:
from sklearn.metrics import classification_report

# Final estimators, configured with the hyper-parameters printed by the
# grid search in the previous cell. The values are copied by hand: if the
# search is re-run and selects different parameters, update them here.
# The tree-based estimators are seeded so that the reports below are
# reproducible; unseeded, their scores can change on every execution.
models_best_params = [
    LogisticRegression(C=0.01),
    DecisionTreeClassifier(max_depth=10, random_state=42),
    RandomForestClassifier(max_depth=10, n_estimators=50, random_state=42),
]

# Fit each estimator on the training split and report per-class precision,
# recall and F1 on the held-out test split.
for model in models_best_params:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Classification report for {model.__class__.__name__}:\n")
    print(classification_report(y_test, y_pred))

Classification report for LogisticRegression:

              precision    recall  f1-score   support

           0       0.80      0.88      0.83      4003
           1       0.86      0.77      0.82      3997

    accuracy                           0.83      8000
   macro avg       0.83      0.83      0.83      8000
weighted avg       0.83      0.83      0.83      8000

Classification report for DecisionTreeClassifier:

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      4003
           1       0.85      0.79      0.82      3997

    accuracy                           0.82      8000
   macro avg       0.83      0.82      0.82      8000
weighted avg       0.83      0.82      0.82      8000

Classification report for RandomForestClassifier:

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      4003
           1       0.85      0.79      0.82      3997

    accuracy                     