In [1]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from aifeel.util import gen_dataframe, read_corpus
from aifeel.util.preprocess import preprocess_text

# Read the corpora and word lists
negative_corpus, positive_corpus = read_corpus("negative-reviews"), read_corpus("positive-reviews")
negative_words, positive_words = set(read_corpus("negative-words")), set(read_corpus("positive-words"))

# Generate the dataframe and preprocess the reviews
df = gen_dataframe(positive_corpus, negative_corpus, random_state=42)
df["clean_review"] = df["review"].apply(preprocess_text)

In [3]:
df.columns

Index(['tag', 'review', 'clean_review'], dtype='object')

In [4]:
df['clean_review']

0                                                     none
1        color internet ringtones camera w flash charge...
2        small screen low battery life upgraded game me...
3                         near photo perfect print quality
4                                       short battery life
                               ...                        
39995                             view screen pocket sized
39996    style speaker phone voice recorder two way dir...
39997                                                 none
39998                 image quality good zoom good battery
39999                           price good quality picture
Name: clean_review, Length: 40000, dtype: object

In [5]:
import gensim
from gensim.utils import simple_preprocess
import numpy as np
from sklearn.model_selection import train_test_split


  "class": algorithms.Blowfish,


In [6]:
words = [simple_preprocess(review) for review in df['clean_review']]

In [7]:
w2v_model = gensim.models.Word2Vec(words, vector_size=100, window=5, min_count=1, workers=4)

In [8]:
def avg_word2vec(doc, model):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in model.wv.index_to_key]
    
    if len(doc)==0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(model.wv[doc], axis=0)

In [9]:
df['avg_word2vec'] = df['clean_review'].apply(lambda review: avg_word2vec(simple_preprocess(review), w2v_model))

In [10]:
y = df['tag']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df['avg_word2vec'].tolist(), y, test_size=0.20, random_state=42)

In [12]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier

# # Define the models and their parameter grids
# models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier()]
# param_grids = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, 
#                {'max_depth': [10, 20, 30, 40, 50]}, 
#                {'n_estimators': [10, 50, 100, 200], 'max_depth': [10, 20, 30, 40, 50]}
#               ]

# # Train and tune the models
# for model, param_grid in zip(models, param_grids):
#     clf = GridSearchCV(model, param_grid, cv=5)
#     clf.fit(X_train, y_train)
#     print(f"Best parameters for {model.__class__.__name__}: {clf.best_params_}")


Best parametor :
logist regression: 100 <br>
Dt: Max_depth: 10 <br>
RandomForestClassifier: {'max_depth': 30, 'n_estimators': 200} <br>

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [14]:
def train_and_evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, predictions))

In [15]:
models = {
    "Logistic Regression": LogisticRegression(C=100),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=30),
    "SVC": SVC(probability=True),
}

In [16]:
for model_name, model in models.items():
    train_and_evaluate_model(model, model_name, X_train, y_train, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4003
           1       0.88      0.86      0.87      3997

    accuracy                           0.87      8000
   macro avg       0.88      0.87      0.87      8000
weighted avg       0.88      0.87      0.87      8000

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      4003
           1       0.86      0.86      0.86      3997

    accuracy                           0.86      8000
   macro avg       0.86      0.86      0.86      8000
weighted avg       0.86      0.86      0.86      8000

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      4003
           1       0.90      0.88      0.89      3997

    accuracy                           0.89      8000
   macro av

In [24]:
def predict_review(model, reviews, w2v_model):
    clean_reviews = [preprocess_text(review) for review in reviews]
    word_vectors = [avg_word2vec(simple_preprocess(review), w2v_model) for review in clean_reviews]
    predictions = model.predict(word_vectors)
    probabilities = model.predict_proba(word_vectors)
    print(clean_reviews)
    for review, pred, prob in zip(reviews, predictions, probabilities):
        sentiment = 'Positive' if pred == "1" else 'Negative'
        print(f"Review: {review} ({sentiment})\nProbability (Negative, Positive): {prob}\n")

In [27]:
reviews = ["lol bro that's awesome, where can i buy that shoes ?"]

# Predict the probabilities for each model
for model_name, model in models.items():
    print(f"{model_name} Predictions:")
    predict_review(model, reviews, w2v_model)

Logistic Regression Predictions:
['lol bro awesome i buy shoe']
Review: lol bro that's awesome, where can i buy that shoes ? (Negative)
Probability (Negative, Positive): [0.62927843 0.37072157]

Decision Tree Predictions:
['lol bro awesome i buy shoe']
Review: lol bro that's awesome, where can i buy that shoes ? (Negative)
Probability (Negative, Positive): [0.79518072 0.20481928]

Random Forest Predictions:
['lol bro awesome i buy shoe']
Review: lol bro that's awesome, where can i buy that shoes ? (Negative)
Probability (Negative, Positive): [0.625 0.375]

SVC Predictions:
['lol bro awesome i buy shoe']
Review: lol bro that's awesome, where can i buy that shoes ? (Negative)
Probability (Negative, Positive): [0.69680068 0.30319932]

