In [29]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [30]:
with open('preprocess_data.pkl', mode='rb') as file:
    preprocessed_data = pickle.load(file)


In [31]:
feature = preprocessed_data['lemma_cleaned_text']
target = preprocessed_data['Sentiment']

In [32]:
x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.3)

In [33]:
pipeline = Pipeline(
    [
        ("vectorizer", CountVectorizer(ngram_range=(1,2))),
        ('classifier', MultinomialNB)
    ]
)

In [34]:
param_grid = {
    "vectorizer" : [CountVectorizer(), TfidfVectorizer()],
    "vectorizer__ngram_range":[(1,1), (1,2)],
    "vectorizer__max_features":[None, 100, 500],
    "classifier":[MultinomialNB(), LogisticRegression()]
}

In [40]:
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring="accuracy", verbose= 2, n_jobs=-1)

In [41]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'classifier': LogisticRegression(), 'vectorizer': CountVectorizer(), 'vectorizer__max_features': None, 'vectorizer__ngram_range': (1, 2)}
Best Accuracy: 0.9070399341292713


In [43]:
model = grid_search.best_estimator_
print(model)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('classifier', LogisticRegression())])


###Evaluate best model

In [48]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [49]:
print(report)

              precision    recall  f1-score   support

    negative       0.90      0.89      0.90     24211
     neutral       0.84      0.80      0.82     17590
    positive       0.94      0.97      0.95     51890

    accuracy                           0.92     93691
   macro avg       0.90      0.88      0.89     93691
weighted avg       0.91      0.92      0.91     93691



In [51]:
sample =['i hate your product with my heart']
predictions = model.predict(sample)
predictions

array(['negative'], dtype=object)

##save model

In [52]:
import joblib
joblib.dump(model, 'model.pkl')

['model.pkl']

#### Using countvectorizer to create my  word embeddings