In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df_stem = pd.read_csv("df_stem.csv")
df_lemma = pd.read_csv("df_lemma.csv")

In [4]:
df_stem.sentiment.value_counts()

sentiment
Negative      22312
Positive      20619
Neutral       18051
Irrelevant    12842
Name: count, dtype: int64

In [7]:
label_map = { "Irrelevant": 0, "Positive" : 1, "Negative" : 2, "Neutral": 3}

In [9]:
df_stem["sentiment"] = df_stem["sentiment"].map(label_map)

In [11]:
df_stem.sentiment.value_counts()

sentiment
2    22312
1    20619
3    18051
0    12842
Name: count, dtype: int64

In [13]:
X_stem = df_stem["text"]
y_stem =df_stem["sentiment"]
X_stem = X_stem.astype(str)

In [15]:
X_train_stem,X_test_stem,y_train_stem,y_test_stem = train_test_split(X_stem,y_stem,test_size = 0.2,random_state=1)

In [17]:
df_lemma["sentiment"] = df_lemma["sentiment"].map(label_map)

In [93]:
pipeline_bow_logistic_regression = Pipeline([
    ('bow_vectorizer', CountVectorizer()),
    ('logistic_regression', LogisticRegression(max_iter=10000))])

In [94]:
pipeline_tfidf_logistic_regression =  Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('logistic_regression', LogisticRegression(max_iter=10000))])

In [23]:
param_grid_bow = {
    "bow_vectorizer__ngram_range": [(1,1), (1,2), (2,2,), (1,3)]
}

In [25]:
param_grid_tfidf = {
    "tfidf_vectorizer__ngram_range": [(1,1), (1,2), (2,2,), (1,3)]
}

In [27]:
grid = GridSearchCV(pipeline_bow_logistic_regression,param_grid_bow,cv=5,scoring = "accuracy")
grid.fit(X_train_stem,y_train_stem)

In [33]:
grid.best_params_

{'bow_vectorizer__ngram_range': (1, 1)}

In [35]:
y_pred=grid.predict(X_test_stem)

In [37]:
accuracy = accuracy_score(y_pred,y_test_stem)

In [39]:
accuracy

0.5939045038943447

In [None]:
#The best ngram is (1,1) ie unigram

In [84]:
pipeline_bow_randomforest = Pipeline([
    ('bow_vectorizer', CountVectorizer()),
    ('randomforest', RandomForestClassifier())])

pipeline_bow_multinomial = Pipeline([
    ('bow_vectorizer', CountVectorizer()),
    ('randomforest', MultinomialNB())])

pipeline_tfidf_randomforest = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('multinomial', RandomForestClassifier())])

pipeline_tfidf_multinomial = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('multinomial', MultinomialNB())])

1. Training on Stemming

In [95]:
pipeline_tfidf_logistic_regression.fit(X_train_stem,y_train_stem)
y_pred_stem_tfidf_logistic=pipeline_tfidf_logistic_regression.predict(X_test_stem)
accuracy_score(y_test_stem,y_pred_stem_tfidf_logistic)

0.7800203183203522

In [88]:
pipeline_bow_randomforest.fit(X_train_stem,y_train_stem)
y_pred_stem_bow_randomforest=pipeline_bow_randomforest.predict(X_test_stem)
accuracy_score(y_test_stem,y_pred_stem_bow_randomforest)

0.9194039959363359

In [89]:
pipeline_bow_multinomial.fit(X_train_stem,y_train_stem)
y_pred_stem_bow_multinomial=pipeline_bow_multinomial.predict(X_test_stem)
accuracy_score(y_test_stem,y_pred_stem_bow_multinomial)

0.7391127666779547

In [90]:
pipeline_tfidf_randomforest.fit(X_train_stem,y_train_stem)
y_pred_stem_tfidf_randomforest=pipeline_tfidf_randomforest.predict(X_test_stem)
accuracy_score(y_test_stem,y_pred_stem_tfidf_randomforest)

0.9235353877412801

In [91]:
pipeline_tfidf_multinomial.fit(X_train_stem,y_train_stem)
y_pred_stem_tfidf_multinomial=pipeline_tfidf_multinomial.predict(X_test_stem)
accuracy_score(y_test_stem,y_pred_stem_tfidf_multinomial)

0.7263122248560786

2. Now Training in Lemmatized text

In [63]:
X_lemma = df_lemma["text"]
y_lemma =df_lemma["sentiment"]
X_lemma = X_lemma.astype(str)

In [65]:
X_train_lemma,X_test_lemma,y_train_lemma,y_test_lemma=train_test_split(X_lemma,y_lemma,test_size=0.2,random_state=1)

In [96]:
pipeline_bow_logistic_regression.fit(X_train_lemma,y_train_lemma)
y_pred_lemma_bow_logistic=pipeline_bow_logistic_regression.predict(X_test_lemma)
accuracy_score(y_test_lemma,y_pred_lemma_bow_logistic)

0.8417880121909922

In [97]:
pipeline_tfidf_logistic_regression.fit(X_train_lemma,y_train_lemma)
y_pred_lemma_tfidf_logistic=pipeline_tfidf_logistic_regression.predict(X_test_lemma)
accuracy_score(y_test_lemma,y_pred_lemma_tfidf_logistic)

0.7967490687436505

In [98]:
pipeline_bow_randomforest.fit(X_train_lemma,y_train_lemma)
y_pred_lemma_bow_randomforest=pipeline_bow_randomforest.predict(X_test_lemma)
accuracy_score(y_test_lemma,y_pred_lemma_bow_randomforest)

0.9177785303081611

In [99]:
pipeline_bow_multinomial.fit(X_train_lemma,y_train_lemma)
y_pred_lemma_bow_multinomial=pipeline_bow_multinomial.predict(X_test_lemma)
accuracy_score(y_test_lemma,y_pred_lemma_bow_multinomial)

0.7565187944463257

In [100]:
pipeline_tfidf_randomforest.fit(X_train_lemma,y_train_lemma)
y_pred_lemma_tfidf_randomforest=pipeline_tfidf_randomforest.predict(X_test_lemma)
accuracy_score(y_test_lemma,y_pred_lemma_tfidf_randomforest)

0.9210294615645107

In [101]:
pipeline_tfidf_multinomial.fit(X_train_lemma,y_train_lemma)
y_pred_lemma_tfidf_multinomial=pipeline_tfidf_multinomial.predict(X_test_lemma)
accuracy_score(y_test_lemma,y_pred_lemma_tfidf_multinomial)

0.7383677615983746

the best model is stem , tfidf, randomforest

In [118]:
print(classification_report(y_test_stem, y_pred_stem_tfidf_randomforest))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92      2593
           1       0.88      0.94      0.91      4175
           2       0.92      0.95      0.94      4437
           3       0.94      0.91      0.93      3560

    accuracy                           0.92     14765
   macro avg       0.93      0.92      0.92     14765
weighted avg       0.93      0.92      0.92     14765

