## Building Model Pipelines for Sentiment Analysis Using Other Classification Algorithms

In [4]:
import datetime
import multiprocessing
import re
import sys
import warnings

import bs4
import matplotlib.pyplot as plt
import nltk.corpus
import numpy as np
import pandas as pd
import pytz
import seaborn as sns
import spacy
from nltk.text import Text
from pandarallel import pandarallel
warnings.filterwarnings('ignore')
import ipywidgets as widgets
widgets.IntSlider()

pd.set_option('max_colwidth', 200)

import sklearn
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

from joblib import dump, load

In [2]:
# Start pandarallel's worker pool for parallel pandas operations.
# With use_memory_fs=False the data travels between processes over pipes
# (the INFO log below confirms this) rather than through a memory file system.
# NOTE(review): the worker count is hard-coded -- confirm it suits the host machine.
pandarallel.initialize(
    use_memory_fs=False,
    nb_workers=60,
)

INFO: Pandarallel will run on 60 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
%%time
# Load the pre-processed text frame (raw, cleaned and tokenised title/text columns)
# and preview its first two rows.
df_sent = pd.read_parquet(path='data_to_sentiment.parquet')
df_sent.head(n=2)

CPU times: user 40.3 s, sys: 10.4 s, total: 50.7 s
Wall time: 42.2 s


Unnamed: 0,date,month,year,quarter,day,dayofwk,title,text,title_clean,text_clean,title_tokens,text_tokens,title_len,text_len
0,2021-03-18,3,2021,2021Q1,18,3,Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online,\n\nArtificial intelligence improves parking efficiency in Chinese cities - People's Daily Online\n\nHome\nChina Politics\nForeign Affairs\nOpinions\nVideo: We Are China\nBusiness\nMilitary\nWorld...,Artificial intelligence improves parking efficiency Chinese cities Peoples Daily Online,Artificial intelligence improves parking efficiency Chinese cities Peoples Daily Online Home China Politics Foreign Affairs Opinions Video We Are China Business Military World Society Culture Trav...,"[Artificial, intelligence, improves, parking, efficiency, Chinese, cities, Peoples, Daily, Online]","[Artificial, intelligence, improves, parking, efficiency, Chinese, cities, Peoples, Daily, Online, Home, China, Politics, Foreign, Affairs, Opinions, Video, We, Are, China, Business, Military, Wor...",10,694
1,2020-02-27,2,2020,2020Q1,27,3,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament,"\nChildren With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament\n \n\nSkip to content\n\t\t\tThursday, February 27, 2020\t\t\n\nLatest:\n\n\n...",Children With Autism Saw Their Learning Social Skills Boosted After Playing With This AI Robot News Parliament,"Children With Autism Saw Their Learning Social Skills Boosted After Playing With This AI Robot News Parliament Skip Thursday , February 27 , 2020 Latest Mansplaining conferences How forestall Drax...","[Children, With, Autism, Saw, Their, Learning, Social, Skills, Boosted, After, Playing, With, This, AI, Robot, News, Parliament]","[Children, With, Autism, Saw, Their, Learning, Social, Skills, Boosted, After, Playing, With, This, AI, Robot, News, Parliament, Skip, content, Thursday, ,, February, 27, ,, 2020, Latest, Mansplai...",17,1194


### Load the Data

In [5]:
# Create the working folders up front; exist_ok=True makes re-running this cell harmless.
# NOTE(review): only the yelp_model folder is written to by the cells visible here --
# confirm '/home/jupyter/yelp' is still needed.
for folder in ('/home/jupyter/yelp', '/home/jupyter/data/yelp/yelp_model'):
    os.makedirs(folder, exist_ok=True)

In [6]:
# Public bucket folder (trailing slash included) and the training file inside it.
directory = 'https://storage.googleapis.com/msca-bdp-data-open/yelp/'
fileName = 'yelp_train_sentiment.json'

# Full URL handed to the loader in the next cell.
path = f"{directory}{fileName}"

In [7]:
%%time

# The file is newline-delimited JSON (lines=True): one record per line.
yelp = pd.read_json(path, lines=True, orient='records')
yelp.shape

CPU times: user 1.6 s, sys: 568 ms, total: 2.17 s
Wall time: 3.67 s


(255717, 3)

In [8]:
# Preview the first five reviews with their label and language columns.
yelp.head(n=5)

Unnamed: 0,text,label,lang
0,"I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and the spicy dipping sa...",1,en
1,I love the classes at this gym. Zumba and. Radio Hip Hop are my favorite. This is such a great fun and I love that it is so reasonably priced!,1,en
2,The tables and floor were dirty. I was the only customer on a Saturday nite and the person working the counter ignored me I had a corned beef sandwich. I took three bites and threw it in the trash,0,en
3,"I had an oil change at the 15515 N Scottsdale Road location. When the car was delivered to me, there were two engine warning lights on that had not been on when I drove the car in. The technicia...",0,en
4,The absolute WORST apartment complex I have ever lived in. Moved here from out of state. Hoped to find a decently priced apartment until I got myself settled in. Wow this place has been trash. Lan...,0,en


In [9]:
# Features are the raw review text; the target is the label column.
X, y = yelp['text'], yelp['label']
print(X.shape, y.shape, sep='\n')

(255717,)
(255717,)


In [10]:
# Hold out a test set. test_size is left at its default and random_state fixes
# the shuffle, so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(191787,)
(63930,)
(191787,)
(63930,)


## Creating sklearn pipelines

### Naive Bayes Model

In [11]:
# Bag-of-words counts over 1- to 3-grams feeding a multinomial Naive Bayes classifier.
# NOTE(review): casing is preserved (lowercase=False) while the built-in English
# stop-word list is applied -- confirm that capitalised stop words are meant to survive.
pipe_nb = make_pipeline(
    CountVectorizer(
        ngram_range=(1, 3),
        stop_words='english',
        lowercase=False,
    ),
    MultinomialNB(),
)

In [12]:
%time pipe_nb.fit(X_train, y_train);

CPU times: user 1min 38s, sys: 3.66 s, total: 1min 42s
Wall time: 1min 42s


In [13]:
# Predict labels for the held-out reviews with the Naive Bayes pipeline.
# y_pred is reassigned by each later model section, so it always holds the
# predictions of whichever model was run most recently.
%time y_pred = pipe_nb.predict(X_test)

CPU times: user 11.3 s, sys: 69.8 ms, total: 11.3 s
Wall time: 11.3 s


In [14]:
# Overall accuracy on the held-out split, shown as a percentage with one decimal.
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 94.6%


In [15]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     32016
           1       0.97      0.92      0.94     31914

    accuracy                           0.95     63930
   macro avg       0.95      0.95      0.95     63930
weighted avg       0.95      0.95      0.95     63930



In [16]:
%time dump(pipe_nb, "/home/jupyter/data/yelp/yelp_model/nb.joblib")

CPU times: user 1min 13s, sys: 2.2 s, total: 1min 15s
Wall time: 1min 15s


['/home/jupyter/data/yelp/yelp_model/nb.joblib']

### Logistic Regression Model

In [17]:
# Same 1- to 3-gram count features as the Naive Bayes pipeline, with a logistic
# regression classifier allowed up to 1000 solver iterations.
pipe_logreg = make_pipeline(
    CountVectorizer(
        ngram_range=(1, 3),
        stop_words='english',
        lowercase=False,
    ),
    LogisticRegression(max_iter=1000),
)

In [18]:
%time pipe_logreg.fit(X_train, y_train)

CPU times: user 1h 9min 22s, sys: 1h 9min 21s, total: 2h 18min 43s
Wall time: 11min 16s


In [19]:
# Predict labels for the held-out reviews with the logistic regression pipeline.
# This overwrites the y_pred produced in the Naive Bayes section.
%time y_pred = pipe_logreg.predict(X_test)

CPU times: user 11.4 s, sys: 16.8 ms, total: 11.4 s
Wall time: 11.4 s


In [20]:
# Overall accuracy of the logistic regression predictions, as a one-decimal percentage.
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 97.3%


In [21]:
# Per-class precision, recall and F1 for the logistic regression predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32016
           1       0.97      0.97      0.97     31914

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [22]:
%time dump(pipe_logreg, "/home/jupyter/data/yelp/yelp_model/logreg.joblib")

CPU times: user 1min 14s, sys: 2.12 s, total: 1min 16s
Wall time: 1min 16s


['/home/jupyter/data/yelp/yelp_model/logreg.joblib']

### Support Vector Machine (linear SVM trained with SGD)

In [23]:
# Linear SVM trained by stochastic gradient descent on 1- to 3-gram counts.
# - loss='hinge' is SGDClassifier's default and is what makes this model an SVM;
#   it is spelled out so the section title and the code visibly agree.
# - tol=None switches off the early-stopping criterion, so all max_iter=100 epochs run.
# - random_state pins the per-epoch shuffling of the training data; without it every
#   re-run yields a slightly different model and score. 123 matches the seed that
#   train_test_split uses for the split.
pipe_svm = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1, 3)),
    SGDClassifier(loss='hinge', max_iter=100, tol=None, random_state=123),
)

In [24]:
%time pipe_svm.fit(X_train, y_train)

CPU times: user 2min 43s, sys: 8.47 s, total: 2min 51s
Wall time: 2min 44s


In [25]:
# Predict labels for the held-out reviews with the SGD-trained pipeline.
# This overwrites the y_pred left by the previous model section.
y_pred = pipe_svm.predict(X_test)

In [26]:
# Fraction of held-out reviews whose predicted label matches the true label.
print(accuracy_score(y_test, y_pred))

0.9735491944314093


In [27]:
# Per-class precision, recall and F1 (plus support) for the SGD-trained model.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32016
           1       0.97      0.97      0.97     31914

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [28]:
%time dump(pipe_svm, "/home/jupyter/data/yelp/yelp_model/svm.joblib")

CPU times: user 1min 16s, sys: 2 s, total: 1min 18s
Wall time: 1min 18s


['/home/jupyter/data/yelp/yelp_model/svm.joblib']

In [29]:
# List the saved model files with their sizes and modification times.
!ls -l /home/jupyter/data/yelp/yelp_model/

total 1570956
-rw-r--r-- 1 jupyter jupyter 430818365 May 21 21:43 logreg.joblib
-rw-r--r-- 1 jupyter jupyter 747009306 May 21 21:30 nb.joblib
-rw-r--r-- 1 jupyter jupyter 430818618 May 21 21:47 svm.joblib


In [30]:
# Display the current wall-clock time in the US/Central zone as a run stamp.
# datetime and pytz are imported in the notebook's top import cell rather than here,
# so every dependency is declared in one place.
datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Sun, 21 May 2023 16:47:59'