# Naive Bayes Project Tutorial



Paso 1

In [None]:
%%capture
!pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0-py2.py3-none-any.whl
Collecting scikit-learn
  Using cached scikit_learn-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
Collecting scipy>=1.3.2
  Using cached scipy-1.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn, sklearn
Successfully installed joblib-1.1.0 scikit-learn-1.1.1 scipy-1.8.1 sklearn-0.0 threadpoolctl-3.1.0


In [3]:
%%capture
!pip install pandas


Collecting pandas
  Downloading pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m36m0:00:01[0m
Collecting pytz>=2020.1
  Using cached pytz-2022.1-py2.py3-none-any.whl (503 kB)
Collecting numpy>=1.18.5
  Using cached numpy-1.23.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.23.1 pandas-1.4.3 pytz-2022.1


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle

In [7]:
# Download the Play Store reviews dataset straight from the project repository.
dataset_url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
data = pd.read_csv(dataset_url)

In [8]:
# Summarize column names, non-null counts and dtypes to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [9]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [10]:
# Keep only the review text and its polarity label; the app package name is
# dropped. errors='ignore' makes the cell safe to re-run once the column is
# already gone (otherwise a second run raises KeyError).
data = data.drop(columns='package_name', errors='ignore')

In [11]:
# Preview the frame again to confirm that package_name was removed.
data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [12]:
# Normalize the review text: trim surrounding whitespace, then lowercase.
data = data.assign(
    review=lambda frame: frame['review'].str.strip().str.lower()
)

In [14]:
# Count the reviews per polarity label to check the class balance.
data['polarity'].value_counts()

0    584
1    307
Name: polarity, dtype: int64

Paso 2

In [15]:
# Feature (raw review text) and target (polarity label) for the classifier.
X, y = data['review'], data['polarity']

In [17]:
# Hold out 25% of the reviews for testing, stratified on the polarity label;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

Paso 3

In [18]:
# Bag-of-words vectorizer: turns each review into token counts, using
# scikit-learn's built-in English stop-word list.
vec = CountVectorizer(stop_words = 'english')

In [19]:
# Fit the vectorizer on the training reviews only, then reuse that fitted
# vectorizer to transform the test reviews. .toarray() converts each result
# to a dense array.
# NOTE(review): X_train / X_test are overwritten with the vectorized arrays, so
# re-running this cell alone would feed those arrays back into the vectorizer;
# re-run the train_test_split cell first to restore the raw text.
X_train = vec.fit_transform(X_train).toarray()
X_test = vec.transform(X_test).toarray()

In [20]:
# Inspect the dense count matrix built from the training reviews.
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
# Number of training reviews (rows of the feature matrix).
X_train.shape[0]

668

In [26]:
# Number of features per review (columns of the feature matrix).
X_train.shape[1]

3161

In [28]:
# Fit a multinomial Naive Bayes classifier on the training count features
# and their polarity labels.
model = MultinomialNB()
model.fit(X_train, y_train)

In [29]:
# Score the model on the held-out test split (for scikit-learn classifiers,
# score() reports mean accuracy).
model.score(X_test, y_test)

0.8565022421524664

In [30]:
# Prediction for a negative review
negative_review = ['I hate with my heart this app ']
model.predict(vec.transform(negative_review))

array([0])

In [31]:
# Prediction for a positive review
positive_review = ['Love this app simply awesome!']
model.predict(vec.transform(positive_review))

array([1])

In [33]:
# Prediction for a neutral review
neutral_review = ['Normal']
model.predict(vec.transform(neutral_review))

array([1])

In [32]:
# Prediction for a long, multi-sentence review
string = '''

I love Google Play Store. The website and mobile site are both very easy to maneuver. I think it's great that Google has so many free applications available. I review for Google often and they usually send me an update to say thank you. I use Google Play often and will continue to do so. #GooglePlayStore #Techie #DigitalArt created by yours truly. Follow me on IG to see more original content. #MKR #IamMimisMusic Google Play Store

'''
long_review_features = vec.transform([string])
model.predict(long_review_features)

array([0])

Si el comentario es breve y claro, el modelo funciona bien; en textos largos o neutros puede fallar.

Paso 4

In [None]:
# Persist the trained model to disk.
# NOTE(review): only the classifier is saved; predicting on new text also needs
# the fitted vectorizer `vec`, which this cell does not persist.
filename = '../models/nb_model.sav'
# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)