In [1]:
import pandas as pd

In [2]:
# reading data
df = pd.read_csv("AmazonMobile.csv", encoding="ISO-8859-1")
df.tail()

Unnamed: 0,OriginalTweet,Sentiment
71917,Best phone at this price.,positive
71918,If you intend to use this phone on T Mobile be...,positive
71919,Here is my Moto G7 Play complaint: It freezes ...,positive
71920,As far as function works great camera no go wo...,positive
71921,"What a great phone! Sleek, fast, great soundin...",positive


In [3]:
# take a look at the type, number of columns, entries, null values etc..
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71922 entries, 0 to 71921
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  71922 non-null  object
 1   Sentiment      71922 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


OriginalTweet    0
Sentiment        0
dtype: int64

In [5]:
# value percentage
print("Value Percentage: \n",df['Sentiment'].value_counts() * 100 / len(df['Sentiment']))

Value Percentage: 
 positive    79.964406
negative    20.035594
Name: Sentiment, dtype: float64


In [6]:
# Chuyển các từ có nghĩa giống nhau thành 1 dạng: ví dụ include, included -> include
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
stop_words = stopwords.words("english")

In [7]:
def remove_stopwords(text):
    """Lọc lại các tweet

    returns: corpus of stemmed words"""

    text = re.sub('^a-zA-Z', ' ', text)
    text = text.split()
    text = [ps.stem(word) for word in text if word not in stop_words]
    text = ' '.join(text).replace('  ', ' ')
    return text

In [8]:
# Lọc tweet

df["removed_stopwords"] = df.OriginalTweet.apply(remove_stopwords)
df["removed_stopwords"]

0        i samsung a600 awhil absolut doo doo. you read...
1        due softwar issu nokia sprint phone' text mess...
2        thi great, reliabl phone. i also purchas phone...
3        i love phone all, i realli need one, i expect ...
4        the phone great everi purpos offers, except da...
                               ...                        
71917                                    best phone price.
71918    if intend use phone t mobil awar dual sim inte...
71919    here moto g7 play complaint: it freez everi ti...
71920    as far function work great camera go wors firs...
71921    what great phone! sleek, fast, great sound bui...
Name: removed_stopwords, Length: 71922, dtype: object

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
cv = CountVectorizer(max_features=2500)
X = cv.fit(df.removed_stopwords)

In [10]:
A = X.transform(df.removed_stopwords)
tfidfTransformer = TfidfTransformer().fit(A)
title_tfid = tfidfTransformer.transform(A)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(title_tfid, df.Sentiment, test_size=0.3, shuffle=False)

In [12]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)
multi_pred = model.predict(X_test)
multi_pred

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [13]:
from sklearn.metrics import accuracy_score
score_multi = accuracy_score(multi_pred, y_test)
print(f"MultinomialNB Accuracy Score: {score_multi * 100:.2f}%")

MultinomialNB Accuracy Score: 87.37%
