# NLP with Machine Learning

## 1. Sentiment Analysis

In [1]:
import pandas as pd

# Small demo corpus: one sentence per row. It mixes punctuation, digits,
# an emoji, repeated words and clearly positive/negative wording.
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "iced tea is my favorite",
    "I didn't like the taste of that lemonade at all.",
    "My lemons went bad before I could use them, unfortunately.",
]

# Do not truncate long text columns when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# `data_df` keeps the untouched sentences; `df` is an independent copy that
# later cells are free to add columns to.
data_df = pd.DataFrame(data, columns=["sentence"])
df = data_df.copy()

# Only the last expression of a cell is rendered, so preview once, here.
df.head()

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.


In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
# take the sentence stored under index label 0 as a sample to score
test = df.loc[0, "sentence"]
test

'When life gives you lemons, make lemonade! 🙂'

In [6]:
# build a VADER analyzer and score the sample sentence
analyzer = SentimentIntensityAnalyzer()
# polarity_scores returns a dict of scores (the saved output shows the keys
# 'neg', 'neu', 'pos' and 'compound')
analyzer.polarity_scores(test)

{'neg': 0.0, 'neu': 0.75, 'pos': 0.25, 'compound': 0.4587}

In [7]:
# keep only the 'compound' entry of the score dictionary
analyzer.polarity_scores(test)['compound']

0.4587

In [8]:
# Single analyzer instance, created on first use and then reused.
_SENTIMENT_ANALYZER = None


def get_sentiment(text):
    """Return the VADER 'compound' sentiment score for ``text``.

    The analyzer is created once and cached at module level: constructing a
    ``SentimentIntensityAnalyzer`` for every call would repeat its set-up
    work for each row when this function is used with ``Series.apply``.

    Parameters
    ----------
    text : str
        The sentence (or document) to score.

    Returns
    -------
    float
        The value stored under ``'compound'`` in the dictionary returned by
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(text)['compound']

In [11]:
# score every sentence and store the compound score next to it
df["sentiment"] = df["sentence"].apply(get_sentiment)
df

Unnamed: 0,sentence,sentiment
0,"When life gives you lemons, make lemonade! 🙂",0.4587
1,She bought 2 lemons for $1 at Maven Market.,0.0
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],0.0
3,"lemon, lemon, lemons, lemon, lemon, lemons",0.0
4,He's running to the market to get a lemon — there's a great sale today.,0.6249
5,iced tea is my favorite,0.4588
6,I didn't like the taste of that lemonade at all.,-0.2755
7,"My lemons went bad before I could use them, unfortunately.",-0.7096


## 3. Topic Modeling
* Goal: Find the main themes in the reviews

In [14]:
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
import os

# 1) Load the .env file (the search starts from the current working directory,
#    i.e. the notebook's folder)
load_dotenv(find_dotenv(usecwd=True))

# 2) Read FILE_PATH, then drop surrounding whitespace/quotes and expand ~
raw = os.environ.get("FILE_PATH")
if raw is None or raw == "":
    # message text: "FILE_PATH variable not found in .env"
    raise RuntimeError("ไม่พบตัวแปร FILE_PATH ใน .env")
base_dir = Path(raw.strip().strip('"\''))
base_dir = base_dir.expanduser().resolve()

print("BASE DIR =", base_dir)
# label text: "does it really exist? ->"
print("มีอยู่จริงไหม? ->", base_dir.exists())

# 3) Join with pathlib (no need to worry about a trailing / on the path)
some_file = base_dir.joinpath("dataset.csv")    # example file
print(some_file)

BASE DIR = /Users/akanitkwangkaew/Documents/Data-Projects/nlp/On_the_Git
มีอยู่จริงไหม? -> True
/Users/akanitkwangkaew/Documents/Data-Projects/nlp/On_the_Git/dataset.csv


In [16]:
# load the pop chip reviews workbook located under the base directory
_file = base_dir.joinpath("Data/Popchip_Reviews.xlsx")
reviews = pd.read_excel(_file)
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more."


In [17]:
# (rows, columns) of the reviews table; the saved run shows 564 reviews
reviews.shape

(564, 6)

In [18]:
# how many reviews fall into each Priority level (Low vs High)
reviews["Priority"].value_counts()

Priority
Low     447
High    117
Name: count, dtype: int64

In [22]:
# if this cell raises an error, run the following in a terminal first:
#   python -m spacy download en_core_web_sm

# import the text preprocessing steps we created in the last section
import maven_text_preprocessing

# apply them to the raw review text and keep the result in a new column
# NOTE(review): clean_and_normalize lives in a project module not shown here;
# judging by the saved output it lowercases, lemmatizes and drops stop words
# and punctuation — confirm against the module
reviews['Text_Clean'] = maven_text_preprocessing.clean_and_normalize(reviews['Text'])
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",like puff nature chip make unique chip market order salt vinegar absolutely love flavor hand favorite chip try cheddar regular flavor cheddar 45 regular 35 prefer strong flavor obviously case regular salt vinegar kind weak compare regular sv chip flavorful make want come


In [104]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# TF-IDF features limited to terms whose document frequency lies between
# 5% and 20% of the reviews, so very rare and very common words are dropped
tv2 = TfidfVectorizer(
    stop_words="english",
    min_df=0.05,
    max_df=0.2,
)
Xt2 = tv2.fit_transform(reviews["Text_Clean"])

# dense, labelled view of the matrix: one row per review, one column per term
Xt_df2 = pd.DataFrame(data=Xt2.toarray(), columns=tv2.get_feature_names_out())
Xt_df2

Unnamed: 0,100,alternative,amazon,bad,bake,baked,bbq,big,bit,box,...,thing,think,time,variety,ve,vinegar,want,way,weight,work
0,0.0,0.465515,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.348295,0.193511,0.000000,0.000000,0.000000
2,0.0,0.000000,0.354088,0.000000,0.0,0.000000,0.000000,0.428869,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.354475,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0.0,0.324462,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.299888,0.0,0.000000,0.000000,0.000000,0.337388,0.657147,0.000000
560,0.0,0.000000,0.190702,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.197896,0.000000,0.208227,0.000000,0.000000,0.247474
561,0.0,0.378621,0.000000,0.380993,0.0,0.396437,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
562,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.543142,0.000000,0.000000,0.000000,0.000000,0.000000


In [105]:
from sklearn.decomposition import NMF

In [106]:
# factor the TF-IDF matrix into 5 topics; the seed is fixed for repeatable runs
nmf = NMF(
    n_components=5,
    random_state=42,
    max_iter=500,
)
# W: documents-topics weights
W = nmf.fit_transform(Xt_df2)
# H: topics-terms weights
H = nmf.components_

In [107]:
# H is the topics-terms matrix: one row per topic, one column per vocabulary term
H.shape

(5, 81)

In [108]:
# term weights of the first topic (row 0 of H)
H[0]

array([2.27467301e-01, 0.00000000e+00, 1.05812554e+00, 1.90686508e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.35972253e-01,
       1.35628300e-02, 5.60885872e-01, 0.00000000e+00, 9.25628481e-01,
       0.00000000e+00, 2.64011798e-01, 1.48778731e-02, 4.32519339e-02,
       2.06228857e-01, 4.84590839e-02, 2.34325144e-01, 0.00000000e+00,
       6.92355490e-02, 9.77982923e-02, 7.64422035e-02, 8.50347323e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.36187942e-01,
       0.00000000e+00, 6.54084363e-02, 8.57621722e-02, 1.39633702e-01,
       0.00000000e+00, 5.47357474e-02, 0.00000000e+00, 3.98725569e-01,
       2.82558641e-02, 0.00000000e+00, 1.24328433e-01, 1.08049224e-01,
       1.46707201e-01, 0.00000000e+00, 1.36364600e-02, 0.00000000e+00,
       0.00000000e+00, 2.29103253e+00, 1.17250511e-01, 2.75259158e-01,
       0.00000000e+00, 2.09892619e-01, 4.42006137e-01, 2.90102692e-01,
       3.15638492e-02, 1.70030147e-01, 1.67667917e-01, 0.00000000e+00,
      

In [109]:
def display_topics(H, num_words=10, feature_names=None):
    """Print the highest-weighted terms of every topic.

    Parameters
    ----------
    H : array-like of shape (n_topics, n_terms)
        Topics-terms weight matrix; each row must support ``argsort()``
        (e.g. the rows of ``nmf.components_``).
    num_words : int, default 10
        How many top terms to print per topic.
    feature_names : sequence of str, optional
        Term for each column of ``H``. When omitted, the vocabulary of the
        notebook-level vectorizer ``tv2`` is used, as before.

    Returns
    -------
    None
        One line per topic is printed, formatted as
        ``Topic <n> : term1, term2, ...`` with topics numbered from 1.
    """
    # Look the vocabulary up once instead of once per printed word.
    if feature_names is None:
        feature_names = tv2.get_feature_names_out()

    for topic_num, topic_array in enumerate(H):
        # argsort is ascending, so reverse it to get the largest weights first
        top_features = topic_array.argsort()[::-1][:num_words]
        top_words = [feature_names[i] for i in top_features]
        print("Topic", topic_num+1, ":", ', '.join(top_words))
        


In [110]:
# print the top terms of each topic (num_words is left at its default of 10)
display_topics(H)

Topic 1 : order, amazon, case, time, store, box, thing, price, know, product
Topic 2 : sweet, salty, br, light, rice, texture, think, little, crunchy, fry
Topic 3 : healthy, alternative, bbq, delicious, regular, feel, work, enjoy, nice, look
Topic 4 : br, vinegar, bbq, favorite, pepper, original, lime, think, sea, sour
Topic 5 : fat, low, weight, pop, regular, serve, diet, crunch, single, tasty


In [111]:
# peek at the first two reviews, including the Text_Clean column
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",like puff nature chip make unique chip market order salt vinegar absolutely love flavor hand favorite chip try cheddar regular flavor cheddar 45 regular 35 prefer strong flavor obviously case regular salt vinegar kind weak compare regular sv chip flavorful make want come


In [113]:
# one row per review, one column per NMF topic; the column labels are my own
# interpretation of each topic's top words
# NOTE(review): "test & texture" looks like a typo for "taste & texture" —
# confirm before renaming, since the label also appears in saved outputs
doc_topics = pd.DataFrame(
    W,
    columns=["orders", "test & texture", "good", "flavor", "health"],
)
doc_topics

Unnamed: 0,orders,test & texture,good,flavor,health
0,0.000000,0.000000,0.403012,0.000000,0.000000
1,0.055080,0.000000,0.023755,0.115179,0.088048
2,0.067787,0.000000,0.000000,0.000000,0.153890
3,0.017647,0.002463,0.000000,0.000000,0.029204
4,0.000000,0.016166,0.040860,0.044669,0.190659
...,...,...,...,...,...
559,0.025953,0.010370,0.050308,0.000000,0.168847
560,0.108660,0.000000,0.022080,0.157261,0.032282
561,0.084727,0.000000,0.200482,0.000000,0.091203
562,0.019073,0.000000,0.000000,0.085505,0.037631


In [114]:
# peek at the first review
reviews.head(1)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save


In [116]:
# place each review's original text side by side with its topic weights
reviews_topics = pd.concat([reviews["Text"], doc_topics], axis="columns")
reviews_topics

Unnamed: 0,Text,orders,test & texture,good,flavor,health
0,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.000000,0.000000,0.403012,0.000000,0.000000
1,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.055080,0.000000,0.023755,0.115179,0.088048
2,"I just love these chips! I was always a big fan of potato chips, but haven't had one since I discovered popchips. They are great for dipping or all alone. I am constantly re-ordering them. One note however-if you are on a low salt diet these chips are probably not for you. They are high in sodium. We go through a case every two months. If you love them it pays to join the subscribe and save program through Amazon. You save money and stay supplied!",0.067787,0.000000,0.000000,0.000000,0.153890
3,"These tasted like potatoe stix, that we got in grade school with our lunches usually on pizza day. They were the bomb then, not so much now. Won't buy again unless I get them for cheap or free.",0.017647,0.002463,0.000000,0.000000,0.029204
4,"These chips are great! They look almost like a flattened rice cake, but taste so much better, more like a potato chip. The bbq flavor is delicious. They are very low in fat and full of flavor. It is easy to eat an entire bag of these!",0.000000,0.016166,0.040860,0.044669,0.190659
...,...,...,...,...,...,...
559,"I love potato chips. I could eat them by the bagful but thanks to the powers that be, this would not provide anyone with enough nutrition to survive. Nonetheless I have eaten my share of potato chips over the years, and perhaps as a result I have been watching my weight lately. I discovered these great popchips and they hit the spot in a number of ways. First they are a low-calorie alternative to regular potato chips, meaning I can eat a normal-sized portion and still keep on target with my weight loss goals. Second, they are gluten-free, which is great for those of us who have gluten intolerance issues. Third, they taste great - light and airy, crispy, rich in that great potato flavor that keeps me reaching for another one. This is a great product and I hope it stays around for a good long time.",0.025953,0.010370,0.050308,0.000000,0.168847
560,"When PopChips were really hard to find, I was ordering them by the case from Amazon on a regular basis. The price was always great and these really are my go-to snack. They are always fresh and delicious from Amazon.<br /><br />I definitely prefer Original over barbecue and sour cream & onion. I haven't tried other flavors, but Original works just fine for me because I use it for various dips and will even use dry seasonings on them if I'm in the mood.<br /><br />I don't know anybody that I've offered some of these to who didn't want a bag of their own! Highly recommended.",0.108660,0.000000,0.022080,0.157261,0.032282
561,These are a much healthy alternative to most chips and they taste great. They have a great crunch and flavor and don't have that bad after taste that most baked chips have. My only regret is that I didn't order more when they were on sale there all gone now but even at regular price are worth it.,0.084727,0.000000,0.200482,0.000000,0.091203
562,"These are so good, I've started getting them automatically. I like the original flavor, but they have a lot of others.",0.019073,0.000000,0.000000,0.085505,0.037631
