# NLP with Machine Learning

## 1. Sentiment Analysis

In [None]:
import pandas as pd
#1. pandas

# build the list of example sentences (emoji and em dash stored as real
# Unicode characters rather than mis-decoded byte sequences)
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "iced tea is my favorite",
    "I didn't like the taste of that lemonade at all.",
    "My lemons went bad before I could use them, unfortunately.",
]

# 2. set the column width
# show the full sentences instead of truncating wide columns
pd.set_option('display.max_colwidth', None)

# 3. build the dataframe and name its single column
data_df = pd.DataFrame(data, columns=["sentence"])

# work on a copy so the original dataframe stays untouched;
# only the last expression of a cell is displayed, so preview once here
df = data_df.copy()
df.head()

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.


In [None]:
# 4. import the VADER sentiment library
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# take the first sentence (label 0) as the sample text
text = df["sentence"][0]
text

'When life gives you lemons, make lemonade! 🙂'

In [None]:
# 5. create an analyzer object, apply it to the sample text and view the polarity scores
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(text)
# the 'compound' entry is the combined (mixed) score; it is the value selected later on

{'neg': 0.0, 'neu': 0.75, 'pos': 0.25, 'compound': 0.4587}

In [4]:
# score every sentence in the dataframe

# one analyzer instance shared by all rows
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the 'compound' polarity score the analyzer assigns to ``text``."""
    scores = analyzer.polarity_scores(text)
    return scores['compound']


# store the scores in a new column and show the full dataframe
df['sentiment'] = df.sentence.apply(get_sentiment)
df

Unnamed: 0,sentence,sentiment
0,"When life gives you lemons, make lemonade! 🙂",0.4587
1,She bought 2 lemons for $1 at Maven Market.,0.0
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],0.0
3,"lemon, lemon, lemons, lemon, lemon, lemons",0.0
4,He's running to the market to get a lemon — there's a great sale today.,0.6249
5,iced tea is my favorite,0.4588
6,I didn't like the taste of that lemonade at all.,-0.2755
7,"My lemons went bad before I could use them, unfortunately.",-0.7096


In [None]:
# Polish data

In [None]:
import pandas as pd
# Polish example sentences, stored as real Unicode characters
# (the previous literals were UTF-8 bytes mis-decoded as cp1252)
data_pl = [
    "Lubię placki 🙂",
    "Dostałem 6 z matematyki! ",
    "Nie lubię Cię",
    "Kocham szpital",
    "Kocham szkołę",
]

In [None]:


# build a dataframe from the Polish sentences; the column name mirrors the
# English example so the same sentiment code can be applied to it
# (previously this bound the pd.DataFrame class itself, not an instance)
data_pl_fr = pd.DataFrame(data_pl, columns=["sentence"])

## 2. Text Classification

#### GOAL: Predict which reviews are high priority (vs low priority) that we need to address right away

In [5]:
# import libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [6]:
# load the pop chip reviews workbook and preview its first two rows
reviews = pd.read_excel(io='../Data/Popchip_Reviews.xlsx')
reviews.head(n=2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more."


In [7]:
# dimensions of the reviews table as (rows, columns); the captured run reported 564 reviews
reviews.shape

(564, 6)

In [8]:
# how many reviews fall into each priority class (low vs high)
reviews['Priority'].value_counts()

Priority
Low     447
High    117
Name: count, dtype: int64

In [9]:
# run this code in the command line if you get an error: python -m spacy download en_core_web_sm

# import the text preprocessing steps we created in the last section
import maven_text_preprocessing

# run the preprocessing helper over the raw review text and keep the result
# in a new column, then preview the first two rows
reviews['Text_Clean'] = maven_text_preprocessing.clean_and_normalize(reviews.Text)
reviews.head(n=2)

ModuleNotFoundError: No module named 'maven_text_preprocessing'

#### ATTEMPT 1: Naive Bayes with Count Vectorizer

In [None]:
# build the count-vectorizer matrix from the cleaned review text
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.2,
)
X = cv.fit_transform(reviews['Text_Clean'])

In [None]:
# features / inputs X as a dataframe, one column per vocabulary term
X_df = pd.DataFrame(
    data=X.toarray(),
    columns=cv.get_feature_names_out(),
)
X_df.head()

In [None]:
# target / output y: the priority label of each review
y = reviews['Priority']
y.head()

In [None]:
# hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.2,
    random_state=42,
)

# fit the Naive Bayes model on the training split
model_nb = MultinomialNB().fit(X_train, y_train)

# predict labels for the held-out rows
y_pred_nb = model_nb.predict(X_test)

# report per-class metrics and overall accuracy
print(classification_report(y_test, y_pred_nb))
print("Accuracy:", accuracy_score(y_test, y_pred_nb))

In [None]:
# a few unseen reviews to try the model on
new_reviews = pd.Series(
    data=[
        "Pop chips are my favorite! I love these chips so much.",
        "Taste bad. I don't like the flavor options or taste.",
        "Solid snack.",
    ]
)

# push the new reviews through the same cleaning and the already-fitted vectorizer
new_reviews_clean = maven_text_preprocessing.clean_and_normalize(new_reviews)
new_reviews_df = pd.DataFrame(
    data=cv.transform(new_reviews_clean).toarray(),
    columns=cv.get_feature_names_out(),
)

# predicted priority for each new review
model_nb.predict(new_reviews_df)

#### ATTEMPT 2: Logistic Regression with Tfidf Vectorizer

In [None]:
# build the tf-idf matrix from the cleaned review text
tv = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.2,
)
Xt = tv.fit_transform(reviews['Text_Clean'])

In [None]:
# tf-idf features / inputs as a dataframe, one column per vocabulary term
Xt_df = pd.DataFrame(
    data=Xt.toarray(),
    columns=tv.get_feature_names_out(),
)
Xt_df.head()

In [None]:
# target / output for the tf-idf model: the priority label of each review
yt = reviews['Priority']
yt.head()

In [None]:
# train/test split: hold out 20% of the rows (fixed seed for repeatability)
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt_df, yt, test_size=0.2, random_state=42)

# model: fit logistic regression on the tf-idf training split
model_lr = LogisticRegression()
model_lr.fit(Xt_train, yt_train)

# predict labels for the held-out rows
y_pred_lr = model_lr.predict(Xt_test)

# evaluate the logistic-regression predictions; the accuracy line previously
# re-scored the Naive Bayes predictions (y_test, y_pred_nb) by mistake
print(classification_report(yt_test, y_pred_lr))
print("Accuracy:", accuracy_score(yt_test, y_pred_lr))

In [None]:
# highest priority reviews
import numpy as np

# predict_proba returns one column per class in the order of `classes_`;
# look up the 'High' column explicitly instead of hard-coding position 0
high_col_nb = list(model_nb.classes_).index('High')
high_col_lr = list(model_lr.classes_).index('High')

# probability of the 'High' priority class from each model
reviews['predictions_nb'] = model_nb.predict_proba(X_df)[:, high_col_nb]
reviews['predictions_lr'] = model_lr.predict_proba(Xt_df)[:, high_col_lr]

# the two reviews Naive Bayes rates as most likely high priority
reviews.sort_values('predictions_nb', ascending=False).head(2)

## 3. Topic Modeling

#### GOAL: Find the main themes in the reviews

In [None]:
# second tf-idf vectorizer with a lower document-frequency window
# (min_df=0.05, max_df=0.2) to capture more unique words
tv2 = TfidfVectorizer(
    stop_words='english',
    min_df=0.05,
    max_df=0.2,
)
Xt2 = tv2.fit_transform(reviews['Text_Clean'])
Xt_df2 = pd.DataFrame(
    data=Xt2.toarray(),
    columns=tv2.get_feature_names_out(),
)
Xt_df2

In [None]:
# apply nmf with n topics
from sklearn.decomposition import NMF

# factorize the tf-idf matrix into 5 components (fixed seed, up to 500 iterations)
nmf = NMF(n_components=5, max_iter=500, random_state=42)
W = nmf.fit_transform(Xt_df2)  # documents-topics
H = nmf.components_  # topics-terms

In [None]:
# shape of the topics-terms matrix H as (topics, terms); the author noted
# 5 topics and 81 terms per topic — TODO confirm against a fresh run
H.shape

In [None]:
# first 20 term weights of the first topic
H[0, :20]

In [None]:
# function to display the top terms for each topic
def display_topics(H, num_words=10, feature_names=None):
    """Print the highest-weighted terms of every topic in ``H``.

    Parameters
    ----------
    H : 2-D array-like
        One row per topic, one weight per term.
    num_words : int, default 10
        Number of top terms to print for each topic.
    feature_names : sequence of str, optional
        Term for each column of ``H``. When omitted, the vocabulary of the
        notebook-level ``tv2`` vectorizer is used, as before.

    Returns
    -------
    None
        One line per topic is written to stdout.
    """
    # resolve the vocabulary once instead of once per printed word
    if feature_names is None:
        feature_names = tv2.get_feature_names_out()

    for topic_num, topic_array in enumerate(H, start=1):
        # indices of the largest weights, in descending order
        top_features = topic_array.argsort()[::-1][:num_words]
        top_words = [feature_names[i] for i in top_features]
        print("Topic", topic_num, ":", ', '.join(top_words))

In [None]:
# try the function out: print the top terms of every topic (default 10 words each)
display_topics(H)

In [None]:
# documents-to-topics weights, with a readable name for each topic column
doc_topics = pd.DataFrame(
    W,
    columns=['orders', 'taste & texture', 'good', 'flavor', 'health'],
)
doc_topics

In [None]:
# put the raw review text side by side with its topic weights
reviews_topics = pd.concat([reviews['Text'], doc_topics], axis='columns')
reviews_topics.head()

### DEMO: Combine Multiple Techniques

In [None]:
# work on an independent copy of the combined dataframe
final_topics = reviews_topics.copy(deep=True)
final_topics.head(n=2)

In [None]:
# create a new column that returns only the top topic
# select the topic columns by name rather than by position, so re-running
# this cell after 'top_topic' / 'sentiment' have been added still compares
# only the topic weights
topic_cols = list(doc_topics.columns)
final_topics['top_topic'] = final_topics[topic_cols].idxmax(axis=1)
final_topics.head(2)

In [None]:
# sentiment function from earlier, sharing a single analyzer instance so a
# new SentimentIntensityAnalyzer is not constructed for every row
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the 'compound' polarity score the analyzer assigns to ``text``."""
    return analyzer.polarity_scores(text)['compound']

In [None]:
# score every review's text and store the result in a new 'sentiment' column
final_topics['sentiment'] = final_topics['Text'].apply(get_sentiment)
final_topics.head(n=2)

In [None]:
# quick EDA: mean sentiment per top topic, lowest first
(
    final_topics
    .groupby('top_topic')['sentiment']
    .mean()
    .sort_values()
)