### Process Followed
#### 1. Get the restaurant reviews
#### 2. Classify negative < 3 positive >=3. and put as a target column 'Actual_Sentiment'
#### 3. Clean the data defining a function 'function_clean'
#### 4. Split into train test dataset
#### 5. Tokenize
#### 6. Lemmatize
#### 7. Remove Stopwords
#### 8. Vectorizing the text using TF_IDF
#### 9. Balance the dataset (I used SMOTE from imblearn)
#### 10. Quick test fitting a logistic model

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")


In [3]:
from azureml.core import Workspace, Dataset, Datastore
subscription_id = '<subscription_id>'
resource_group = '<resource_group>'
workspace_name = '<workspace_name>'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [4]:
#Get the yelp Dataset from adlsgen2
dataset_yelp_review = Dataset.get_by_name(workspace, 'yelp_review')
dataset_yelp_review.take(3).to_pandas_dataframe()

Credentials are not provided to access data from the source. Please sign in using identity with required permission granted.
Interactive sign-in timeout: 120 sec.
To change the sign-in tenant, restart the session with tenant ID set to environment variable "AZUREML_DATA_ACCESS_TENANT_ID" before sign in.
To always use device code for interactive sign-in, set environment variable "AZUREML_DATA_ACCESS_USE_DEVICE_CODE" to "true".
To configure timeout, set environment variable "AZUREML_DATA_ACCESS_INTERACT_TIMEOUT" to the number of seconds.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code AKKFACYAL to authenticate.


InteractiveBrowserCredential.get_token failed: Failed to open a browser


Unnamed: 0,review_id,business_id,stars,useful,funny,cool,date,text,user_id
0,xQY8N_XvtGbearJ5X4QryQ,-MhfebM0QIsKt87iDN-FNw,2,5,0,0,2015-04-15,"As someone who has worked with many museums, I...",OwjRMXRC0KyPrIlcjaXeFQ
1,UmFMZ8PyXZTY2QcwzsfQYA,lbrU8StCq3yDfr-QMnGrmQ,1,1,1,0,2013-12-07,I am actually horrified this place is still in...,nIJD_7ZXHq-FX8byPMOkMQ
2,LG2ZaYiOgpr2DK_90pYjNw,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,2015-12-05,I love Deagan's. I do. I really do. The atmosp...,V34qejxNsCbcgD8C0HVk-Q


In [5]:
#Conver to Pandas Dataframe
df_yelp_review = dataset_yelp_review.to_pandas_dataframe()

In [6]:
# Create new column "Actual_Sentiment" that stored 0's or 1's.
# 0 being Negative, 1 being Positive
import numpy as np
df_yelp_review["Actual_Sentiment"] = np.where(df_yelp_review["stars"] >= 3, 1, 0)
df_yelp_review.head(3)

Unnamed: 0,review_id,business_id,stars,useful,funny,cool,date,text,user_id,Actual_Sentiment
0,xQY8N_XvtGbearJ5X4QryQ,-MhfebM0QIsKt87iDN-FNw,2,5,0,0,2015-04-15,"As someone who has worked with many museums, I...",OwjRMXRC0KyPrIlcjaXeFQ,0
1,UmFMZ8PyXZTY2QcwzsfQYA,lbrU8StCq3yDfr-QMnGrmQ,1,1,1,0,2013-12-07,I am actually horrified this place is still in...,nIJD_7ZXHq-FX8byPMOkMQ,0
2,LG2ZaYiOgpr2DK_90pYjNw,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,2015-12-05,I love Deagan's. I do. I really do. The atmosp...,V34qejxNsCbcgD8C0HVk-Q,1


In [7]:
#We only need the text (review) and Actual_Sentiment columns for sentiment prediction
df_ml_yelp_review = df_yelp_review[['text','Actual_Sentiment']]
df_ml_yelp_review.head(5)

Unnamed: 0,text,Actual_Sentiment
0,"As someone who has worked with many museums, I...",0
1,I am actually horrified this place is still in...,0
2,I love Deagan's. I do. I really do. The atmosp...,1
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0
4,"Oh happy day, finally have a Canes near my cas...",1


In [8]:
df_ml_yelp_review.shape

(1009534, 2)

In [10]:
#Build punctuation dictionary
import unicodedata
import sys

# Create a dictionary of punctuation characters
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
                            if unicodedata.category(chr(i)).startswith('P'))
# Add the backtick/ Grave accent character
punctuation.update({96:None})

In [11]:
#Cant use \W as it strips the spaces
s = 'ab5z8d *$&Y@#'
#regx = re.compile('\W')
#result = regx.findall(s)
s = re.sub('\W', "", s)
print (s)


ab5z8dY


In [12]:
#Let us clean up and make the data ready
import re 
def function_clean(text):
    #convert into lowercase
    text = text.lower()
    #removing the URL Http
    text = re.sub(r"http\S+", "", text) 
    # Removal of mentions
    #text = re.sub("@[^\s]*", "", text)
    # Removal of hashtags
    #text = re.sub("#[^\s]*", "", text)
    # Removal of numbers
    text = re.sub('[0-9]*[+-:]*[0-9]+', '', text)
    text = re.sub("'s", "", text)   
    #remove all punctuation from the text.
    text = str(text.translate(punctuation))
    
    return text

In [13]:
# applying the cleaning function to text column
df_ml_yelp_review['text'] = df_ml_yelp_review['text'].apply(lambda text: function_clean(text))

In [14]:
df_ml_yelp_review.head(5)

Unnamed: 0,text,Actual_Sentiment
0,as someone who has worked with many museums i ...,0
1,i am actually horrified this place is still in...,0
2,i love deagan i do i really do the atmosphere ...,1
3,dismal lukewarm defrostedtasting texmex glop\n...,0
4,oh happy day finally have a canes near my casa...,1


In [15]:
#Splitting the data to train and test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_ml_yelp_review["text"]
y = df_ml_yelp_review["Actual_Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [16]:
y.value_counts()

1    771908
0    237626
Name: Actual_Sentiment, dtype: int64

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((807627,), (201907,), (807627,), (201907,))

In [18]:
#Now it is time to preprocess the reviews because all these modifications will directly affect the classifier’s performance.
# As we are going to use words as features so we can use some text formatting techniques which will help us in feature extraction
#  including removing punctuation marks/digits ,and also stop-words. In addition, the implementation of lemmatization words using NLTK
#   can be workable to maximize the performance. Tokenization is the last step to break reviews up into words and other meaningful tokens.
import string
#pip install nltk
import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords 
ENGLISH_STOP_WORDS = stopwords.words('english')

def my_tokenizer(sentence):

    listofwords = sentence.strip().split()          # to remove any space from beginning and the end of text
    tokenized_words = []    
    for word in listofwords:
        if not word in ENGLISH_STOP_WORDS:
            lemm_word = WordNetLemmatizer().lemmatize(word)
            if len(lemm_word)>0:
                tokenized_words.append(lemm_word)
    return(tokenized_words)

In [19]:
#lem = nltk.stem.wordnet.WordNetLemmatizer()
#nltk.download('wordnet')

In [20]:
#Vectorizing the text using TF_IDF

#By implementing the sklearn library, we can use TF_IDF vectorizing to find 
#the weighted words that occur more frequently in the document that leads to
# creation of the bag of words model. So our features will be the words or sequence of
# words of these reviews. We are going to explore different models with
# the combinations of n_grams (unigrams,bigrams,trigrams).
#min_df float or int, default=1
#When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
# This value is also called cut-off in the literature. If float in range of [0.0, 1.0], 
# the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

#ngram_rangetuple (min_n, max_n), default=(1, 1)
#The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such 
#that min_n <= n <= max_n will be used. For example an ngram_range of (1, 1) means only unigrams, (1, 2) means
# unigrams and bigrams, and (2, 2) means only bigrams. Only applies if analyzer is not callable.

#tokenizer callable, default=None
#Override the string tokenization step while preserving the preprocessing and n-grams generation steps. 
#Only applies if analyzer == 'word'.


from sklearn.feature_extraction.text import TfidfVectorizer
vect_1 = TfidfVectorizer(min_df=100,tokenizer=my_tokenizer, stop_words={'english'}, ngram_range=(1,3)).fit(X_train)
X_train1 = vect_1.transform(X_train)
X_test1 = vect_1.transform(X_test)


In [22]:
#counting most repetitive words 
word_counts = np.array(np.sum(X_train1, axis=0)).reshape((-1,))
words = np.array(vect_1.get_feature_names())
words_df = pd.DataFrame({"word":words, "count":word_counts})
words_df.sort_values(by="count",ascending=False).head(20)

Unnamed: 0,word,count
20184,food,16256.512063
42479,place,15897.399134
25242,great,15711.074147
24084,good,15048.560139
50479,service,13057.707995
57633,time,12605.651439
22530,get,10409.801529
31651,like,10291.548429
39072,one,10017.469266
3865,back,9919.693044


In [23]:
#pip install sklearn --upgrade

In [24]:
#pip install -U imbalanced-learn
#pip install imblearn
#pip install delayed


In [25]:
#The only challenge that we’ve faced was about balancing the train dataset in terms of having the equal 
#numbers of positive and negative reviews for our two classes. 
#So we are using SMOTE to balanace our target(class) column.
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

#SMOTE the training data
sm = SMOTE(random_state=1)
X_bal, y_bal = sm.fit_resample(X_train1, y_train)


In [48]:
X_bal.shape

(1234728, 65450)

In [52]:
print(type(X_bal))

<class 'scipy.sparse.csr.csr_matrix'>


In [26]:
y_bal.value_counts()

1    617364
0    617364
Name: Actual_Sentiment, dtype: int64

In [41]:
df_y_bal = pd.DataFrame(y_bal)

#### Wanted to show here the real life problem when we run out of memeory:)

In [49]:
#cannot happen
df_X_bal = pd.DataFrame(X_bal.toarray())

MemoryError: Unable to allocate 602. GiB for an array with shape (1234728, 65450) and data type float64

In [46]:
from azureml.data.datapath import DataPath
from azureml.data.dataset_factory import TabularDatasetFactory
#Usage: register_pandas_dataframe(dataframe, target, name, description=None, tags=None, show_progress=True)


datastore_default = workspace.get_default_datastore()
data_path_X_bal = DataPath(datastore=datastore_default, path_on_datastore='X_bal')
TabularDatasetFactory.register_pandas_dataframe(df_X_bal, data_path_X_bal, 'X_bal', show_progress=True)
data_path_y_bal = DataPath(datastore=datastore_default, path_on_datastore='y_bal')
TabularDatasetFactory.register_pandas_dataframe(df_y_bal, data_path_y_bal, 'y_bal', show_progress=True)

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.
Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to X_bal/19df57e1-2d75-41b7-97f9-e43618be3dbc/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to y_bal/68378b0d-ba32-4e31-b1d9-5a56df46245e/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


{
  "source": [
    "('workspaceblobstore', 'y_bal/68378b0d-ba32-4e31-b1d9-5a56df46245e/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "8d065503-9942-4107-89de-80ed849257f8",
    "name": "y_bal",
    "version": 1,
    "workspace": "Workspace.create(name='Houston-techsummit-workspace', subscription_id='7ca151c5-e4f7-4663-9583-834f4e0e6ed4', resource_group='calcutta_demos')"
  }
}

In [None]:
X_train1 = vect_1.transform(X_train)
X_test1 = vect_1.transform(X_test)

In [27]:
# fitting a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# Fitting Logistic regression to the training set
logreg = LogisticRegression(solver='lbfgs',multi_class='auto',random_state=1)
logreg.fit(X_bal, y_bal)

# Predicting the test set results
y_pred_logreg = logreg.predict(X_test1)

# Training score
print(f"Score on training set: {logreg.score(X_train1,y_train)}")
print(f"Score on test set: {logreg.score(X_test1,y_test)}")

Score on training set: 0.935764901371549
Score on test set: 0.9266840674171772


In [28]:
from sklearn.metrics import classification_report
print('The Confusion Matrix')
con_mat_lr = confusion_matrix(y_test, y_pred_logreg)
df_cm_lr = pd.DataFrame(con_mat_lr, columns = ['Predicted 0','Predicted 1'], index = ['True 0','True 1'])
display(df_cm_lr)
print('The Classification report')
report = classification_report(y_test, y_pred_logreg, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report

The Confusion Matrix
The Classification report


Unnamed: 0,Predicted 0,Predicted 1
True 0,42496,4867
True 1,9936,144608


Unnamed: 0,precision,recall,f1-score,support
0,0.810497,0.89724,0.851666,47363.0
1,0.967439,0.935708,0.951309,154544.0
accuracy,0.926684,0.926684,0.926684,0.926684
macro avg,0.888968,0.916474,0.901487,201907.0
weighted avg,0.930624,0.926684,0.927935,201907.0
