In [2]:
import pandas as pd

# Read the review dataset CSV; the path is relative to the working directory.
DATA_PATH = '../data/AMAZON-REVIEW-DATA-CLASSIFICATION.csv'
df = pd.read_csv(DATA_PATH)



In [3]:
# Preview the first ten rows of the frame.
df.head(n=10)

Unnamed: 0,reviewText,summary,verified,time,log_votes,isPositive
0,"PURCHASED FOR YOUNGSTER WHO\nINHERITED MY ""TOO...",IDEAL FOR BEGINNER!,True,1361836800,0.0,1.0
1,unable to open or use,Two Stars,True,1452643200,0.0,0.0
2,Waste of money!!! It wouldn't load to my system.,Dont buy it!,True,1433289600,0.0,0.0
3,I attempted to install this OS on two differen...,I attempted to install this OS on two differen...,True,1518912000,0.0,0.0
4,I've spent 14 fruitless hours over the past tw...,Do NOT Download.,True,1441929600,1.098612,0.0
5,I purchased the home and business because I wa...,Quicken home and business not for amatures,True,1335312000,0.0,0.0
6,The download doesn't take long at all. And it'...,Great!,True,1377993600,0.0,1.0
7,This program is positively wonderful for word ...,Terrific for practice.,False,1158364800,2.397895,1.0
8,Fantastic protection!! Great customer support!!,Five Stars,True,1478476800,0.0,1.0
9,Obviously Win 7 now the last great operating s...,Five Stars,True,1471478400,0.0,1.0


In [4]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   reviewText  69989 non-null  object 
 1   summary     69986 non-null  object 
 2   verified    70000 non-null  bool   
 3   time        70000 non-null  int64  
 4   log_votes   70000 non-null  float64
 5   isPositive  70000 non-null  float64
dtypes: bool(1), float64(2), int64(1), object(2)
memory usage: 2.7+ MB


In [5]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

reviewText    11
summary       14
verified       0
time           0
log_votes      0
isPositive     0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,time,log_votes,isPositive
count,70000.0,70000.0,70000.0
mean,1370112000.0,0.535257,0.624171
std,114998600.0,0.962677,0.48434
min,942192000.0,0.0,0.0
25%,1322870000.0,0.0,0.0
50%,1406160000.0,0.0,1.0
75%,1448669000.0,1.098612,1.0
max,1538438000.0,7.110696,1.0


In [7]:
# Class balance of the target label.
target_labels = df["isPositive"]
target_labels.value_counts()

1.0    43692
0.0    26308
Name: isPositive, dtype: int64

In [8]:
# Download the NLTK data packages used below: the tokenizer models and the stop-word list.
import nltk

nltk.download('punkt')
# Recent NLTK releases (3.8.2 and later) load the tokenizer from 'punkt_tab'
# rather than the pickled 'punkt' models; fetch it as well so word_tokenize
# does not raise LookupError on current installs.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravikiran.bhonagiri\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravikiran.bhonagiri\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Start from NLTK's built-in English stop-word list.
stop = stopwords.words('english')

# Negations carry sentiment for this problem, so they must survive stop-word removal.
excluding = [
    'against', 'not',
    'don', "don't",
    'ain',
    'aren', "aren't",
    'couldn', "couldn't",
    'didn', "didn't",
    'doesn', "doesn't",
    'hadn', "hadn't",
    'hasn', "hasn't",
    'haven', "haven't",
    'isn', "isn't",
    'mightn', "mightn't",
    'mustn', "mustn't",
    'needn', "needn't",
    'shouldn', "shouldn't",
    'wasn', "wasn't",
    'weren', "weren't",
    'won', "won't",
    'wouldn', "wouldn't",
]

# Stop words that will actually be dropped: the NLTK list minus the negations above.
stop_words = list(filter(lambda candidate: candidate not in excluding, stop))

# English Snowball stemmer used by process_text.
snow = SnowballStemmer('english')

def process_text(texts):
    """Clean, tokenize, filter, and stem a collection of raw text entries.

    Each entry is lowercased, stripped, whitespace-normalized, and cleared of
    HTML-like tags, then tokenized with NLTK's ``word_tokenize``.  Tokens that
    are numeric, shorter than three characters, or present in the module-level
    ``stop_words`` list are dropped; the remaining tokens are stemmed with the
    module-level ``snow`` stemmer.

    Args:
        texts: Iterable of raw entries.  Any entry that is not a ``str``
            (e.g. a NaN standing in for a missing value) is treated as ``""``.

    Returns:
        list[str]: One space-joined string of stemmed tokens per input entry,
        in input order.  Entries with no surviving tokens yield ``""``.
    """
    # Compile the patterns once per call rather than once per entry.
    # Raw strings: '\s' inside a plain literal is an invalid escape sequence.
    whitespace_re = re.compile(r'\s+')
    html_tag_re = re.compile(r'<.*?>')

    # Set membership is O(1); the list was being scanned for every token.
    stop_set = set(stop_words)

    final_text_list = []
    for sent in texts:
        # Missing values arrive as non-strings; process them as empty text.
        if not isinstance(sent, str):
            sent = ""

        sent = sent.lower().strip()
        # Collapse runs of whitespace (including newlines and tabs) to one space.
        sent = whitespace_re.sub(' ', sent)
        # Replace tags with a space, not '', so the words on either side of a
        # tag ("good<br>bad") are not glued into a single token.
        sent = html_tag_re.sub(' ', sent)

        # Keep non-numeric tokens longer than two characters that are not
        # stop words, and stem each survivor.
        filtered_sentence = [
            snow.stem(w)
            for w in word_tokenize(sent)
            if not w.isnumeric() and len(w) > 2 and w not in stop_set
        ]

        final_text_list.append(" ".join(filtered_sentence))

    return final_text_list


In [11]:
from sklearn.model_selection import train_test_split

# Columns handed to the model; the label column is split alongside them.
feature_columns = ["reviewText", "summary", "time", "log_votes"]

# Options shared by both splits.
split_options = dict(shuffle=True, random_state=360)

# First cut: 80% train, 20% held out.
X_train, X_val, y_train, y_val = train_test_split(
    df[feature_columns],
    df["isPositive"],
    test_size=0.20,
    **split_options,
)

# Second cut: divide the held-out part evenly into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.5,
    **split_options,
)

In [12]:
# (column, progress message) pairs, handled in this order.
text_jobs = [
    ("reviewText", "Processing the reviewText fields"),
    ("summary", "Processing the summary fields"),
]

for column, message in text_jobs:
    print(message)
    # Clean the column in each split: train first, then validation, then test.
    for split in (X_train, X_val, X_test):
        split[column] = process_text(split[column].tolist())


Processing the reviewText fields
Processing the summary fields


In [13]:
# Model inputs grouped by type, plus the name of the label column.
numerical_features = ['time', 'log_votes']
text_features = ['summary', 'reviewText']

# Numerical columns first, then the text columns.
model_features = [*numerical_features, *text_features]
model_target = 'isPositive'

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### COLUMN_TRANSFORMER ###
##########################

# Numerical features: fill missing values with the column mean, then min-max scale.
numerical_processor = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', MinMaxScaler()),
])

# 1st text feature: binary bag-of-words capped at 50 vocabulary terms.
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=50)),
])

# 2nd text feature: binary bag-of-words with a larger vocabulary (150 terms).
text_processor_1 = Pipeline([
    ('text_vect_1', CountVectorizer(binary=True, max_features=150)),
])
# Backward-compatible alias for the original, misspelled name.
text_precessor_1 = text_processor_1

# Combine all data preprocessors from above (add more, if you choose to define more!)
# For each processor/step specify: a name, the actual process, and finally the features to be processed
data_preprocessor = ColumnTransformer(
    [
        ('numerical_pre', numerical_processor, numerical_features),
        ('text_pre_0', text_processor_0, text_features[0]),
        ('text_pre_1', text_processor_1, text_features[1]),
    ],
    # Always produce a dense array. With the default threshold the output is
    # sparse only when the stacked density is low enough, so an unconditional
    # .toarray() on the result fails as soon as the transformer returns dense.
    sparse_threshold=0,
)

### DATA PREPROCESSING ###
##########################

print('Datasets shapes before processing: ', X_train.shape, X_val.shape, X_test.shape)

# Fit on the training split only; reuse the fitted transformer for validation and test.
# NOTE: this rebinds X_train/X_val/X_test from DataFrames to arrays, so the cell
# cannot be re-run without re-running the split and text-cleaning cells first.
X_train = data_preprocessor.fit_transform(X_train)
X_val = data_preprocessor.transform(X_val)
X_test = data_preprocessor.transform(X_test)

print('Datasets shapes after processing: ', X_train.shape, X_val.shape, X_test.shape)

Datasets shapes before processing:  (56000, 4) (7000, 4) (7000, 4)
Datasets shapes after processing:  (56000, 202) (7000, 202) (7000, 202)


We will call the Sagemaker LinearLearner() below.

Compute power: We will use the train_instance_count and train_instance_type parameters. This example uses an ml.m4.xlarge instance for training. We can change the instance type to suit our needs (for example, GPU instances for neural networks).

Model type: predictor_type is set to binary_classifier, as we have a binary classification problem here; multiclass_classifier could be used if there are 3 or more classes involved, or 'regressor' for a regression problem.

In [15]:
import sagemaker

# Look up the execution role, then build the LinearLearner estimator with it.
execution_role = sagemaker.get_execution_role()
linear_classifier = sagemaker.LinearLearner(
    role=execution_role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    predictor_type='binary_classifier',
)

ValueError: Must setup local AWS configuration with a region supported by SageMaker.

In [16]:
def _as_record_set(features, labels, channel):
    """Build a record set for `channel` from features and labels cast to float32."""
    return linear_classifier.record_set(
        features.astype("float32"),
        labels.values.astype("float32"),
        channel=channel,
    )


# One record set per data channel, created in train / validation / test order.
train_records = _as_record_set(X_train, y_train, 'train')
val_records = _as_record_set(X_val, y_val, 'validation')
test_records = _as_record_set(X_test, y_test, 'test')

NameError: name 'linear_classifier' is not defined

The fit() function applies a distributed version of the Stochastic Gradient Descent (SGD) algorithm to the data we send to it. We disabled logs with logs=False; you can remove that parameter to see more details about the process. This process takes about 3-4 minutes on an ml.m4.xlarge instance.

In [17]:
%%time
# Pass the train, validation, and test record sets to a single fit call, with logs disabled.
data_channels = [train_records, val_records, test_records]
linear_classifier.fit(data_channels, logs=False)

NameError: name 'linear_classifier' is not defined

We can use SageMaker analytics to get performance metrics of our choice on the test set. This doesn't require us to deploy our model. Since this is a binary classification problem, we can check the accuracy.

In [18]:
# Request the test-set accuracy metric for the estimator's current training job.
test_accuracy_analytics = sagemaker.analytics.TrainingJobAnalytics(
    linear_classifier._current_job_name,
    metric_names=['test:binary_classification_accuracy'],
)
# Last expression of the cell, so the resulting frame is displayed.
test_accuracy_analytics.dataframe()

NameError: name 'linear_classifier' is not defined


In the last part of this exercise, we will deploy our model to another instance of our choice. This will allow us to use the model in a production environment. Deployed endpoints can be used with other AWS services such as Lambda and API Gateway. If you are interested, a nice walkthrough is available here: https://aws.amazon.com/blogs/machine-learning/call-an-amazon-sagemaker-model-endpoint-using-amazon-api-gateway-and-aws-lambda/

Run the following cell to deploy the model. We can use different instance types, such as ml.t2.medium, ml.c4.xlarge, etc. This will take some time to complete.

In [19]:
%%time
# Endpoint settings: instance count, instance type, and endpoint name.
endpoint_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.t2.medium',
    'endpoint_name': 'LinearLearnerEndpoint',
}
linear_classifier_predictor = linear_classifier.deploy(**endpoint_settings)

NameError: name 'linear_classifier' is not defined

In [20]:
import numpy as np

# Split the test data into 25 batches (np.array_split takes the number of
# sections, not the batch size) and send each batch to the endpoint.
test_batches = np.array_split(X_test.astype("float32"), 25)
prediction_batches = [
    linear_classifier_predictor.predict(batch) for batch in test_batches
]

# Print the 'score' label value of every prediction in the first batch only.
first_batch_scores = [
    pred.label['score'].float32_tensor.values[0]
    for pred in prediction_batches[0]
]
print(first_batch_scores)

NameError: name 'linear_classifier_predictor' is not defined

In [21]:
# Delete the endpoint created by the deploy step.
sagemaker_session = sagemaker.Session()
endpoint_to_remove = linear_classifier_predictor.endpoint
sagemaker_session.delete_endpoint(endpoint_to_remove)

ValueError: Must setup local AWS configuration with a region supported by SageMaker.