In [1]:
import pandas as pd

df = pd.read_csv('../data/AMAZON-REVIEW-DATA-CLASSIFICATION.csv')
df.head(10)

Unnamed: 0,reviewText,summary,verified,time,log_votes,isPositive
0,"PURCHASED FOR YOUNGSTER WHO\nINHERITED MY ""TOO...",IDEAL FOR BEGINNER!,True,1361836800,0.0,1.0
1,unable to open or use,Two Stars,True,1452643200,0.0,0.0
2,Waste of money!!! It wouldn't load to my system.,Dont buy it!,True,1433289600,0.0,0.0
3,I attempted to install this OS on two differen...,I attempted to install this OS on two differen...,True,1518912000,0.0,0.0
4,I've spent 14 fruitless hours over the past tw...,Do NOT Download.,True,1441929600,1.098612,0.0
5,I purchased the home and business because I wa...,Quicken home and business not for amatures,True,1335312000,0.0,0.0
6,The download doesn't take long at all. And it'...,Great!,True,1377993600,0.0,1.0
7,This program is positively wonderful for word ...,Terrific for practice.,False,1158364800,2.397895,1.0
8,Fantastic protection!! Great customer support!!,Five Stars,True,1478476800,0.0,1.0
9,Obviously Win 7 now the last great operating s...,Five Stars,True,1471478400,0.0,1.0


In [2]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   reviewText  69989 non-null  object 
 1   summary     69986 non-null  object 
 2   verified    70000 non-null  bool   
 3   time        70000 non-null  int64  
 4   log_votes   70000 non-null  float64
 5   isPositive  70000 non-null  float64
dtypes: bool(1), float64(2), int64(1), object(2)
memory usage: 2.7+ MB
None


In [3]:
print(df.isna().sum())

reviewText    11
summary       14
verified       0
time           0
log_votes      0
isPositive     0
dtype: int64


In [4]:
print(df.describe())

               time     log_votes    isPositive
count  7.000000e+04  70000.000000  70000.000000
mean   1.370112e+09      0.535257      0.624171
std    1.149986e+08      0.962677      0.484340
min    9.421920e+08      0.000000      0.000000
25%    1.322870e+09      0.000000      0.000000
50%    1.406160e+09      0.000000      1.000000
75%    1.448669e+09      1.098612      1.000000
max    1.538438e+09      7.110696      1.000000


In [5]:
# Install the library and functions
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravikiran.bhonagiri\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravikiran.bhonagiri\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Let's get a list of stop words from the NLTK library
stop = stopwords.words('english')

# These words are important for our problem. We don't want to remove them.
excluding = ['against', 'not', 'don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# New stop word list
stop_words = [word for word in stop if word not in excluding]

snow = SnowballStemmer('english')

def process_text(texts): 
    final_text_list=[]
    for sent in texts:
        
        # Check if the sentence is a missing value
        if isinstance(sent, str) == False:
            sent = ""
            
        filtered_sentence=[]
        
        sent = sent.lower() # Lowercase 
        sent = sent.strip() # Remove leading/trailing whitespace
        sent = re.sub('\s+', ' ', sent) # Remove extra space and tabs
        sent = re.compile('<.*?>').sub('', sent) # Remove HTML tags/markups:
        
        for w in word_tokenize(sent):
            # We are applying some custom filtering here, feel free to try different things
            # Check if it is not numeric and its length>2 and not in stop words
            if(not w.isnumeric()) and (len(w)>2) and (w not in stop_words):  
                # Stem and add to filtered list
                filtered_sentence.append(snow.stem(w))
        final_string = " ".join(filtered_sentence) #final string of cleaned words
 
        final_text_list.append(final_string)
        
    return final_text_list

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df[["reviewText", "summary", "time", "log_votes"]],
                                                  df["isPositive"],
                                                  test_size=0.10,
                                                  shuffle=True,
                                                  random_state=360
                                                 )

In [8]:
print("Processing the reviewText fields")
X_train["reviewText"] = process_text(X_train["reviewText"].tolist())
X_val["reviewText"] = process_text(X_val["reviewText"].tolist())

print("Processing the summary fields")
X_train["summary"] = process_text(X_train["summary"].tolist())
X_val["summary"] = process_text(X_val["summary"].tolist())

Processing the reviewText fields
Processing the summary fields


In [9]:
# Grab model features/inputs and target/output
numerical_features = ['time',
                      'log_votes']

text_features = ['summary',
                 'reviewText']

model_features = numerical_features + text_features
model_target = 'isPositive'

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

### COLUMN_TRANSFORMER ###
##########################

# Preprocess the numerical features
numerical_processor = Pipeline([
    ('num_scaler', MinMaxScaler()) # this can be skipped for trees
])

# Preprocess 1st text feature
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=50))
])

# Preprocess 2nd text feature (larger vocabulary)
text_precessor_1 = Pipeline([
    ('text_vect_1', CountVectorizer(binary=True, max_features=150))
])

# Combine all data preprocessors from above (add more, if you choose to define more!)
# For each processor/step specify: a name, the actual process, and finally the features to be processed
data_preprocessor = ColumnTransformer([
    ('numerical_pre', numerical_processor, numerical_features),
    ('text_pre_0', text_processor_0, text_features[0]),
    ('text_pre_1', text_precessor_1, text_features[1])
]) 

### PIPELINE ###
################

# Pipeline desired all data transformers, along with an estimator at the end
# Later you can set/reach the parameters using the names issued - for hyperparameter tuning, for example
pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('decision_tree', DecisionTreeClassifier(max_depth = 10,
                                             min_samples_leaf = 15))
])

# Visualize the pipeline
# This will come in handy especially when building more complex pipelines, stringing together multiple preprocessing steps
from sklearn import set_config
set_config(display='diagram')
pipeline

In [11]:
# Fit the Pipeline to training data
pipeline.fit(X_train, y_train.values)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = pipeline.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[1873  754]
 [ 731 3642]]
              precision    recall  f1-score   support

         0.0       0.72      0.71      0.72      2627
         1.0       0.83      0.83      0.83      4373

    accuracy                           0.79      7000
   macro avg       0.77      0.77      0.77      7000
weighted avg       0.79      0.79      0.79      7000

Accuracy (validation): 0.7878571428571428


In [13]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('decision_tree', RandomForestClassifier(n_estimators=150,
                                             max_depth = 10,
                                             min_samples_leaf = 15))
])

# Fit the Pipeline to training data
pipeline.fit(X_train, y_train.values)

In [14]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = pipeline.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[1631  996]
 [ 351 4022]]
              precision    recall  f1-score   support

         0.0       0.82      0.62      0.71      2627
         1.0       0.80      0.92      0.86      4373

    accuracy                           0.81      7000
   macro avg       0.81      0.77      0.78      7000
weighted avg       0.81      0.81      0.80      7000

Accuracy (validation): 0.8075714285714286


In [15]:
from sklearn.model_selection import GridSearchCV

### PIPELINE GRID_SEARCH ###
############################

# Parameter grid for GridSearch
param_grid={'decision_tree__max_depth': [10, 20, 30],#, 15, 25, 35, 45, 55, 75], 
            'decision_tree__min_samples_leaf': [5, 10],#, 15, 30],
           }

grid_search = GridSearchCV(pipeline, # Base model
                           param_grid, # Parameters to try
                           cv = 5, # Apply 5-fold cross validation
                           verbose = 1, # Print summary
                           n_jobs = -1 # Use all available processors
                          )

# Fit the GridSearch to our training data
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  8.4min finished


Best parameters:  {'decision_tree__max_depth': 30, 'decision_tree__min_samples_leaf': 5}
Best score:  0.8423015873015874


In [16]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = grid_search.best_estimator_.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[1978  649]
 [ 470 3903]]
              precision    recall  f1-score   support

         0.0       0.81      0.75      0.78      2627
         1.0       0.86      0.89      0.87      4373

    accuracy                           0.84      7000
   macro avg       0.83      0.82      0.83      7000
weighted avg       0.84      0.84      0.84      7000

Accuracy (validation): 0.8401428571428572


In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Parameter grid for GridSearch
param_grid={'decision_tree__max_depth': [10, 20, 30],#, 15, 25, 35, 45, 55, 75], 
            'decision_tree__min_samples_leaf': [5, 10],#, 15, 30],
           }

random_search = RandomizedSearchCV(pipeline, # Base model
                                 param_grid, # Parameters to try
                                 cv = 5, # Apply 5-fold cross validation
                                 verbose = 1, # Print summary
                                 n_jobs = -1 # Use all available processors
                                )

# Fit the GridSearch to our training data
random_search.fit(X_train, y_train)

print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  9.5min finished


Best parameters:  {'decision_tree__min_samples_leaf': 5, 'decision_tree__max_depth': 30}
Best score:  0.8421587301587301


In [18]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = random_search.best_estimator_.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[1977  650]
 [ 485 3888]]
              precision    recall  f1-score   support

         0.0       0.80      0.75      0.78      2627
         1.0       0.86      0.89      0.87      4373

    accuracy                           0.84      7000
   macro avg       0.83      0.82      0.82      7000
weighted avg       0.84      0.84      0.84      7000

Accuracy (validation): 0.8378571428571429


### Must Try

Preprocessing:

We can usually improve performance with some additional work. You can try the following:
   Change the feature extractor to TF, TF-IDF. Also experiment with different vocabulary size.
   Come up with some other features such as having certain punctuations, all-capitalized words or some words that might be    useful in this problem.

Hyperparameter Tuning: 
        Always a good idea to try other parameter ranges and/or combinations of parameters. If training time is a priority, try RandomizedSearchCV instead of GridSearchCV, it's much faster and with almost as good results. 