# **Title: 6.2 Exercise**
# **Author: Michael J. Montana**
# **Date: 23 April 2023**
# **Modified By: N/A**
# **Description: Working with Predictive Models**

In [26]:
import pandas as pd
import numpy as np
import nltk
from myclassesv3 import Normalize_Corpus

In [27]:
alexa=pd.read_csv('data/amazon_alexa.csv')
alexa.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wr...",1
3,5,31-Jul-18,Charcoal Fabric,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the light...",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [28]:
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

# <font color=2d5db5>**1. Using the Amazon Alexa reviews dataset, build a logistic regression model to predict positive or negative feedback based on review text. Be sure to run a test with something random you create (out of sample). Remember: 1 is positive, 0 is negative.**

In [29]:
norm = Normalize_Corpus() #instantiating class
cleanalexa = alexa.copy() #creating copy of data
#cleaning data
cleanalexa['verified_reviews']= norm.normalize(cleanalexa['verified_reviews'],html_stripping=True, contraction_expansion=True,
                                             accented_char_removal=True, text_lower_case=False,
                                             text_lemmatization=False, special_char_removal=True,
                                             stopword_removal=True, digits_removal=True,stopwords=stopword_list) #passing Tweet content to the selected normalizer functions

Stripping HTML...
Expanding Contratcions...




Removing Accent Markings...
Removing Special Characters...
Removing Stopwords...
Removing Numbers...
Your Data is Clean


In [30]:
#removing empty rows
clean_alexa_NA_free=cleanalexa.dropna().reset_index(drop=True)
clean_alexa_NA_free.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

clean_df = clean_alexa_NA_free

# split to test and train
train_corpus, test_corpus, train_label_nums, test_label_nums,train_label_names, test_label_names = train_test_split(np.array(clean_df['verified_reviews']),
                                                                                                                    np.array(clean_df['feedback']),
                                                                                                                    np.array(clean_df['variation']),
                                                                                                                    test_size=0.33, random_state=42)
#Building BOW
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)
# transform test articles into features
cv_test_features = cv.transform(test_corpus)
# print('Train features shape:', cv_train_features.shape)
# print('Test features shape:', cv_test_features.shape, '\n')

# Logistic Regression - page 316
print('Logistic Regression:')
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=3150, solver='lbfgs',
                        C=1, random_state=42, multi_class='auto')
lr.fit(cv_train_features, train_label_nums)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_nums, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('\tCV Accuracy (5-fold):', lr_bow_cv_scores)
print('\tMean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_nums)
print('\tTest Accuracy:', lr_bow_test_score, '\n')

Logistic Regression:
	CV Accuracy (5-fold): [0.93127962 0.94549763 0.93364929 0.93838863 0.94075829]
	Mean CV Accuracy: 0.937914691943128
	Test Accuracy: 0.9365384615384615 



In [32]:
# Using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# transform test articles into features
tv_test_features = tv.transform(test_corpus)

# Logistic Regression
print('Logistic Regression:')
# This takes quite a while to run, be patient.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, solver='lbfgs',
                        C=1, random_state=42, multi_class='auto')
lr.fit(tv_train_features, train_label_nums)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_nums, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('\tCV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('\thMean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_nums)
print('\tTest Accuracy:', lr_tfidf_test_score, '\n')

Logistic Regression:
	CV Accuracy (5-fold): [0.92654028 0.92654028 0.92417062 0.92417062 0.92654028]
	hMean CV Accuracy: 0.9255924170616113
	Test Accuracy: 0.9057692307692308 



# <font color=2d5db5>**2. At the end of Chapter 5, the author uses a custom-built class to summarize model performance. This class doesn’t actually exist (from the author) but you can make it a reality. Using the object you have from mnb_predictions, create something similar to the output on page 335. Feel free (but not obligated) to venture further into the label names and numbers (page 336) and confusion matrix (page 337).**

In [33]:
# model tuning for the the multinomial Naive Bayes model
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]}

gs_mnb = GridSearchCV(mnb_pipeline, param_grid, cv=5, verbose=2)
gs_mnb = gs_mnb.fit(train_corpus, train_label_nums)

print(gs_mnb.best_estimator_.get_params(), '\n')

cv_results = gs_mnb.cv_results_
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
                            'params': cv_results['params'],
                            'cv score (mean)': cv_results['mean_test_score'],
                            'cv score (std)': cv_results['std_test_score']})
results_df = results_df.sort_values(by=['rank'], ascending=True)
pd.set_option('display.max_colwidth', 100)
print('Modeling tuning results DF:', results_df, '\n')

best_mnb_test_score = gs_mnb.score(test_corpus, test_label_nums)
print('Test Accuracy:', best_mnb_test_score, '\n')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END ........mnb__alpha=1e-05, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END .......mnb__alpha=0.0001, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END .......mnb__alpha=0.0001, tfidf__ngram_

In [34]:
from myclassesv3 import Model_Evaluation
meu=Model_Evaluation() #instantiating class
mnb_predictions = gs_mnb.predict(test_corpus) #running model
unique_classes = list(set(test_label_nums))
meu.get_metrics(true_labels=test_label_nums,
                predicted_labels=mnb_predictions)
print('')
meu.display_classification_report(true_labels=test_label_nums,
                                  predicted_labels=mnb_predictions,
                                  classes=unique_classes)


Accuracy: 0.9308
Precision: 0.9279
Recall: 0.9308
F1 Score: 0.9154

              precision    recall  f1-score   support

           0       0.89      0.31      0.46        99
           1       0.93      1.00      0.96       941

    accuracy                           0.93      1040
   macro avg       0.91      0.65      0.71      1040
weighted avg       0.93      0.93      0.92      1040

