In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 2/SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 6: Directory settings  ####

# Resolve the project layout relative to the notebook's working directory:
# 'home_dir' is the current directory, 'main_dir' is its parent (the project
# folder) and 'data_dir' is the "data" folder inside the project.
from pathlib import Path
home_dir = Path(".").resolve()
main_dir = home_dir.parent
# Kept as a plain string because later cells build file names by concatenation.
data_dir = f"{main_dir}/data"




In [None]:
#=================================================-
#### Slide 7: Loading packages  ####

# Helper packages: file-system access, data frames, arrays, serialization, plotting.
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
# Packages with tools for text processing.
import nltk
# Fetch the 'vader_lexicon' resource through NLTK's downloader.
# NOTE(review): presumably needs network access the first time it runs - confirm for offline setups.
nltk.download('vader_lexicon')
# Packages for working with text data and analyzing sentiment.
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from sklearn.feature_extraction.text import CountVectorizer
# Packages to build and measure the performance of a logistic regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing




In [None]:
#=================================================-
#### Slide 8: Import data we saved   ####

# Load the pickled target labels and document-term matrix saved earlier.
# Each file is opened in a 'with' block so the handle is closed as soon as
# loading finishes (a bare open() inside pickle.load() leaves it open).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(data_dir + "/score_labels.sav", "rb") as labels_file:
    score_labels = pickle.load(labels_file)
with open(data_dir + '/DTM_matrix.sav', "rb") as dtm_file:
    DTM_matrix = pickle.load(dtm_file)




In [None]:
#=================================================-
#### Slide 9: Text classification - convert DTM to array  ####

# Materialize the document-term matrix as an array via its toarray() method.
DTM_array = DTM_matrix.toarray()
# Peek at a small sample of the array: rows 1 through 3
# (the slice starts at index 1, so row 0 is not shown).
print(DTM_array[1:4])




In [None]:
#=================================================-
#### Slide 13: Model building - split the dataset  ####

# Hold out part of the data for evaluation: 70% of the rows go to training,
# the rest to testing. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    DTM_array, score_labels, train_size = 0.70, random_state = 1234
)




In [None]:
#=================================================-
#### Slide 14: Model building - split the dataset  ####

# Report the number of rows in each piece of the split, in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(len(split_part))




In [None]:
#=================================================-
#### Slide 24: Categorical to binary target variable  ####

# Create the label binarizer and learn the label classes from the test labels.
lb = preprocessing.LabelBinarizer()
lb.fit(y_test)

# Replace y_test with its binary integer encoding.
y_test = lb.transform(y_test)




In [None]:
#=================================================-
#### Slide 31: Logistic regression: build  ####

# Create an unfitted logistic regression classifier with scikit-learn's
# default settings, then show the estimator.
log_model = LogisticRegression()
print(log_model)




In [None]:
#=================================================-
#### Slide 32: Logistic regression: fit  ####

# Train the classifier on the training rows and their labels.
# fit() hands back the estimator it was called on, so the name is simply rebound.
log_model = log_model.fit(X_train, y_train)




In [None]:
#=================================================-
#### Slide 34: Logistic regression: predict (cont'd)  ####

# Predict class labels for the held-out test rows.
y_pred = log_model.predict(X_test)
print(y_pred)
# Convert y_pred to binary integer format using the binarizer that was already
# fitted on the test labels. Re-fitting on the predictions (fit_transform)
# would derive the 0/1 mapping from whichever classes the model happened to
# predict, so it could disagree with the encoding of y_test - for example,
# if only one class is predicted every prediction would be encoded as 0.
y_pred = lb.transform(y_pred)




In [None]:
#=================================================-
#### Slide 36: Exercise 1  ####






In [None]:
#=================================================-
#### Slide 43: Confusion matrix and accuracy  ####

# Cross-tabulate actual vs. predicted test labels.
conf_matrix_test = metrics.confusion_matrix(y_true = y_test, y_pred = y_pred)
print(conf_matrix_test)

# Share of test rows whose predicted label matches the actual label.
test_accuracy_score = metrics.accuracy_score(y_true = y_test, y_pred = y_pred)
print("Accuracy on test data: ", test_accuracy_score)




In [None]:
#=================================================-
#### Slide 44: Classification report  ####

# Display names for the two encoded classes, in the order they are reported.
target_names = ['Negative', 'Positive']

# Build the full per-class classification report as text and show it.
class_report = metrics.classification_report(
    y_test, y_pred, target_names = target_names
)
print(class_report)




In [None]:
#=================================================-
#### Slide 45: Classification report (cont'd)  ####

# Show the classification report built in the previous cell once more.
print(class_report)




In [None]:
#=================================================-
#### Slide 46: Getting probabilities instead of class labels  ####

# Get per-class probabilities instead of predicted labels
# (one row per test document, one column per class).
test_probabilities = log_model.predict_proba(X_test)
print(test_probabilities[0:5, :])
# Keep only the second column, i.e. the probability of the class at index 1
# (presumably the positive class - confirm against log_model.classes_).
test_predictions = test_probabilities[:, 1]
# Show the extracted column; the original printed test_probabilities a
# second time, so the new vector was never actually displayed.
print(test_predictions[0:5])




In [None]:
#=================================================-
#### Slide 47: Computing FPR, TPR and threshold  ####

# Compute the points of the ROC curve from the binary test labels and the
# predicted probabilities: false positive rates, true positive rates and the
# thresholds at which they were measured.
roc_values = metrics.roc_curve(y_test, test_predictions)
fpr, tpr, threshold = roc_values
print("False positive: ", fpr)
print("True positive: ", tpr)
print("Threshold: ", threshold)




In [None]:
#=================================================-
#### Slide 48: Computing AUC  ####

# Get the AUC value from the predicted probabilities, i.e. the same scores
# used to build the ROC curve. Scoring the hard 0/1 predictions (y_pred)
# instead would reduce the curve to a single threshold and report an AUC
# that does not match the plotted curve.
auc = metrics.roc_auc_score(y_test, test_predictions)
print("Area under the ROC curve: ", auc)




In [None]:
#=================================================-
#### Slide 49: Putting it all together: ROC plot  ####

# Draw the ROC plot: the model's curve in blue (labelled with its AUC) and
# the diagonal chance line in dashed red, then add legend, title and axis labels.
# Results are assigned to '_' so running a line on its own prints nothing.
_ = plt.plot(fpr, tpr, 'b', label = f'AUC = {auc:0.2f}')
_ = plt.plot([0, 1], [0, 1], 'r--')
_ = plt.legend(loc = 'lower right')
_ = plt.title('Receiver Operator Characteristic')
_ = plt.xlabel('False Positive Rate')
_ = plt.ylabel('True Positive Rate')
_ = plt.show()


