In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 1/SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 1 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 9: Directory settings  ####

# 'home_dir' is the resolved absolute path of the current working directory,
# 'main_dir' (the project folder) is its parent, and 'data_dir' is the
# "data" folder under 'main_dir', kept as a plain string.
from pathlib import Path
home_dir = Path(".").resolve()
main_dir = home_dir.parent
data_dir = f"{main_dir}/data"




In [None]:
#=================================================-
#### Slide 10: Loading packages  ####

# Helper packages.
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
# Packages for working with text data and analyzing sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from sklearn.feature_extraction.text import CountVectorizer

# Set up NLTK. Only the resources this notebook actually uses are fetched,
# instead of nltk.download('all'), which pulls every corpus and model that
# NLTK hosts:
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases look up punkt_tab)
#   stopwords         -> stopwords.words('english')
#   vader_lexicon     -> SentimentIntensityAnalyzer
import nltk
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


In [None]:
#=================================================-
#### Slide 15: Load text data  ####

# Read the article corpus from the csv file located in `data_dir`.
NYT = pd.read_csv(data_dir + '/NYT_article_data.csv')
print(NYT.columns)
# Keep a handle on just the "snippet" column.
NYT_snippet = NYT["snippet"]
# Preview the first ten snippets (positions 0 through 9).
print(NYT_snippet[0:10])




In [None]:
#=================================================-
#### Slide 16: Tokenization: split each snippet into words  ####

# Run the word tokenizer over every snippet, producing one list of tokens per
# snippet, in the same order as the snippets appear in the column.
NYT_tokenized = [word_tokenize(snippet_text) for snippet_text in NYT_snippet]





In [None]:
#=================================================-
#### Slide 18: Implementing pre-processing steps on a corpus  ####

# Build the loop-invariant helpers once, before the loop:
# - the English stop word list (also kept as a set for fast membership tests),
# - a single Porter stemmer that is reused for every word.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
stemmer = PorterStemmer()

# Create a list for clean snippets.
NYT_clean = [None] * len(NYT_tokenized)
# Create a list of word counts for each clean snippet.
word_counts_per_snippet = [None] * len(NYT_tokenized)
# Process words in all snippets.
for i, tokens in enumerate(NYT_tokenized):
    # 1. Convert to lower case.
    words = [token.lower() for token in tokens]

    # 2. Remove stop words.
    words = [word for word in words if word not in stop_word_set]

    # 3. Remove punctuation and any non-alphabetical characters.
    words = [word for word in words if word.isalpha()]

    # 4. Stem words.
    NYT_clean[i] = [stemmer.stem(word) for word in words]

    # Record the word count per snippet.
    word_counts_per_snippet[i] = len(NYT_clean[i])




In [None]:
#=================================================-
#### Slide 19: Inspect results  ####

# Show the first ten cleaned words of the snippets at positions 0, 5 and 10.
for snippet_index in (0, 5, 10):
    print(NYT_clean[snippet_index][:10])




In [None]:
#=================================================-
#### Slide 20: Removing empty and very short snippets  ####

# Preview the word counts recorded for the first ten cleaned snippets.
print(word_counts_per_snippet[:10])




In [None]:
#=================================================-
#### Slide 21: Removing empty and very short snippets (cont'd)  ####

# Turn the per-snippet word counts and the cleaned snippets into numpy arrays.
word_counts_array = np.asarray(word_counts_per_snippet)
NYT_array = np.array(NYT_clean, dtype=object)
print(NYT_array.shape[0])
# Collect the positions of all snippets that contain 5 or more words.
has_enough_words = word_counts_array >= 5
valid_snippets = np.flatnonzero(has_enough_words)
print(valid_snippets.size)




In [None]:
#=================================================-
#### Slide 22: Removing empty and very short snippets (cont'd)  ####

# Keep only the entries of NYT_array selected by `valid_snippets`.
# NYT_array is overwritten here, so re-running this cell on its own applies
# the same indices to the already-filtered array.
NYT_array = NYT_array[valid_snippets]
print(NYT_array.shape[0])

# Go back from a numpy array to a plain Python list.
NYT_clean = NYT_array.tolist()
print(NYT_clean[:3])




In [None]:
#=================================================-
#### Slide 23: Save processed text to file using .join()  ####

# Glue the words of every snippet back together, separated by single spaces,
# so that each snippet becomes one string.
NYT_clean_list = list(map(' '.join, NYT_clean))
print(NYT_clean_list[:5])





In [None]:
#=================================================-
#### Slide 26: Create a DTM  ####

# Create a `CountVectorizer` with its default settings.
vec = CountVectorizer()

# Fit the vectorizer on the joined snippets and build the document-term
# matrix (DTM) from them in one step.
X = vec.fit_transform(NYT_clean_list)
print(X.toarray()) #<- converts X to an array so it prints as a matrix




In [None]:
#=================================================-
#### Slide 32: Exercise 1  ####






In [None]:
#=================================================-
#### Slide 38: Text classification - classify (cont'd)  ####

# Initialize the `SentimentIntensityAnalyzer()`.
sid = SentimentIntensityAnalyzer()

# Print each of the first five snippets followed by its polarity scores.
for sentence in NYT_clean_list[:5]:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    for k in ss:
        print('{0}: {1}, '.format(k, ss[k]), end='')
    # The scores above are printed with end='', so end the line here;
    # otherwise the next snippet starts on the same line as these scores.
    print()




In [None]:
#=================================================-
#### Slide 40: Text classification - classify (cont'd)  ####

# This function outputs a list of labels, one per snippet:
def sentiment_analysis(texts, analyzer=None):
    """Label each text "positive" or "negative" from its compound polarity score.

    Args:
        texts: Iterable of strings to score.
        analyzer: Optional object exposing `polarity_scores(text)` that returns
            a mapping with a "compound" entry. When omitted, one
            `SentimentIntensityAnalyzer` is created and reused for all texts
            (rather than building a new analyzer for every text).

    Returns:
        A list with one label per text, in input order: "positive" when the
        compound score is >= 0 (a score of exactly 0 counts as positive),
        otherwise "negative".
    """
    list_of_scores = []
    for text in texts:
        # Create the default analyzer lazily, once, on the first text, so an
        # empty input never needs the analyzer at all.
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
        compound = analyzer.polarity_scores(text)["compound"]
        list_of_scores.append("positive" if compound >= 0 else "negative")
    return list_of_scores
# Label every snippet in NYT_clean_list, then preview a few of the labels.
score_labels = sentiment_analysis(NYT_clean_list)
# NOTE(review): the slice starts at index 1, so the label of the first
# snippet is not shown — confirm this is intended.
print(score_labels[1:5])




In [None]:
#=================================================-
#### Slide 42: Save results as a pickle  ####

# Save each result to its own pickle file in `data_dir`. Every file is opened
# in a `with` block so it is flushed and closed as soon as the dump finishes,
# instead of leaving the file handle open.
with open(data_dir + '/NYT_clean_list.sav', 'wb') as pickle_file:
    pickle.dump(NYT_clean_list, pickle_file)
with open(data_dir + '/score_labels.sav', 'wb') as pickle_file:
    pickle.dump(score_labels, pickle_file)
with open(data_dir + '/DTM_matrix.sav', 'wb') as pickle_file:
    pickle.dump(X, pickle_file)


