In [2]:
#import packages
#standard packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
%matplotlib inline
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
# stemming and lemmatizing
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize #makes tokens
from nltk.stem import PorterStemmer #word stemming
from nltk.stem import WordNetLemmatizer #lemmatizer
from nltk.corpus import stopwords #remove stopwords 
from nltk.sentiment.vader import SentimentIntensityAnalyzer #sentiment analysis

##support vector machine package
from sklearn import svm 
#evaulation metrics
from sklearn import metrics

import nltk
nltk.download('vader_lexicon')


#testing and training set splitting function
from sklearn.model_selection import train_test_split

import re ##regular expressions package that allows us to remove punctuation and change capitalization (among other things)
import string ## package that deals with string operations

from textblob import TextBlob # spell correcting plus others (e.g., sentiment)
print("packages imported")

packages imported


[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
filename = "NL-classification.csv"
df = pd.read_csv(filename)

text = df['Text'].tolist()
print(text[slice(10)])

['I went on a successful date with someone I felt sympathy and connection with.', 'I was happy when my son got 90% marks in his examination', 'I went to the gym this morning and did yoga.', 'We had a serious talk with some friends of ours who have been flaky lately. They understood and we had a good evening hanging out.', 'I went with grandchildren to butterfly display at Crohn Conservatory', 'I meditated last night.', 'I made a new recipe for peasant bread, and it came out spectacular!', 'I got gift from my elder brother which was really surprising me', 'YESTERDAY MY MOMS BIRTHDAY SO I ENJOYED', 'Watching cupcake wars with my three teen children']


First part keeps all words in their sentences in the array, but no lemmatizing or number removal
Second part lemmatizes and removes numbers/punctuation, but collapses everything into one long sentence string, and we want to keep it as an array

Next: try and run through the array and lemmatize and remove numbers without losing the array of sentences
-> This might also fix the Kernel crash because the array will be shorter and deeper

In [4]:
example_text_1 = text

# use bag of words to turn text into a vector
vectorizer = CountVectorizer()
text_vectors = vectorizer.fit_transform(example_text_1)

print(vectorizer.get_feature_names_out())
print(text_vectors.toarray())

# we can at (binary = true) to fix this!
vectorizer = CountVectorizer(binary=True)
text_vectors = vectorizer.fit_transform(example_text_1)

print(vectorizer.get_feature_names_out())
print(text_vectors.toarray())

['00' '000' '00am' ... 'zootopia' 'zumba' 'zverev']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
['00' '000' '00am' ... 'zootopia' 'zumba' 'zverev']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
pattern = r'[0-9]'

new_list = []
replace = []

for s in example_text_1:
    s = s.lower()
    s = re.sub(pattern, '', s)
    s = re.sub(r'[^\w\s]','', s)
    s = word_tokenize(s)
    for word in s:
        if word not in stop_words:
            replace.append(lemmatizer.lemmatize(word))
    re2 = " ".join(replace)
    new_list.append(re2)
    replace = []


print(new_list[slice(5)])

# use bag of words to turn text into a vector
vectorizer = CountVectorizer()
text_vectors = vectorizer.fit_transform(new_list)

print(vectorizer.get_feature_names_out())
print(text_vectors.toarray())

##------------------------------------------------------------------------------------------##
## Now let's try some of our own sentences that aren't already classified
sent1 = "I enjoyed a successful evening cupcake with my grandchildren" #bonding
sent2 = "I happy bread, surprising myself with a spectacular recipe"   #achievement
sent3 = "I went on walk"                                               #exercise

new_sentences = [sent1, sent2, sent3]

new_list2 = []
replace2 = []

for s in new_sentences:
    s = s.lower()
    s = re.sub(pattern, '', s)
    s = re.sub(r'[^\w\s]','', s)
    s = word_tokenize(s)
    for word in s:
        if word not in stop_words:
            replace2.append(lemmatizer.lemmatize(word))
    re3 = " ".join(replace2)
    new_list2.append(re3)
    replace2 = []


print(new_list2[slice(5)])

# use bag of words to turn text into a vector
new_vectors = vectorizer.transform(new_list2)

print(vectorizer.get_feature_names_out())
print(text_vectors.toarray())

##NEW SENTENCES called "new_vectors" -> these are sentences that have not been included in either training or testing data


##This is the sentiment analysis base code...   can be implemented into more complex things
analyzer = SentimentIntensityAnalyzer()
score = analyzer.polarity_scores("I hate everything")
total_score = score['pos'] - score['neg']
com_score = score['compound']
print(com_score)
print(total_score)

['went successful date someone felt sympathy connection', 'happy son got mark examination', 'went gym morning yoga', 'serious talk friend flaky lately understood good evening hanging', 'went grandchild butterfly display crohn conservatory']
['aadhar' 'aagra' 'aare' ... 'zootopia' 'zumba' 'zverev']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['enjoyed successful evening cupcake grandchild', 'happy bread surprising spectacular recipe', 'went walk']
['aadhar' 'aagra' 'aare' ... 'zootopia' 'zumba' 'zverev']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
-0.5719
-0.787


Notes for meeting:
- Can add extra stop words we want removed? ie "na" from gonna? how to add extra stopwords to the english removal
- hashtags are a pain in the ass: your#fabulous self -> lemmatizes to yourfabulous
- Some weird things get removed as stopwords... "o" from the "O madam"
- Uppercase I and We are not stopwords but lowercase i and we are??

In [13]:
## Add the sentiment analysis to array
sentiment_array = np.array([])

for s in example_text_1:
    score = analyzer.polarity_scores(s)
    com_score = score['compound']
    sentiment_array = np.append(sentiment_array, [com_score])

#print(sentiment_array[slice(10)])

#text_vectors = np.append(text_vectors, sentiment_array, axis = 1)
    

In [None]:
## Start of ML protocol on NLP'd data
#print(text_vectors[1])

labels = df['Label'].tolist()
print(labels[slice(5)])

emotion_list = []

for e in labels:
    if e == "affection":
        emotion_list.append(1)
    else:
        emotion_list.append(0)

print(emotion_list[slice(10)])

In [None]:
## generate some example data
## this is the same data as before, however, the set of points have been
## combined into X
## 0 indicates the point is a blue point and 1 indicates it is an orange
## point, so the SVM here is "asking" is the point orange in the decision
X = text_vectors
Y = emotion_list

# split the data into a 70% for training
# and 30 % for testing... using a specified random_state so that 
# the random split is the "same" everytime we run the cell
# we typically want to specifiy the random_state when we are writing code
# and debugging, otherwise changes in the output may be due to the random
# split of testing/training data rather than an error in the code
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3,random_state=109) # 70% training and 30% test

#Generate the SVM classifier
SVM_classifier = svm.SVC(kernel='linear') # Linear Kernel

#Now train the SVM on the training data from the data set using the .fit function
SVM_classifier.fit(X_train, y_train)

In [None]:
## Now how do we tell if the SVM is working well?
## We will use the testing set to see if the SVM 
## classifier is classifying points correctly.

#define a new variable y_predicted
#which will predict the output values for the X values in the testing set
y_predicted = SVM_classifier.predict(X_test)

print(y_predicted[slice(10)])
print(y_test[slice(10)])

In [None]:
## the entries of the confusion matrix are:
## C[0,0] true negatives 
## C[1,0] false negatives  
## C[1,1] true positives
## C[0,1] false positives

## note that this is slightly different than the 
## confusion matrix on the wikipedia page!

C = metrics.confusion_matrix(y_test,y_predicted)

C

In [None]:
## Generally, for Machine Learning techniques, we want to output:
## Accuracy -- what fraction of the time is the classifier correct
print("Model Accuracy:",metrics.accuracy_score(y_test, y_predicted))

## Precision -- fraction of true positives divided by the true positives and false positives 
print("Precision:",metrics.precision_score(y_test, y_predicted))

## Recall -- fraction of true positives divided by the true positives and false negatives 
print("Recall:",metrics.recall_score(y_test, y_predicted))

In [None]:
##EXERCISE EMOTION

exercise_list = []

for e in labels:
    if e == "exercise":
        exercise_list.append(1)
    else:
        exercise_list.append(0)
        
X = text_vectors
Y = exercise_list

# split the data into a 70% for training
# and 30 % for testing... using a specified random_state so that 
# the random split is the "same" everytime we run the cell
# we typically want to specifiy the random_state when we are writing code
# and debugging, otherwise changes in the output may be due to the random
# split of testing/training data rather than an error in the code
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2) # 70% training and 30% test

#Generate the SVM classifier
SVM_classifier = svm.SVC(kernel='linear') # Linear Kernel

#Now train the SVM on the training data from the data set using the .fit function
SVM_classifier.fit(X_train, y_train)


y_predicted = SVM_classifier.predict(X_test)

print(y_predicted[slice(10)])
print(y_test[slice(10)])

C = metrics.confusion_matrix(y_test,y_predicted)
print(C)


## Accuracy -- what fraction of the time is the classifier correct
print("Model Accuracy:",metrics.accuracy_score(y_test, y_predicted))
## Precision -- fraction of true positives divided by the true positives and false positives 
print("Precision:",metrics.precision_score(y_test, y_predicted))
## Recall -- fraction of true positives divided by the true positives and false negatives 
print("Recall:",metrics.recall_score(y_test, y_predicted))

## Now try it on the non-classified sentences:
new_vec_predict_SVM2 = SVM_classifier.predict(new_vectors)
print("Here: ",new_vec_predict_SVM2)

70/30
[[3607   17]
 [   9   47]]
Model Accuracy: 0.9929347826086956
Precision: 0.734375
Recall: 0.8392857142857143

80/20
[[2410    4]
 [   6   33]]
Model Accuracy: 0.9959233591520587
Precision: 0.8918918918918919
Recall: 0.8461538461538461

- 80/20 split gives slightly higher accuracy, higher precision, lower recall (usually)

In [None]:
##BONDING EMOTION

bonding_list = []

for e in labels:
    if e == "bonding":
        bonding_list.append(1)
    else:
        bonding_list.append(0)
        
X = text_vectors
Y = bonding_list

# split the data into a 70% for training
# and 30 % for testing... using a specified random_state so that 
# the random split is the "same" everytime we run the cell
# we typically want to specifiy the random_state when we are writing code
# and debugging, otherwise changes in the output may be due to the random
# split of testing/training data rather than an error in the code
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3) # 70% training and 30% test
##Can also play with the test/train split

#Generate the SVM classifier
SVM_classifier = svm.SVC(kernel='linear') # Linear Kernel

#Now train the SVM on the training data from the data set using the .fit function
SVM_classifier.fit(X_train, y_train)


y_predicted = SVM_classifier.predict(X_test)

print(y_predicted[slice(10)])
print(y_test[slice(10)])

C = metrics.confusion_matrix(y_test,y_predicted)
print(C)


## Accuracy -- what fraction of the time is the classifier correct
print("Model Accuracy:",metrics.accuracy_score(y_test, y_predicted))
## Precision -- fraction of true positives divided by the true positives and false positives 
print("Precision:",metrics.precision_score(y_test, y_predicted))
## Recall -- fraction of true positives divided by the true positives and false negatives 
print("Recall:",metrics.recall_score(y_test, y_predicted))

## Now try it on the non-classified sentences:
new_vec_predict_SVM = SVM_classifier.predict(new_vectors)
print("Here: ",new_vec_predict_SVM)

In [None]:
## Next steps is to figure out how to add new sentences that don't have emotions associated with them yet and see how well it does
## can also start prep for next week and do some neural net education via youtube

------------------------------------------SPLIT -----------------------------------------------

In [None]:
## Below is now all the Decision Tree fooling around

In [None]:
##The decision tree I created. IN the data slices below, there are ususally a couple incorrect in the first few lines
##This is fine though, because for a 99% accuracy it can still get 36 picks wrong

from sklearn import tree
dec_tree = tree.DecisionTreeClassifier()  #random_state = 109)

X = text_vectors
Y = bonding_list

##Split the data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3) # 70% training and 30% test

#print(X_test[slice(5)])

clf = dec_tree.fit(X_train, y_train)
predict = clf.predict(X_test)


print(predict[slice(20)])
print(y_test[slice(20)])

C = metrics.confusion_matrix(y_test,predict)
print(C)

## Accuracy -- what fraction of the time is the classifier correct
print("Model Accuracy:",metrics.accuracy_score(y_test, predict))
## Precision -- fraction of true positives divided by the true positives and false positives 
print("Precision:",metrics.precision_score(y_test, predict))
## Recall -- fraction of true positives divided by the true positives and false negatives 
print("Recall:",metrics.recall_score(y_test, predict))

## Now try it on the non-classified sentences:
new_vec_predict = clf.predict(new_vectors)
print("Here: ",new_vec_predict)

If code above is right and makes a decision tree - higher accuracy than SVM's, but lower precision and recall. However, both precision and recall vary WIDELY based on the tree -> have had values between 60% and 85%

In [None]:
# ##This is my code for the tree with the data from Lindsey, just to check that it was working. Looks good.

# dec_tree = tree.DecisionTreeClassifier()  #random_state = 109)

# X = [[1,1],[1,2],[1,7],[2,2],[2,4],[2,5],[3,2],[3,4],[3,6],[4,4],[4,6],[4,7],[5,7],[4,1],[5,2],[5,3],[6,2],[6,4],[7,1],[7,3],[7,6],[8,2],[8,5],[8,6]]
# Y = [0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1]

# ##Split the data
# X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3) # 70% training and 30% test

# clf = dec_tree.fit(X_train, y_train)
# predict = clf.predict(X_test)


# print(predict[slice(10)])
# print(y_test[slice(10)])

# C = metrics.confusion_matrix(y_test,predict)
# print(C)

# ## Accuracy -- what fraction of the time is the classifier correct
# print("Model Accuracy:",metrics.accuracy_score(y_test, predict))
# ## Precision -- fraction of true positives divided by the true positives and false positives 
# print("Precision:",metrics.precision_score(y_test, predict))
# ## Recall -- fraction of true positives divided by the true positives and false negatives 
# print("Recall:",metrics.recall_score(y_test, predict))