In [1]:
import numpy as np
import pandas as pd
import sklearn
import xlrd
import matplotlib.pylab as py
import nltk as nltk
import textblob as textblob
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
#importing the excell file. make sure it is uploaded in the same folder as the notebook. 

PrimaryEmotion = pd.read_csv('emotion.xls.csv')
PrimaryEmotion

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,emotion,emotion:confidence,emotion_gold,id,idiom_id,sentence
0,731671736,False,finalized,5,6/8/15 16:10,Neutral,0.3333,,1,1,How much of the forecast was genuine and how m...
1,731671737,False,finalized,5,6/12/15 14:33,Neutral,0.3885,,2,2,I did touch them one time you see but of cours...
2,731671738,False,finalized,5,6/8/15 16:10,Neutral,0.3333,,3,3,We find that choice theorists admit that they ...
3,731671739,False,finalized,5,6/8/15 16:30,Neutral,0.3690,,4,4,"Well, here I am with an olive branch."
4,731671740,False,finalized,5,6/8/15 16:30,Neutral,0.5572,,5,5,"Its rudder and fin were both knocked out, and ..."
5,731671741,False,finalized,5,6/12/15 14:27,Anger,0.6250,,6,6,Over my dead body are you arresting him.
6,731671742,False,finalized,5,6/8/15 16:30,Neutral,0.3358,,7,7,"Don't let him pick a fight now, we're almost h..."
7,731671743,False,finalized,5,6/8/15 15:47,Neutral,0.3571,,8,8,He's a good fighter but there are certain poin...
8,731671744,False,finalized,5,6/12/15 14:30,Optimism,0.6115,,10,10,Aggie finally found the cat a good home and fr...
9,731671745,False,finalized,5,6/12/15 14:35,Neutral,0.2083,,12,12,She left the citizens of Riverbank to their sh...


In [5]:
#subsetting the data, we now have a dataframe of just the emotion and the sentences. 

ProjectData = PrimaryEmotion[['emotion','sentence']]
ProjectData.head()

Unnamed: 0,emotion,sentence
0,Neutral,How much of the forecast was genuine and how m...
1,Neutral,I did touch them one time you see but of cours...
2,Neutral,We find that choice theorists admit that they ...
3,Neutral,"Well, here I am with an olive branch."
4,Neutral,"Its rudder and fin were both knocked out, and ..."


In [6]:
ProjectData.emotion.unique()

array(['Neutral', 'Anger', 'Optimism', 'Disgust', 'Sadness',
       'Anticipation', 'Aggression', 'Submission', 'Love', 'Surprise',
       'Contempt', 'Disapproval', 'Remorse', 'Ambiguous', 'Fear', 'Joy',
       'Awe', 'Trust'], dtype=object)

In [7]:
# In this first step I am changing all the words to lower case. Haven't figured out this error later
ProjectData['sentence'] = ProjectData['sentence'].apply(lambda x: " ".join(x.lower() for x in x.split()))
ProjectData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from sage.repl.ipython_kernel.kernel import SageKernel


Unnamed: 0,emotion,sentence
0,Neutral,how much of the forecast was genuine and how m...
1,Neutral,i did touch them one time you see but of cours...
2,Neutral,we find that choice theorists admit that they ...
3,Neutral,"well, here i am with an olive branch."
4,Neutral,"its rudder and fin were both knocked out, and ..."


In [8]:
# In this line I am removing punctuation.
ProjectData['sentence'] = ProjectData['sentence'].str.replace('[^\w\s]','')

ProjectData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from sage.repl.ipython_kernel.kernel import SageKernel


Unnamed: 0,emotion,sentence
0,Neutral,how much of the forecast was genuine and how m...
1,Neutral,i did touch them one time you see but of cours...
2,Neutral,we find that choice theorists admit that they ...
3,Neutral,well here i am with an olive branch
4,Neutral,its rudder and fin were both knocked out and a...


In [9]:
# Now I remove stop words.
# Note that I used a package that has predefined "stopwords" you can google the package "nltk stopwords list" to see which words are in the list. This is something we could edit to whatever we want. 

from nltk.corpus import stopwords

stop = stopwords.words('english')

ProjectData['sentence'] = ProjectData['sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

ProjectData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,emotion,sentence
0,Neutral,much forecast genuine much fixed moot point
1,Neutral,touch one time see course nothing wanted
2,Neutral,find choice theorists admit introduce style mo...
3,Neutral,well olive branch
4,Neutral,rudder fin knocked fourfootlong gash shell mea...


In [10]:
# As we know, several of our sentences contain some numeric values, which doesn't work well with what we are trying to do. So I'm going to change them to their character equilivant. For example, "1975" will become *ninteen seventy five".
# First I am creating a new column which counts how many elements in a sentence are numeric. 

#ProjectData['numerics'] = ProjectData['sentence'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
#ProjectData[['emotion','sentence','numerics']].head()

In [11]:
# This block of code creates a subset of the data which includes each row with a numeric element. As you can see there are over 90 rows. 
#numerics = (ProjectData.loc[ProjectData['numerics'] > 0])
#numerics

In [12]:
#ProjectData['sentence'].replace(to_replace = '1978', value = 'ninteen seventy eight', )

In [13]:
# This block finds the lemma of each word. 
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

ProjectData['sentence'] = ProjectData['sentence'].apply(lemmatizer.lemmatize)

ProjectData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,emotion,sentence
0,Neutral,much forecast genuine much fixed moot point
1,Neutral,touch one time see course nothing wanted
2,Neutral,find choice theorists admit introduce style mo...
3,Neutral,well olive branch
4,Neutral,rudder fin knocked fourfootlong gash shell mea...


In [14]:
# Here we tokenize the data "ProjectData"
# Not yet sure how to fix the error...
ProjectData['sentence'] = ProjectData['sentence'].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  IPKernelApp.launch_instance(kernel_class=SageKernel)


In [15]:
# What our dataframe currently looks like. 
ProjectData.head()

Unnamed: 0,emotion,sentence
0,Neutral,"[much, forecast, genuine, much, fixed, moot, p..."
1,Neutral,"[touch, one, time, see, course, nothing, wanted]"
2,Neutral,"[find, choice, theorists, admit, introduce, st..."
3,Neutral,"[well, olive, branch]"
4,Neutral,"[rudder, fin, knocked, fourfootlong, gash, she..."


In [43]:
Optimism1 = (ProjectData.loc[ProjectData['emotion'] == 'Optimism'])
Optimism1.head()

Unnamed: 0,emotion,sentence
8,Optimism,"[aggie, finally, found, cat, good, home, day, ..."
10,Optimism,"[must, sink, differences]"
16,Optimism,"[think, need, meeting, get, laid, need, meetin..."
21,Optimism,"[er, weve, got, friends, situation, let, bygon..."
34,Optimism,"[uk, good, books, european, space, agency]"


In [45]:
Optimism2 = ProjectData.loc[['Optimism', 'Anticipation', 'Joy'], 'emotion'])
Optimism2.head()

SyntaxError: invalid syntax (<ipython-input-45-b4178e176cb5>, line 1)

In [18]:
#OptimismWords = Optimism1['sentence']

In [19]:
#Optimism1.emotion.unique()

In [20]:
# Start with one review:
#text = "preface. Changes for seventh edition. in the preparation of this seventh edition, our goal has remained steadfast: to produce an outstanding text in mathematical statistics. in this new edition, we have added examples and exercises to help clarify the exposition. for the smae reason, we have moved some material forward. for example, we moved the discussion on some properties of linear combinations of random variables from chapter 4 to chapter 2."

# Create and generate a word cloud image:
#wordcloud = WordCloud().generate(text)
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()

In [21]:

#Optimism1 = (ProjectData.loc[ProjectData['emotion'] == 'Optimism'])
#Optimism1.head()

In [22]:
#ProjectData.loc[8,:]
#Optimism1 = (ProjectData.loc[ProjectData['emotion'] == 'Optimism'])
#Optimism1.head()
#WordCloud.generate(str(Optimism1['sentence']))

#text = Optimism1['sentence']

# Create and generate a word cloud image:
#wordcloud = WordCloud.generate(text)

# Display the generated image:
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()