In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import re
from collections import Counter

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import spacy
import en_core_web_sm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from scipy.cluster.hierarchy import ward, dendrogram


plt.style.use('seaborn-colorblind')

# Setup Pandas
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_colwidth', 100)


In [2]:
training_set  = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")\
            .rename(columns={'essay_set': 'topic', 'domain1_score': 'target_score', 'domain2_score': 'topic2_target'})
training_set.head()

Unnamed: 0,essay_id,topic,essay,rater1_domain1,rater2_domain1,rater3_domain1,target_score,rater1_domain2,rater2_domain2,topic2_target,rater1_trait1,rater1_trait2,rater1_trait3,rater1_trait4,rater1_trait5,rater1_trait6,rater2_trait1,rater2_trait2,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computers have on people are great learning skills/affects...",4,4,,8,,,,,,,,,,,,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using computers will benefit us in many ways like talking and...",5,4,,9,,,,,,,,,,,,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more people use computers, but not everyone agrees that this...",4,3,,7,,,,,,,,,,,,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that many experts say that computers do not benifit ou...",5,5,,10,,,,,,,,,,,,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a positive effect on people. The computers connect ...",4,4,,8,,,,,,,,,,,,,,,,,,,,,


In [3]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12976 entries, 0 to 12975
Data columns (total 28 columns):
essay_id          12976 non-null int64
topic             12976 non-null int64
essay             12976 non-null object
rater1_domain1    12976 non-null int64
rater2_domain1    12976 non-null int64
rater3_domain1    128 non-null float64
target_score      12976 non-null int64
rater1_domain2    1800 non-null float64
rater2_domain2    1800 non-null float64
topic2_target     1800 non-null float64
rater1_trait1     2292 non-null float64
rater1_trait2     2292 non-null float64
rater1_trait3     2292 non-null float64
rater1_trait4     2292 non-null float64
rater1_trait5     723 non-null float64
rater1_trait6     723 non-null float64
rater2_trait1     2292 non-null float64
rater2_trait2     2292 non-null float64
rater2_trait3     2292 non-null float64
rater2_trait4     2292 non-null float64
rater2_trait5     723 non-null float64
rater2_trait6     723 non-null float64
rater3_trait1     128

In [4]:
training_set.groupby('topic').agg('count').plot.bar(y='essay', rot=0)
plt.title('Essay count by topic #')
plt.ylabel('Count')


<IPython.core.display.Javascript object>

Text(0,0.5,'Count')

In [5]:
# Count characters and words for each essay
training_set['char_len'] = training_set['essay'].str.len()
training_set['word_count'] = training_set['essay'].str.strip().str.split('[\W_]+').str.len()

In [6]:
training_set.hist(column='char_len', by='topic', bins=25, sharey=True, sharex=True, layout=(2, 4), figsize=(7,4), rot=0) 
plt.suptitle('Essay character count by topic #')
plt.xlabel('Number of characters')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [7]:
training_set.hist(column='word_count', by='topic', bins=25, sharey=True, sharex=True, layout=(2, 4), figsize=(7,4), rot=0) 
plt.suptitle('Word count by topic #')
plt.xlabel('Number of words')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [8]:
training_set.groupby(['topic'])['target_score'].agg(['min','max','count','nunique'])

Unnamed: 0_level_0,min,max,count,nunique
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,12,1783,11
2,1,6,1800,6
3,0,3,1726,4
4,0,3,1770,4
5,0,4,1805,5
6,0,4,1800,5
7,2,24,1569,23
8,10,60,723,34


In [9]:
topic_number = 0
fig, ax = plt.subplots(4,2, figsize=(7,10))
for i in range(4):
    for j in range(2):
        topic_number += 1
        sns.violinplot(x='target_score', y='char_len', data=training_set[training_set['topic'] == topic_number], ax=ax[i,j])
        ax[i,j].set_title('Topic %i' % topic_number)
ax[3,0].locator_params(nbins=10)
ax[3,1].locator_params(nbins=10)
plt.suptitle('Character count by score')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [10]:
topic_number = 0
fig, ax = plt.subplots(4,2, figsize=(7,10))
for i in range(4):
    for j in range(2):
        topic_number += 1
        sns.violinplot(x='target_score', y='word_count', data=training_set[training_set['topic'] == topic_number], ax=ax[i,j])
        ax[i,j].set_title('Topic %i' % topic_number)
ax[3,0].locator_params(nbins=10)
ax[3,1].locator_params(nbins=10)
plt.suptitle('Word count by score')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [11]:
topic_number = 0
fig, ax = plt.subplots(4,2, figsize=(7,7), sharey=False)
for i in range(4):
    for j in range(2):
        topic_number += 1
        training_set[training_set['topic'] == topic_number]\
            .groupby('target_score')['essay_id']\
            .agg('count')\
            .plot.bar(ax=ax[i, j], rot=0)
        ax[i,j].set_title('Topic %i' % topic_number)
ax[3,0].locator_params(nbins=10)
ax[3,1].locator_params(nbins=10)
plt.suptitle('Histograms of essay scores')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [32]:
# Runtime approximately 1-2 hours

# # Load English tokenizer, tagger, parser, NER and word vectors
# nlp = spacy.load('en_core_web_sm')

# tokens = []
# lemma = []
# pos = []

# for essay in nlp.pipe(training_set['essay'].astype('unicode').values, batch_size=50,
#                         n_threads=3):
#     if essay.is_parsed:
#         tokens.append([n.text for n in essay])
#         lemma.append([n.lemma_ for n in essay])
#         pos.append([n.pos_ for n in essay])
#     else:
#         # We want to make sure that the lists of parsed results have the
#         # same number of entries of the original Dataframe, so add some blanks in case the parse fails
#         tokens.append(None)
#         lemma.append(None)
#         pos.append(None)

# training_set['tokens'] = tokens
# training_set['lemma'] = lemma
# training_set['pos'] = pos

# training_set.to_pickle('training_set.pkl')

In [12]:
training_set = pd.read_pickle('training_set.pkl')
training_set.iloc[0]

essay_id                                                                                                            1
topic                                                                                                               1
essay             Dear local newspaper, I think effects computers have on people are great learning skills/affects...
rater1_domain1                                                                                                      4
rater2_domain1                                                                                                      4
rater3_domain1                                                                                                    NaN
target_score                                                                                                        8
rater1_domain2                                                                                                    NaN
rater2_domain2                                          

In [13]:
# load nltk's SnowballStemmer as variabled 'stemmer'
import re
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
tokenize_and_stem(training_set.iloc[0]['essay'])

['dear',
 'local',
 'newspap',
 'i',
 'think',
 'effect',
 'comput',
 'have',
 'on',
 'peopl',
 'are',
 'great',
 'learn',
 'skills/affect',
 'becaus',
 'they',
 'give',
 'us',
 'time',
 'to',
 'chat',
 'with',
 'friends/new',
 'peopl',
 'help',
 'us',
 'learn',
 'about',
 'the',
 'globe',
 'astronomi',
 'and',
 'keep',
 'us',
 'out',
 'of',
 'trobl',
 'thing',
 'about',
 'dont',
 'you',
 'think',
 'so',
 'how',
 'would',
 'you',
 'feel',
 'if',
 'your',
 'teenag',
 'is',
 'alway',
 'on',
 'the',
 'phone',
 'with',
 'friend',
 'do',
 'you',
 'ever',
 'time',
 'to',
 'chat',
 'with',
 'your',
 'friend',
 'or',
 'buis',
 'partner',
 'about',
 'thing',
 'well',
 'now',
 'there',
 "'s",
 'a',
 'new',
 'way',
 'to',
 'chat',
 'the',
 'comput',
 'their',
 'plenti',
 'of',
 'site',
 'on',
 'the',
 'internet',
 'to',
 'do',
 'so',
 'organization1',
 'organization2',
 'caps1',
 'facebook',
 'myspac',
 'ect',
 'just',
 'think',
 'now',
 'while',
 'your',
 'set',
 'up',
 'meet',
 'with',
 'your',

In [14]:
essay1 = training_set[(training_set['topic'] == 2) & (training_set['target_score'] == 1)]['lemma'].astype('unicode').sum()[:400000]
essay2 = training_set[(training_set['topic'] == 2) & (training_set['target_score'] == 2)]['lemma'].astype('unicode').sum()[:400000]
essay3 = training_set[(training_set['topic'] == 2) & (training_set['target_score'] == 3)]['lemma'].astype('unicode').sum()[:400000]
essay4 = training_set[(training_set['topic'] == 2) & (training_set['target_score'] == 4)]['lemma'].astype('unicode').sum()[:400000]
essay5 = training_set[(training_set['topic'] == 2) & (training_set['target_score'] == 5)]['lemma'].astype('unicode').sum()[:400000]
essay6 = training_set[(training_set['topic'] == 2) & (training_set['target_score'] == 6)]['lemma'].astype('unicode').sum()[:400000]

In [15]:
essays = [essay1, essay2, essay3, essay4, essay5, essay6]

In [16]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(essays) #fit the vectorizer to essays

print(tfidf_matrix.shape)

(6, 12169)


In [17]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [18]:
kmeans = KMeans(n_clusters=6)

kmeans.fit(tfidf_matrix)

clusters = kmeans.labels_.tolist()

In [19]:
linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
labels = [0, 1, 2, 3, 4, 5]

fig, ax = plt.subplots(figsize=(6, 6))
ax = dendrogram(linkage_matrix, truncate_mode='level', p=0, show_contracted=False, orientation="right", labels=clusters);

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout() #show plot with tight layout
plt.show()

  return linkage(y, method='ward', metric='euclidean')


<IPython.core.display.Javascript object>

In [20]:
# principal component analysis

pca = PCA(n_components=2)

components = pca.fit_transform(dist)

fig, ax = plt.subplots(figsize=(5,5))

g = ax.scatter(components[:,0], components[:,1], label=labels, c=labels, s=100, cmap=plt.cm.get_cmap('tab10', 6))
ax.set_title('Principal Component Analysis')
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')

# label points
for i, txt in enumerate(labels):
    ax.annotate(txt, (components[i,0],components[i,1]), xytext=(5, 5), textcoords='offset points',)

# This function formatter will replace integers with target names
formatter = plt.FuncFormatter(lambda val, loc: labels[val])
plt.colorbar(g, ax=ax, ticks=[0, 1, 2, 3, 4, 5], format=formatter);

# # Set the clim so that labels are centered on each block
g.set_clim(vmin=-0.5, vmax=5.5)

# plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [21]:
essaybag = training_set[training_set['topic'] == 2]['lemma'].astype('unicode').tolist()
labels = training_set[training_set['topic'] == 2]['target_score'].tolist()

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(essaybag) #fit the vectorizer to essays

print(tfidf_matrix.shape)

dist = 1 - cosine_similarity(tfidf_matrix)

# principal component analysis

pca = PCA(n_components=2)

components = pca.fit_transform(dist)



(1800, 146)


In [30]:
label_set = np.array(list(set(labels)))

fig, ax = plt.subplots(figsize=(8,8))

g = ax.scatter(components[:,0], components[:,1], label=labels, c=labels, s=5, cmap=plt.cm.get_cmap('tab10', 6))
ax.set_title('Principal Component Analysis')
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')

# # label points
# for i, txt in enumerate(labels):
#     ax.annotate(txt, (components[i,0],components[i,1]), xytext=(5, 5), textcoords='offset points',)

# # This function formatter will replace integers with target names
formatter = plt.FuncFormatter(lambda val, loc: label_set[val])
plt.colorbar(g, ax=ax, ticks=[0, 1, 2, 3, 4, 5], format=formatter);

# # # Set the clim so that labels are centered on each block
g.set_clim(vmin=-0.5, vmax=5.5)

# plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

<IPython.core.display.Javascript object>

In [31]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_sm')

# Determine semantic similarities
doc1 = nlp(essay1)
doc2 = nlp(essay2)
similarity = doc1.similarity(doc2)
print('')
print('similarity:')
print(similarity)


similarity:
0.9997738567924009


In [45]:
training_set[(training_set['essay'].str.len() > 1000)\
             & (training_set['essay'].str.len() < 5050)\
             & (training_set['topic'] == 2)\
             & (training_set['target_score'] == 6)]\
            ['essay']

2115    Cooling the @CAPS1: @CAPS2 and @CAPS3 From Book @PERSON1 _________     Censorship in public libr...
2233    First comes censorship along with its totalitarian prowess, second comes the downfall of humanit...
2866    What if someone told you that you couldn't wear a yellow shirt because it was offensive? What if...
2913    Dear @ORGANIZATION1,     @CAPS1 I was growing up in my small little neighborhood, life could not...
3160    The common phrase 'knowledge is power' accurately paints the situation at hand. Through censorsh...
3437    @CAPS1 happened to using books and films as simple entertainment? Why are we now cautious of @CA...
Name: essay, dtype: object

In [225]:
sample1 = training_set.loc[2115, 'essay'] 
sample2 = training_set.loc[2913, 'essay'] 

In [229]:
# Tokenize essay
tokens = word_tokenize(sample1)

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]

# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
stop_words = set(stopwords.words('english'))
no_stops = [t for t in alpha_only if t not in stop_words]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]
# Create the bag-of-words: bow
bow = Counter(lemmatized)
# Print the 10 most common tokens
print(bow.most_common(10))

[('book', 16), ('library', 14), ('knowledge', 6), ('must', 6), ('censorship', 5), ('find', 5), ('one', 4), ('point', 4), ('value', 4), ('would', 3)]


In [12]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rujjn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_sm')

# Process whole documents
text = (sample1)
doc = nlp(sample1)
print(doc)

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

# Determine semantic similarities
doc1 = nlp(sample1)
doc2 = nlp(sample2)
similarity = doc1.similarity(doc2)
print('')
print('similarity:')
print(similarity)

Cooling the @CAPS1: @CAPS2 and @CAPS3 From Book @PERSON1 _________     Censorship in public libraries would be an insult to the institution. A library, a citadel of knowledge, should not be bound by a society as to what knowledge it can convey. One point of a library, the destination of researchers, is to offer information that must be sought: for those who cannot find the information in their homes without fear, for those who cannot ask their questions without retribution, the library stands as a pillar of safety. Books are incapable of judgment.     To censor the books held in a library is to chip away at human progress. A freedom of thought and a freedom of press, present within our nation and increasing in presence around the @CAPS5, precludes the idea of this censorship.     If any book is subject to criticism or opposition, are not all books worthy of the same? To value one opinion or perspective less than another is to debase the entire culture from which libraries have arisen. 

In [224]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    
from spacy.tokens import Span
doc = nlp(u'FB is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

San Francisco 0 13 GPE
FB 0 2 ORG


In [72]:
from keras.models import Sequential