In [1]:
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk import FreqDist
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer
import altair as alt
from vega_datasets import data

[nltk_data] Downloading package punkt to /Users/fafnir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fafnir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
humanist_vols = pd.read_csv("web_scraped_humanist_listserv.csv")
humanist_vols.head()

Unnamed: 0,dates,text
0,1987-1988,From: MCCARTY@UTOREPAS\nSubject: \nDate: 12 Ma...
1,1988-1989,From: Sebastian Rahtz \nSubject: C++ and Gnu o...
2,1989-1990,From: Willard McCarty \nSubject: Happy Birthda...
3,1990-1991,From: Elaine Brennan & Allen Renear \nSubject:...
4,1991-1992,From: Elaine Brennan & Allen Renear \nSubject:...


### Using the tfidf_df from our Introduction to Unstructured Data and Text Analysis Assignment, try and use Altair to create visualizations to answer the following questions:

* What are the top ten unique TFIDF terms across our entire corpus and what score do they each have?
* What are the top five TFIDF terms for each volume?


In [5]:
def stem_words(row):
    stemmed_words = ''
    for token in row.text.split(' '):
        stemmed_words += porter.stem(token) + ' '
    return stemmed_words

humanist_vols['year_start'] = humanist_vols['dates'].str.split('-').str[0]
humanist_vols['year_end'] = humanist_vols['dates'].str.split('-').str[1]
humanist_vols['volume_size'] = humanist_vols['text'].str.count('\n')
humanist_vols = humanist_vols.iloc[:10, :]
humanist_vols

humanist_vols['stemmed_text'] = humanist_vols.apply(stem_words, axis=1)

documents = humanist_vols.text.tolist()
vectorizer = TfidfVectorizer(max_df=.7, min_df=1, stop_words=stop_words)

transformed_documents = vectorizer.fit_transform(documents)
transformed_documents_as_array = transformed_documents.toarray()
dates = humanist_vols.dates.tolist()
tfidf_results = []
for counter, doc in enumerate(transformed_documents_as_array):
    # construct a dataframe
    tf_idf_tuples = list(zip(vectorizer.get_feature_names(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=[
                                              'term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['dates'] = dates[counter]
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False)
tfidf_df = tfidf_df[(tfidf_df['term'] != '2002') & (tfidf_df['term'] != '2004') & (
    tfidf_df['term'] != '2006') & (tfidf_df['term'] != '2007') & (tfidf_df['term'] != '2008')]
tfidf_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  humanist_vols['stemmed_text'] = humanist_vols.apply(stem_words, axis=1)


Unnamed: 0,term,score,dates
0,utorepas,0.750031,1987-1988
0,http,0.60029,1996-1997
0,http,0.573855,1995-1996
1,www,0.509069,1995-1996
1,1997,0.488162,1996-1997
2,www,0.451835,1996-1997
0,gopher,0.390306,1993-1994
2,1996,0.386933,1995-1996
0,www,0.37121,1994-1995
3,html,0.362353,1995-1996


In [6]:
a = tfidf_df[0:40].term.unique().tolist()
a_10 = a[:10]

In [7]:
a_10

['utorepas',
 'http',
 'www',
 '1997',
 'gopher',
 '1996',
 'html',
 'saddam',
 'uottawa',
 'astra']

In [8]:
top_term = tfidf_df[tfidf_df.term.isin(a_10)]

In [9]:
alt.Chart(top_term).mark_bar().encode(
    x='term',
    y='score',
    color=alt.Color('term', scale=alt.Scale(scheme='accent'), sort=alt.Sort()),
)


In [10]:
alt.Chart(top_term).mark_bar().encode(
    x='dates',
    y='score',
    color=alt.Color('term', scale=alt.Scale(scheme='accent'), sort=alt.Sort()),
)
