# Analyzing 1968 Newspaper text
This code was written to analyze data from the Ivy League student newspapers about the Columbia University student protests from April 22 - May 1, 1968. 

It finds the most common words, both overall and for each newspaper, runs sentiment analyses, and generally makes the data more manageable.


<strong><i>Before starting, run the source code below. Then, run any/all functions.</i></strong>

## Glossary:
<br>

<li><strong>allstories</strong> shows a dataframe of all stories and their calculated polarities and subjectivities based on TextBlob</li>

<li><strong>news_subjectivity</strong> shows mean subjectivity of texts grouped by newspaper</li>

<li><strong>news_polarity</strong> shows mean polarity of texts grouped by newspaper</li>

<li><strong>news_wordcounts</strong> shows the sum of the "Word Count" column for each newspaper</li>

<li><strong>words_bydate</strong> shows number of words, based on "Word Count" column, written each day</li>

<li><strong>articles_bydate</strong> shows number of articles, grouped by newspaper, written each day</li>

<li><strong>frontpage_count</strong> shows how many of a newspaper's stories appeared on the front page</li>

<li><strong>notes</strong> shows each distinct value of the "Notes" column and the number of articles that carry it</li>

<li><strong>article_numbers</strong> shows the number of articles from each newspaper</li>

<li><strong>authors</strong> shows the names, newspapers, and article frequency of writers</li>

<li><strong>common_words()</strong> shows the 50 most common words among all stories, after removing English stopwords</li>

<li><strong>least_common_words()</strong> shows the 100 least common words among all stories, after removing English stopwords</li>

<li><strong>common_words_in("schoolname")</strong> finds the most common words in one school's newspaper</li>

<li><strong>top_distinct_words()</strong> shows words in each newspaper's top 50 most frequent words which don't overlap with the top 50 words of any other newspaper</li>

In [None]:
import pandas as pd
from pandas import DataFrame
import nltk
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import csv
import unicodedata
import argparse
from collections import Counter
import string

#Read the article data (one row per story) with pandas, decoding it as Latin-1
allstories = pd.read_csv('article-data.csv', encoding='latin1')

#Create list of basic polarity and subjectivity measures using TextBlob.
#Each entry is [text, polarity, subjectivity], in the same order as the
#rows of allstories.
story_sentiments = []

#Iterate through each row to determine polarity and subjectivity
#of content in the "Text" column
for row in allstories['Text']:
    blob = TextBlob(row)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    story_sentiments.append([row, polarity, subjectivity])

#Create dataframe from list. Passing the column names up front also works
#when the list is empty (assigning three names to a column-less frame raises).
sentiments = DataFrame(story_sentiments,
                       columns=['Text', 'Polarity', 'Subjectivity'])

#Attach the scores to the original rows by position. The scores were computed
#row by row from allstories, so no join is needed; merging on the full article
#text would pair every row with every other row that shares the same text and
#change the row order.
allstories['Polarity'] = sentiments['Polarity'].to_numpy()
allstories['Subjectivity'] = sentiments['Subjectivity'].to_numpy()

#Drop rows that are exact duplicates across every column
allstories = allstories.drop_duplicates()

#Per-newspaper grouping shared by several of the summaries below
by_newspaper = allstories.groupby("Newspaper")

#news_subjectivity: mean "Subjectivity" score of each newspaper's stories
news_subjectivity = by_newspaper["Subjectivity"].mean()

#news_polarity: mean "Polarity" score of each newspaper's stories
news_polarity = by_newspaper["Polarity"].mean()

#news_wordcounts: total of the "Word Count" column for each newspaper
news_wordcounts = by_newspaper["Word Count"].sum()

#words_bydate: total of the "Word Count" column for each date
words_bydate = allstories.groupby("Date")["Word Count"].sum()

#articles_bydate: number of stories for each (newspaper, date) pair
articles_bydate = allstories.groupby(["Newspaper", "Date"]).size()

#frontpage_count: number of stories for each (newspaper, "Front page YN") pair
frontpage_count = allstories.groupby(["Newspaper","Front page YN"]).size()

#notes: number of stories carrying each distinct value of "Notes"
notes = allstories.groupby(["Notes"]).size()

#article_numbers: number of stories from each newspaper
article_numbers = by_newspaper.size()

#authors: number of stories for each (author, newspaper) pair
authors = allstories.groupby(["Author","Newspaper"]).size()

#common_words() shows the most common words among all stories
#Code modified from user MaxU (StackOverflow)

def common_words(top: int = 50) -> pd.DataFrame:
    """Return the most frequent non-stopword words across all stories.

    Reads the "Text" column of the module-level ``allstories`` dataframe,
    lowercases it, replaces '|' with a space, removes NLTK English stopwords
    and punctuation, and counts the remaining whitespace-separated words.

    Args:
        top: How many words to return. Defaults to 50.

    Returns:
        A dataframe with columns "Word" and "Frequency", most frequent first.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    # One alternation of every stopword, matched only as a whole word.
    stopword_pattern = r'\b(?:{})\b'.format('|'.join(stopwords))

    words = (
        allstories.Text
        .str.lower()
        # '|' --> ' ', then blank out stopwords
        .replace([r'\|', stopword_pattern], [' ', ''], regex=True)
        # regex=True must be explicit: since pandas 2.0, str.replace treats
        # the pattern as a literal string by default, which would leave all
        # punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.cat(sep=' ')
        .split()
    )

    # generate DF out of Counter
    return pd.DataFrame(Counter(words).most_common(top),
                        columns=['Word', 'Frequency'])

#least_common_words() shows the least common words among all stories
def least_common_words(bottom: int = 100) -> pd.DataFrame:
    """Return the least frequent non-stopword words across all stories.

    Reads the "Text" column of the module-level ``allstories`` dataframe,
    lowercases it, replaces '|' with a space, removes NLTK English stopwords
    and punctuation, and counts the remaining whitespace-separated words.

    Args:
        bottom: How many of the rarest words to return. Defaults to 100.
            A value of zero or less returns an empty dataframe.

    Returns:
        A dataframe with columns "Word" and "Frequency". Rows are the tail of
        the frequency ranking, so they run from more frequent to less
        frequent.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    # One alternation of every stopword, matched only as a whole word.
    stopword_pattern = r'\b(?:{})\b'.format('|'.join(stopwords))

    words = (
        allstories.Text
        .str.lower()
        # '|' --> ' ', then blank out stopwords
        .replace([r'\|', stopword_pattern], [' ', ''], regex=True)
        # regex=True must be explicit: since pandas 2.0, str.replace treats
        # the pattern as a literal string by default, which would leave all
        # punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.cat(sep=' ')
        .split()
    )

    ranked = Counter(words).most_common()
    # A plain [-0:] slice would return the whole ranking, so guard it.
    rarest = ranked[-bottom:] if bottom > 0 else []

    # generate DF out of the tail of the ranking
    return pd.DataFrame(rarest, columns=['Word', 'Frequency'])

#common_words_in("schoolname") find most common words from one school's newspaper
def common_words_in(schoolname, top: int = 50) -> pd.DataFrame:
    """Return the most frequent non-stopword words in one school's newspaper.

    Selects the rows of the module-level ``allstories`` dataframe whose
    "Newspaper" value contains ``schoolname``, then lowercases their "Text",
    replaces '|' with a space, removes NLTK English stopwords and punctuation,
    and counts the remaining whitespace-separated words.

    This function only reads ``allstories``. It does not fill missing values
    in the shared dataframe: rows with a missing "Newspaper" are excluded by
    ``na=False``, and missing "Text" values are skipped when the texts are
    joined.

    Args:
        schoolname: Text to look for in the "Newspaper" column. It is
            converted with ``str()`` and matched by ``Series.str.contains``
            with that method's default settings.
        top: How many words to return. Defaults to 50.

    Returns:
        A dataframe with columns "Word" and "Frequency", most frequent first.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    # One alternation of every stopword, matched only as a whole word.
    stopword_pattern = r'\b(?:{})\b'.format('|'.join(stopwords))

    from_school = allstories['Newspaper'].str.contains(str(schoolname), na=False)

    words = (
        allstories.loc[from_school, 'Text']
        .str.lower()
        # '|' --> ' ', then blank out stopwords
        .replace([r'\|', stopword_pattern], [' ', ''], regex=True)
        # regex=True must be explicit: since pandas 2.0, str.replace treats
        # the pattern as a literal string by default, which would leave all
        # punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.cat(sep=' ')
        .split()
    )

    # generate DF out of Counter
    return pd.DataFrame(Counter(words).most_common(top),
                        columns=['Word', 'Frequency'])

#top_distinct_words() shows the words among each newspaper's most frequent words
#that do not appear among the most frequent words of any other newspaper
def top_distinct_words():
    """Return each newspaper's top words that no other newspaper shares.

    Calls ``common_words_in`` once per newspaper, tags every resulting row
    with a "School" label, stacks the results, and drops every word that
    occurs for more than one newspaper.

    Returns:
        A dataframe with columns "Word", "Frequency" and "School", sorted by
        "School".
    """
    # (text passed to common_words_in, label stored in the "School" column)
    papers = (
        ("Harvard", 'Harvard'),
        ("Princeton", 'Princeton'),
        ("Cornell", 'Cornell'),
        ("Pennsylvanian", 'Penn'),
        ("Brown", 'Brown'),
        ("Yale", 'Yale'),
    )

    per_school = []
    for search_term, label in papers:
        school_words = common_words_in(search_term)
        school_words['School'] = label
        per_school.append(school_words)

    # stack all word frequencies into one frame
    stacked = pd.concat(per_school)

    # keep=False removes every copy of a word that shows up more than once,
    # leaving only words that belong to a single newspaper
    distinct = stacked.drop_duplicates(subset=['Word'], keep=False)

    # order the result by school
    return distinct.sort_values(by="School")