## NLP Data Analysis

In [311]:
import re
import pandas as pd
from collections import Counter
from nltk.stem.porter import PorterStemmer

In [286]:
def sentence_splitter(paragraph):
    sentences = re.split('(?<=\.|\!|\?)\"? ', paragraph)
    new_sentences = []
    for i,sentence in enumerate(sentences):
        if i == 0:
            new_sentences.append(sentence)
        elif sentence and sentence[0].islower():
            last = new_sentences.pop()
            new_sentences.append(last + sentence)
        elif i < len(sentences) - 1 and sentence and sentence[-1] == '.' and len(sentence.split(' ')[-1][0]) < 4 and sentence.split(' ')[-1][0].isupper():
            sentences[i+1] = sentence + sentences[i+1]
        elif sentence:
            new_sentences.append(sentence)
    return new_sentences

In [287]:
def tokenize(sentence):
    words = sentence.split(' ')
    tokens = []
    word_tokens = []
    for word in words:
        while word and word[-1] in ",:;'().?!":
            word = word.split(word[-1])[0]
        tokens.append(word)
    for token in tokens:
        if re.match('^[a-zA-Z0-9]+$', token):
            word_tokens.append(token)
    return word_tokens

In [356]:
filenames = ['normal', 'simple']
for filename in filenames:
    print(filename + ":")
    with open('assignment1_resources/' + filename + '.txt', 'r') as f:
        text = f.read() 
        periods = text.count(".")
        commas = text.count(",")
        question_marks = text.count("?")
        ex_points = text.count("!")
        colons = text.count(":")
        semicolons = text.count(";")

    articles = [x.split('</TITLE>\n')[1] for x in text.split('<TITLE>')[1:]]
    paragraphs = []
    sentences = []
    words = []
    for article in articles:
        pars = article.split('\n\n')[:-1]
        paragraphs = paragraphs + pars
        for p in pars:
            sentences.append(sentence_splitter(p))
    num_sentences = 0
    for s in sentences:
        num_sentences += len(s)
        for sent in s:
            words.append(tokenize(sent))
    num_words = 0
    for w in words:
        num_words += len(w)
    all_words = Counter([j for i in words for j in i])
    unique_words = set([j for i in words for j in i])
    unique_nocase_words = set([j.lower() for i in words for j in i])
    word_lengths = [len(j) for i in words for j in i]
    all_tokens = [j for i in words for j in i]
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i.lower()) for i in all_tokens]
    print('total paragraphs:', len(paragraphs))
    print('average paragraphs per article:', len(paragraphs)/len(articles))
    print('total sentences:', num_sentences)
    print('average sentences per article:', num_sentences/len(articles))
    print('total words:', num_words)
    print('average words per article:', num_words/len(articles))
    print('average words per sentence:', num_words/num_sentences)
    print('unique words:', len(unique_words))
    print('unique nocase words:', len(unique_nocase_words))
    print(all_words.most_common(10))
    print('average word length:', sum(word_lengths)/len(word_lengths))
    print('num words per period:', num_words/periods)
    print('num words per comma:', num_words/commas)
    print('num words per question mark:', num_words/question_marks)
    print('num words per exclamation point:', num_words/ex_points)
    print('num words per semicolon:', num_words/semicolons)
    print('num words per colon:', num_words/colons)
    print('unique stemmed tokens:', len(set(stemmed_tokens)), '\n')

normal:
total paragraphs: 18087
average paragraphs per article: 36.174
total sentences: 60031
average sentences per article: 120.062
total words: 1369461
average words per article: 2738.922
average words per sentence: 22.81256350885376
unique words: 66776
unique nocase words: 58283
[('the', 94338), ('of', 57221), ('and', 43697), ('in', 33617), ('to', 32292), ('a', 27603), ('is', 16796), ('as', 13239), ('was', 12806), ('The', 12301)]
average word length: 4.998769588911258
num words per period: 19.08204327894436
num words per comma: 14.920801464339412
num words per question mark: 6490.336492890995
num words per exclamation point: 8950.725490196079
num words per semicolon: 402.5458553791887
num words per colon: 546.0370813397129
unique stemmed tokens: 41727 

simple:
total paragraphs: 2189
average paragraphs per article: 4.378
total sentences: 6991
average sentences per article: 13.982
total words: 113287
average words per article: 226.574
average words per sentence: 16.204691746531253
un

In [357]:
articles[:5]

['Anarchism is the anti-authoritarian political belief that people can organize themselves without having a state or a government in power. Anarchists also believe that participation should never be forced by other people.\n\nAnarchism is "a cluster of doctrines and attitudes centered on the belief that government is both harmful and unnecessary." The term "anarchism" derives from the Greek αναρχία . \n\nIn the common language, the word anarchy is often used to describe chaos or anomie. However, anarchists do not promote this. Rather, they define "anarchy" as a way of relations between people. Once put into place these relations work on their own.\n\n Individual freedom, voluntary association and opposition to the state, are important beliefs of anarchism. There are also big differences between anarchist philosophies on things like whether violence can be used to bring anarchy; the best type of economy; the relationship between technology and hierarchy; the idea of equality; and the us

In [364]:
paragraphs[:3]

['Anarchism is the anti-authoritarian political belief that people can organize themselves without having a state or a government in power. Anarchists also believe that participation should never be forced by other people.',
 'Anarchism is "a cluster of doctrines and attitudes centered on the belief that government is both harmful and unnecessary." The term "anarchism" derives from the Greek αναρχία . ',
 'In the common language, the word anarchy is often used to describe chaos or anomie. However, anarchists do not promote this. Rather, they define "anarchy" as a way of relations between people. Once put into place these relations work on their own.']

In [365]:
articles[:1]

['Anarchism is the anti-authoritarian political belief that people can organize themselves without having a state or a government in power. Anarchists also believe that participation should never be forced by other people.\n\nAnarchism is "a cluster of doctrines and attitudes centered on the belief that government is both harmful and unnecessary." The term "anarchism" derives from the Greek αναρχία . \n\nIn the common language, the word anarchy is often used to describe chaos or anomie. However, anarchists do not promote this. Rather, they define "anarchy" as a way of relations between people. Once put into place these relations work on their own.\n\n Individual freedom, voluntary association and opposition to the state, are important beliefs of anarchism. There are also big differences between anarchist philosophies on things like whether violence can be used to bring anarchy; the best type of economy; the relationship between technology and hierarchy; the idea of equality; and the us

In [366]:
text[:100]

'<TITLE>Anarchism</TITLE>\nAnarchism is the anti-authoritarian political belief that people can organi'