# FNLP - Assignment 1

In [7]:
#!/usr/bin/python
# coding: utf-8

import nltk
import sys

# Import numpy as we will need it to calculate mean and standard deviation
import numpy as np

# Import the Presidential inaugural speeches, Brown and CONLL corpora
# conll2007 is not installed by default
nltk.data.path.append('/group/sgwater/data/nltk_data')
from nltk.corpus import inaugural, brown, conll2007

# directory with special twitter module
sys.path.extend(['/group/ltg/projects/fnlp', '/group/ltg/projects/fnlp/packages_2.6'])

# Import the Twitter corpus and LgramModel
from twitter import xtwc, LgramModel

# Stopword list
from nltk.corpus import stopwords

twitter_file_ids = xtwc.fileids()[11:13]

In [8]:
#################### SECTION A: COMPARING CORPORA ####################

##### Solution for question 1 #####

def get_corpus_tokens(corpus, list_of_files):
    '''Get the tokens from (part of) a corpus

    :type corpus: nltk.corpus.CorpusReader
    :param corpus: An NLTK corpus
    :type list_of_files: list(file)
    :param list_of_files: files to read from
    :rtype: list(str)
    :return: the tokenised contents of the files'''

    # Construct "corpus_tokens" (a list of all tokens in the corpus)
    corpus_tokens = [w.lower() for w in corpus.words(list_of_files)]

    # Return the list of corpus tokens
    return corpus_tokens

def q1(corpus, list_of_files):
    '''Compute the average word type length from (part of) a corpus

    :type corpus: nltk.corpus.CorpusReader
    :param corpus: An NLTK corpus
    :type list_of_files: list(str)
    :param list_of_files: names of files to read from
    :rtype: float
    :return: the average word type length over all the files'''

    # Get a list of all tokens in the corpus
    corpus_tokens = get_corpus_tokens(corpus, list_of_files)

    # Construct a list that contains the lengths for each word
    #  type in the document
    type_lengths = [len(w) for w in set(corpus_tokens)]  # already lowercase

    # Find the average word type length
    avg_type_length = np.mean(type_lengths)

    # Return the average word type length of the document
    return avg_type_length

##### Solution for question 2 #####

def q2():
    '''Question: Why might the average type length be greater for
       twitter data?

    :rtype: str
    :return: your answer'''

    return """
    The Twitter corpus contains words where a single letter is emphasized for emphasis (\"blahhhhhh...\" or \"jealoussssss...\"), URLs, repeated words (\"passeipasseipasseipassei...\"), and other \"words\" that are not used in normal speech. In contrast, the longest inaugural word is \"antiphilosophists\".
    """

In [9]:
# Get top x longest words
def get_longest_x(corpus, list_of_files, x):
    words = get_corpus_tokens(corpus, list_of_files)
    words.sort(key = len)
    return words[-x:]

x = 10
print get_longest_x(inaugural, inaugural.fileids(), x)
print get_longest_x(xtwc, twitter_file_ids, x)

[u'antiphilosophists', u'misrepresentation', u'contradistinction', u'misrepresentation', u'instrumentalities', u'instrumentalities', u'instrumentalities', u'instrumentalities', u'instrumentalities', u'instrumentalities']
[u'blahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh', u'http://g1.globo.com/noticias/planetabizarro/0,,mul1466801-6091,00-para+salvar+namoro+chinesa+quer+operar+e+virar+sosia+de+jessica+alba.html', u'jealousssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss', u'http://www.formaturismo.com.br/portal/default.asp?acta=7&ano_selecionadovc=2010&destinoid=54&anovc=2010&galeriaid=442&bt_ok.x=14&bt_ok.y=19', u'\u4f50\u5929\u3055\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u3042\u

In [10]:
### Question 1
print "*** Question 1 ***"
answer1a = q1(inaugural,inaugural.fileids())
print "Average token length for inaugural corpus: %.2f"%answer1a
answer1b = q1(xtwc,twitter_file_ids)
print "Average token length for twitter corpus: %.2f"%answer1b
### Question 2
print "*** Question 2 ***"
answer2 = q2()
print answer2

*** Question 1 ***
Average token length for inaugural corpus: 7.77
Average token length for twitter corpus: 11.25
*** Question 2 ***

    The Twitter corpus contains words where a single letter is emphasized for emphasis ("blahhhhhh..." or "jealoussssss..."), URLs, repeated words ("passeipasseipasseipassei..."), and other "words" that are not used in normal speech. In contrast, the longest inaugural word is "antiphilosophists".
    


In [11]:
#################### SECTION B: DATA IN THE REAL WORLD ####################

##### Solution for question 3 #####

def q3(corpus, list_of_files, x):
    '''Tabulate and plot the top x most frequently used word types
       and their counts from the specified files in the corpus

    :type corpus: nltk.corpus.CorpusReader
    :param corpus: An NLTK corpus
    :type list_of_files: list(str)
    :param list_of_files: names of files to read from
    :rtype: list(tuple(string,int))
    :return: top x word types and their counts from the files'''

    # Get a list of all tokens in the corpus
    corpus_tokens = get_corpus_tokens(corpus, list_of_files)

    # Construct a frequency distribution over the lowercased tokens in the document
    fd_doc_types = nltk.FreqDist(corpus_tokens)  # already lowercase

    # Find the top x most frequently used types in the document
    top_types = fd_doc_types.most_common(x)

    # Produce a plot showing the top x types and their frequencies
    fd_doc_types.plot(x)

    return top_types

##### Solution for question 4 #####

def q4(corpus_tokens):
    '''Clean a list of corpus tokens

    :type corpus_tokens: list(str)
    :param corpus_tokens: (lowercased) corpus tokens
    :rtype: list(str)
    :return: cleaned list of corpus tokens'''

    stops = list(stopwords.words("english"))

    # If token is alpha-numeric and NOT in the list of stopwords,
    #  add it to cleaned_tokens
    cleaned_corpus_tokens = [w for w in corpus_tokens if w.isalnum() and w not in stops]

    return cleaned_corpus_tokens

##### Solution for question 5 #####

def q5(cleaned_corpus_tokens, x):
    '''Tabulate and plot the top x most frequently used word types
       and their counts from the corpus tokens

    :type corpus_tokens: list(str)
    :param corpus_tokens: (cleaned) corpus tokens
    :rtype: list(tuple(string,int))
    :return: top x word types and their counts from the files'''

    # Construct a frequency distribution over the lowercased tokens in the document
    fd_doc_types = nltk.FreqDist(cleaned_corpus_tokens)  # already lowercase

    # Find the top x most frequently used types in the document
    top_types = fd_doc_types.most_common(x)

    # Produce a plot showing the top x types and their frequencies
    fd_doc_types.plot(x)

    # Return the top x most frequently used types
    return top_types

##### Solution for question 6 #####

def q6():
    '''Problem: URLs in twitter data

    :rtype: str
    :return: your answer'''

    return """
    The biggest problem I found was the existence of very common words in languages that are not English. For example, the second and third most common words are 'de' and 'que,' which are Spanish for 'of' and 'what' (respectively). I assume that if we included Spanish stopwords or separated the tweets by language, then this problem would be alleviated.
    """

In [12]:
### Question 3
print "*** Question 3 ***"
print "Most common 50 types for the inaugural corpus:"
answer3a = q3(inaugural,inaugural.fileids(),50)
print answer3a
print "Most common 50 types for the twitter corpus:"
answer3b = q3(xtwc,twitter_file_ids,50)
print answer3b
### Question 4
print "*** Question 4 ***"
corpus_tokens = get_corpus_tokens(inaugural,inaugural.fileids())
answer4a = q4(corpus_tokens)
print "Inaugural Speeches:"
print "Number of tokens in original corpus: %s"%len(corpus_tokens)
print "Number of tokens in cleaned corpus: %s"%len(answer4a)
print "First 100 tokens in cleaned corpus:"
print answer4a[:100]
print "-----"
corpus_tokens = get_corpus_tokens(xtwc,twitter_file_ids)
answer4b = q4(corpus_tokens)
print "Twitter:"
print "Number of tokens in original corpus: %s"%len(corpus_tokens)
print "Number of tokens in cleaned corpus: %s"%len(answer4b)
print "First 100 tokens in cleaned corpus:"
print answer4b[:100]
### Question 5
print "*** Question 5 ***"
print "Most common 50 types for the cleaned inaugural corpus:"
answer5a = q5(answer4a, 50)
print answer5a
print "Most common 50 types for the cleaned twitter corpus:"
answer5b = q5(answer4b, 50)
print answer5b
### Question 6
print "*** Question 6 ***"
answer6 = q6()
print answer6

*** Question 3 ***
Most common 50 types for the inaugural corpus:
[(u'the', 9906), (u'of', 6986), (u',', 6840), (u'and', 5139), (u'.', 4676), (u'to', 4432), (u'in', 2749), (u'a', 2193), (u'our', 2058), (u'that', 1726), (u'we', 1625), (u'be', 1460), (u'is', 1416), (u'it', 1367), (u'for', 1154), (u'by', 1066), (u'which', 1002), (u'have', 997), (u'with', 937), (u'as', 931), (u'not', 924), (u'will', 851), (u'i', 832), (u'this', 812), (u'all', 794), (u'are', 779), (u'their', 738), (u'but', 628), (u'has', 612), (u'government', 593), (u'its', 565), (u'people', 563), (u'from', 551), (u';', 544), (u'or', 542), (u'on', 520), (u'my', 491), (u'been', 482), (u'can', 465), (u'us', 455), (u'no', 453), (u'they', 440), (u'so', 383), (u'an', 380), (u'upon', 369), (u'--', 363), (u'who', 361), (u'must', 346), (u'at', 341), (u'may', 334)]
Most common 50 types for the twitter corpus:
[(u'.', 108250), (u':', 80794), (u',', 79419), (u'i', 57861), (u'!', 52645), (u"'", 49816), (u'the', 48974), (u'a', 48204), (

In [13]:
help(LgramModel)

Help on class LgramModel in module twitter:

class LgramModel(nltkx.model.ngram.NgramModel)
 |  Method resolution order:
 |      LgramModel
 |      nltkx.model.ngram.NgramModel
 |      nltkx.model.api.ModelI
 |      __builtin__.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, n, train, pad_left=False, pad_right=False, estimator=None, *estimator_args, **estimator_kwargs)
 |      NgramModel (q.v.) slightly tweaked to produce char-grams,
 |      not word-grams, with a WittenBell default estimator
 |      
 |      :param train: List of strings, which will be converted to list of lists of characters, but more efficiently
 |      :type train: iter(str)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from nltkx.model.ngram.NgramModel:
 |  
 |  __contains__(self, item)
 |  
 |  __getitem__(self, item)
 |  
 |  __repr__(self)
 |  
 |  __str__(self)
 |      x.__str__() <==> str(x)
 |  
 |  __unicode__ = __str__(...)
 |      x.__s

In [14]:
#################### SECTION C: LANGUAGE IDENTIFICATION ####################

##### Solution for question 7 #####

def q7(corpus):
    '''Build a bigram letter language model using LgramModel
       based on the all-alpha subset the entire corpus

    :type corpus: nltk.corpus.CorpusReader
    :param corpus: An NLTK corpus
    :rtype: LgramModel
    :return: A padded letter bigram model based on nltk.model.NgramModel'''

    corpus_tokens = [w.lower() for w in corpus.words() if w.isalpha()]

    bigram_model = LgramModel(2, corpus_tokens, pad_left=True, pad_right=True)

    # Return the letter bigram LM
    return bigram_model

##### Solution for question 8 #####

def q8(file_name,bigram_model):
    '''Using a character bigram model, compute sentence entropies
       for a subset of the tweet corpus, removing all non-alpha tokens and
       tweets with less than 5 all-alpha tokens

    :type file_name: str
    :param file_name: twitter file to process
    :rtype: list(tuple(float,list(str)))
    :return: ordered list of average entropies and tweets'''

    list_of_tweets = xtwc.sents(file_name)

    cleaned_list_of_tweets = []
    for tweet in list_of_tweets:
        cleaned_tweet = [w.lower() for w in tweet if w.isalpha()]
        if len(cleaned_tweet) >= 5:
            cleaned_list_of_tweets.append(cleaned_tweet)
    
    # For each tweet in the cleaned corpus, compute the average word
    #  entropy, and store in a list of tuples of the form: (entropy,tweet)
    list_of_tweets_and_entropies = []
    for tweet in cleaned_list_of_tweets:
        e = np.mean([bigram_model.entropy(w, pad_left=True, pad_right=True, perItem=True) for w in tweet])
        list_of_tweets_and_entropies.append((e, tweet))
            
    
    # Sort the list of (entropy,tweet) tuples by entropy
    list_of_tweets_and_entropies = sorted(list_of_tweets_and_entropies, key=lambda tup: tup[0])

    # Return the sorted list of tuples
    return list_of_tweets_and_entropies

##### Solution for question 9 #####

def q9():
    '''Question: What differentiates the beginning and end of the list
       of tweets and their entropies?

    :rtype: str
    :return: your answer'''

    return """...
    """

##### Solution for question 10 #####

# Output:
def q10(list_of_tweets_and_entropies):
    '''Compute entropy mean, standard deviation and using them,
       likely non-English tweets in the all-ascii subset of list of tweets
       and their biletter entropies

    :type list_of_tweets_and_entropies: list(tuple(float,list(str)))
    :param list_of_tweets_and_entropies: tweets and their
                                    internal average biletter entropy
    :rtype: tuple(float, float, list(tuple(float,list(str)))
    :return: mean, standard deviation, ascii tweets and entropies,
             not-English tweets and entropies'''

    # Find the "ascii" tweets - those in the lowest-entropy 90%
    #  of list_of_tweets_and_entropies
    threshold = int(len(list_of_tweets_and_entropies) * 0.9)
    list_of_ascii_tweets_and_entropies = list_of_tweets_and_entropies[:threshold]

    # Extract a list of just the entropy values
    list_of_entropies = [tup[0] for tup in list_of_ascii_tweets_and_entropies]

    # Compute the mean of entropy values for "ascii" tweets
    mean = np.mean(list_of_entropies)

    # Compute their standard deviation
    standard_deviation = np.std(list_of_entropies)

    # Get a list of "probably not English" tweets, that is, "ascii"
    # tweets with an entropy greater than (mean + (0.674 * std_dev))
    threshold = mean + (0.674 * standard_deviation)
    list_of_not_English_tweets_and_entropies = [tup for tup in list_of_ascii_tweets_and_entropies if tup[0] > threshold]
    
    # sort...
    list_of_not_English_tweets_and_entropies = sorted(list_of_not_English_tweets_and_entropies, key=lambda tup: tup[0])

    # Return the mean and standard_deviation values and the two lists
    return (mean, standard_deviation,
            list_of_ascii_tweets_and_entropies,
            list_of_not_English_tweets_and_entropies)

##### Solution for question 11 #####

def q11(list_of_files, list_of_not_English_tweets_and_entropies):
    '''Build a padded spanish bigram letter bigram model and use it
       to re-sort the probably-not-English data

    :type list_of_files: list(str)
    :param list_of_files: spanish corpus files
    :type list_of_tweets_and_entropies: list(tuple(float,list(str)))
    :param list_of_tweets_and_entropies: tweets and their
                                    internal average biletter entropy
    :rtype: list(tuple(float,list(str)))
    :return: probably-not-English tweets and _spanish_ entropies'''

    # Build a bigram letter language model using "LgramModel"
    corpus_tokens = [w.lower() for w in conll2007.words(list_of_files) if w.isalpha()]
    bigram_model = LgramModel(2, corpus_tokens, pad_left=True, pad_right=True)

    # Compute the entropy of each of the tweets in list (list_of_not_English_tweets_and_entropies) using the new bigram letter language model
    # list_of_not_English_tweets_and_entropies = [(bigram_model.entropy(tup[1], pad_left=True, pad_right=True, perItem=True), tup[1]) for tup in list_of_not_English_tweets_and_entropies]
    tweets = [tup[1] for tup in list_of_not_English_tweets_and_entropies]
    list_of_not_English_tweets_and_entropies = []
    for tweet in tweets:
        e = np.mean([bigram_model.entropy(w, pad_left=True, pad_right=True, perItem=True) for w in tweet])
        list_of_not_English_tweets_and_entropies.append((e, tweet))

    # Sort the new list of (entropy,tweet) tuples
    list_of_not_English_tweets_and_entropies = sorted(list_of_not_English_tweets_and_entropies, key=lambda tup: tup[0])

    # Return the list of tweets with _new_ entropies, re-sorted
    return list_of_not_English_tweets_and_entropies


##### Answers #####

def ppEandT(eAndTs):
    '''Pretty print a list of entropy+tweet pairs

    :type eAndTs: list(tuple(float,list(str)))
    :param eAndTs: entropies and tweets
    :return: None'''

    for entropy,tweet in eAndTs:
        print (u"%.3f {%s}"%(entropy,", ".join(tweet))).encode("utf-8")

In [15]:
### Question 7
print "*** Question 7: building brown bigram letter model ***"
brown_bigram_model = q7(brown) ########################################
### Question 8
print "*** Question 8 ***"
answer8 = q8("20100128.txt",brown_bigram_model)
print "Best 10 entropies:"
ppEandT(answer8[:10])
print "Worst 10 entropies:"
ppEandT(answer8[-10:])
### Question 9
print "*** Question 9 ***"
answer9 = q9()
print answer9
### Question 10
print "*** Question 10 ***"
answer10 = q10(answer8)
print "Mean: %s"%answer10[0]
print "Standard Deviation: %s"%answer10[1]
print "=================="
print "'Ascii' tweets: Best 10 entropies:"
ppEandT(answer10[2][:10])
print "=================="
print "'Ascii' tweets: Worst 10 entropies:"
ppEandT(answer10[2][-10:])
print "=================="
print "Probably not English tweets: Best 10 entropies:"
ppEandT(answer10[3][:10])
print "=================="
print "Probably not English tweets: Worst 10 entropies:"
ppEandT(answer10[3][-10:])
### Question 11
print "*** Question 11 ***"
list_of_not_English_tweets_and_entropies = answer10[3]
answer11 = q11(["esp.test","esp.train"],list_of_not_English_tweets_and_entropies)
print "Best 10 entropies:"
ppEandT(answer11[:10])
print "Worst 10 entropies:"
ppEandT(answer11[-10:])

*** Question 7: building brown bigram letter model ***
*** Question 8 ***
Best 10 entropies:
2.492 {and, here, is, proof, the}
2.539 {and, bailed, he, here, is, man, on, that, the}
2.558 {is, the, this, weather, worst}
2.569 {s, s, s, s, s, s, s, s, s, s}
2.570 {be, bus, here, the, to, want}
2.577 {hell, that, the, was, wat}
2.588 {creation, is, of, on, story, the, the}
2.589 {fro, one, the, the, with}
2.595 {is, money, motive, the, the}
2.618 {at, bucks, end, lead, of, the, the, the}
Worst 10 entropies:
17.524 {作品によっては怪人でありながらヒーロー, あるいはその逆, というシチュエーションも多々ありますが, そうした事がやれるのもやはり怪人とヒーローと言うカテゴリが完成しているからだと思うんですよね, あれだけのバリエーションがありながららしさを失わないデザインにはまさに感服です}
17.525 {ロンブーの淳さんはスピリチュアルスポット, セドナーで瞑想を実践してた, これらは偶然ではなく必然的に起こっている, 自然は全て絶好のタイミングで教えてくれている, そして今が今年最大の大改革時期だ}
17.526 {実物経済と金融との乖離を際限なく広げる, レバレッジが金融で儲けるコツだと, まるで正義のように叫ぶ連中が多いけど, これほど不健全な金融常識はないと思う, 連中は不健全と知りながら, 他の奴がやるから出し抜かれる前に出し抜くのが道理と言わんばかりに群がる}
17.528 {一応ワンセット揃えてみたんだけど, イマイチ効果を感じないのよね, それよりはオーラソーマとか, 肉体に直接働きかけるタイプのアプローチの方が効き目を感じ取りやすい, 波動系