#### First I used Sublime Text to re-encode the text file as "utf-8".

In [16]:
import re
import collections
import numpy as np
import pandas as pd

with open('LSA.txt', 'r') as f:
    data = f.read().lower()
    clean_data = re.sub(r'[,"./()?:!]', '', data)
    result = {}
    for line in clean_data.splitlines():
        if not line:
            continue  # skip blank lines
        elif line.startswith('xxx'):
            doc_num = 'tf{}'.format(line[3:])  # skip the "XXX" and name the docs as "tf1 - tf11"
        else:
            result[doc_num] = collections.Counter(line.split())
 
    list(result.keys())

for k, v in list(result['tf4'].items()):  # select any dicts from result to see if it is correct
    print("'{}': {}".format(k, v))
    
df = pd.DataFrame(result, columns = list(result.keys()))
df = df.fillna(0).astype(int)  # change NaNs to 0 and only take integers

print(df)  # see the TD - Term Frequency data frame we get

df.to_csv('tdm.txt', index=True, sep='\t', header=True, mode = 'a')  # output as text file

np_df = df.as_matrix()  # if we wanna turn it into a numpy matrix
print(np_df)

def returnTermFrequency (term, document):  #  a program of "returnTermFrequency"
    TermFrequency = df.loc[str(term), str(document)]
    print(TermFrequency)
    
returnTermFrequency ('will', 'tf1')

'in': 6
'the': 24
'first': 1
'week's': 1
'class': 7
'we': 7
'introduce': 1
'ourselves': 1
'and': 17
'reason': 1
'of': 13
'picking': 1
'this': 4
'after': 3
'that': 5
'dr': 2
'rashed': 2
'iqbal': 2
'introduced': 1
'main': 1
'contents': 1
'i': 9
'felt': 1
'really': 5
'interested': 2
'job': 1
'market': 2
'how': 1
'to': 5
'become': 2
'a': 7
'data': 7
'scientist': 2
'also': 1
'found': 3
'examples': 1
'case': 1
'studies': 1
'would': 2
'be': 2
'interesting': 3
'help': 2
'me': 1
'lot': 2
'learn': 1
'python': 2
'then': 1
'told': 1
'us': 2
'something': 1
'about': 2
'for': 2
'science': 4
'professionals': 1
'he': 2
'gave': 1
'brief': 2
'introduction': 3
'morgan': 1
'stanley': 1
'ironwood': 1
'airbnb': 1
'discussed': 2
'relationship': 1
'between': 3
'economy': 1
'regard': 1
'demand': 1
'as': 3
'most': 1
'part': 1
'keep': 1
'believing': 1
'truth': 2
'world': 1
'our': 2
'society': 2
'is': 1
'behind': 1
'which': 1
'can': 3
'hardly': 1
'measure': 1
'understand': 2
'before': 2
'use': 1
'analyze': 1
'prog

In [15]:
import math

term_idf = df.astype(bool).sum(axis=1)  # get the number of which frequency is not 0 in every row

idf = pd.DataFrame({k : [math.log(11/v)] for (k,v) in term_idf.items()})  # build the dataframe
idf = idf.T  # switch column and rows
idf.columns = ['idf']  # remane column as "idf"
print(idf)

tfidf = pd.DataFrame(df.values*idf.values, columns=df.columns, index=df.index)  # build TF-IDF data frame
print(tfidf)

tfidf.to_csv('tfidf.txt', index=True, sep='\t', header=True, mode = 'a')  # output as text file

def returnTFIDF (term, document):  #  a program of "returnTFIDF"
    TFIDF = tfidf.loc[str(term), str(document)]
    print(TFIDF)
    
returnTFIDF ('will', 'tf1')

                 idf
&           2.397895
-           1.299283
1           1.704748
10          2.397895
15          1.704748
190000      1.704748
2           2.397895
2018        1.704748
21st        2.397895
25          2.397895
3           2.397895
50          2.397895
a           0.000000
abandon     2.397895
abilities   2.397895
about       0.318454
academic    2.397895
accomplish  2.397895
according   1.704748
achieve     2.397895
acquire     1.704748
actionable  1.704748
activities  2.397895
activity    0.788457
actual      1.704748
actually    1.011601
addition    2.397895
ads         2.397895
advising    2.397895
affected    2.397895
...              ...
weeks       1.704748
weigh       2.397895
well        1.011601
went        2.397895
were        0.606136
what        0.318454
what's      1.704748
when        1.299283
where       2.397895
whether     2.397895
which       0.451985
while       1.704748
who         0.451985
why         1.011601
wide        2.397895
will        0

##### Extra Credit!

In [11]:
with open('LSA.txt', 'r') as f:
    data = f.read().lower()
    clean_data = re.sub(r'[,"./()?:!]', '', data)

blacklist = ["as", "is", "im", "or", "they", "those", "what", "this", "that", "can"]  # Blacklist of words to be filtered out
for word in blacklist:
    clean_data = clean_data.replace(word, "")
    
    result = {}
    for line in clean_data.splitlines():
        if not line:
            continue  # skip blank lines
        elif line.startswith('xxx'):
            doc_num = 'tf{}'.format(line[3:])  # skip the "XXX" and name the docs as "tf1 - tf11"
        else:
            result[doc_num] = collections.Counter(line.split())
 
    list(result.keys())

for k, v in list(result['tf4'].items()):  # select any dicts from result to see if it is correct
    print("'{}': {}".format(k, v))
    
df = pd.DataFrame(result, columns = list(result.keys()))
df = df.fillna(0).astype(int)

print(df)  # see the TD - Term Frequency data frame we get

# df.to_csv('tdm.txt', index=True, sep='\t', header=True, mode = 'a')  # output as text file

np_df = df.as_matrix()  # if we wanna turn it into a numpy matrix
print(np_df)

def returnTermFrequency (term, document):  #  a program of "returnTermFrequency"
    TermFrequency = df.loc[str(term), str(document)]
    print(TermFrequency)
    
returnTermFrequency ('will', 'tf1')

'in': 6
'the': 24
'first': 1
'week's': 1
'cls': 7
'we': 7
'introduce': 1
'ourselves': 1
'and': 17
'reon': 1
'of': 13
'picking': 1
'th': 4
'after': 3
'dr': 2
'rhed': 2
'iqbal': 2
'introduced': 1
'main': 1
'contents': 1
'i': 9
'felt': 1
'really': 5
'interested': 2
'job': 1
'market': 2
'how': 1
'to': 5
'become': 2
'a': 7
'data': 7
'scientt': 2
'also': 1
'found': 3
'examples': 1
'ce': 1
'studies': 1
'would': 2
'be': 2
'interesting': 3
'help': 2
'me': 1
'lot': 2
'learn': 1
'python': 2
'then': 1
'told': 1
'us': 2
'something': 1
'about': 2
'f': 2
'science': 4
'professionals': 1
'he': 2
'gave': 1
'brief': 2
'introduction': 3
'mgan': 1
'stanley': 1
'ironwood': 1
'airbnb': 1
'dcussed': 2
'relationship': 1
'between': 3
'economy': 1
'regard': 1
'demand': 1
'most': 1
'part': 1
'keep': 1
'believing': 1
'truth': 2
'wld': 1
'our': 2
'society': 2
'behind': 1
'which': 1
'hardly': 1
'meure': 1
'understand': 2
'befe': 2
'use': 1
'analyze': 1
'programs': 1
'with': 3
'r': 1
'other': 1
'computer': 2
'languag

In [3]:
lsa_Wang.ipynb Download as python(.py)

SyntaxError: invalid syntax (<ipython-input-3-ffd2ea14255c>, line 1)