In [23]:
from collections import Counter

import pandas as pd

# Read the raw corpus and keep only its 'text' column for the steps below.
spooky = pd.read_csv('spooky.csv')
text = spooky.loc[:, 'text']
text.head()

0    This process, however, afforded me no means of...
1    It never once occurred to me that the fumbling...
2    In his left hand was a gold snuff box, from wh...
3    How lovely is spring As we looked from Windsor...
4    Finding nothing else, not even gold, the Super...
Name: text, dtype: object

In [5]:
# str.isalpha() is True only for the letter; the digit and the comma both give False.
'2'.isalpha(), 'a'.isalpha(), ','.isalpha()

(False, True, False)

In [10]:
import numpy as np
# Toy 10x2 integer frame holding 0..19, filled row by row.
df = pd.DataFrame(
    np.arange(20).reshape(-1, 2),
    columns=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11
6,12,13
7,14,15
8,16,17
9,18,19


In [12]:
# Series.apply calls the lambda on each element, here squaring every value of col2.
df['col2'].apply(lambda x: x**2)

0      1
1      9
2     25
3     49
4     81
5    121
6    169
7    225
8    289
9    361
Name: col2, dtype: int64

由于 `str.split()` 每次只能按某一个特定的分隔符分割, 所以我们先把非字母字符都替换成空格, 再统一按空白分割

In [6]:
# Every character found in the corpus that is neither a letter nor a plain space.
toreplace = {
    ch
    for sentence in text
    for ch in sentence
    if not ch.isalpha() and ch != ' '
}
toreplace

{'"', "'", ',', '.', ':', ';', '?'}

In [24]:
def replace_all(string, toreplace, replace_to):
    """Return ``string`` with every item of ``toreplace`` swapped for ``replace_to``.

    Args:
        string: Text to clean.
        toreplace: Iterable of substrings to replace; each one is handled by a
            separate ``str.replace`` pass, in iteration order.
        replace_to: Text inserted in place of every occurrence.

    Returns:
        The cleaned string. The input string itself is left untouched.
    """
    cleaned = string
    for target in toreplace:
        cleaned = cleaned.replace(target, replace_to)
    return cleaned

# Replace every collected non-letter character with a space, then split each
# sentence into a list of tokens.
# Tokenise from the raw column instead of from `text` itself: once `text` holds
# lists, running `text.apply(...)` a second time fails because lists have no
# .replace, so the self-referential form could only be executed once per load.
text = spooky['text'].apply(
    lambda sentence: replace_all(sentence, toreplace, ' ').split()
)
text.head()

0    [This, process, however, afforded, me, no, mea...
1    [It, never, once, occurred, to, me, that, the,...
2    [In, his, left, hand, was, a, gold, snuff, box...
3    [How, lovely, is, spring, As, we, looked, from...
4    [Finding, nothing, else, not, even, gold, the,...
Name: text, dtype: object

In [29]:
# Lower-cased vocabulary of the whole corpus, without duplicates.
# Sorted so that the list (and anything indexed by it, such as the columns of X)
# comes out in the same order on every run; the order of a plain list(set(...))
# of strings can change from one kernel session to the next.
words = sorted({word.lower() for line in text for word in line})
len(words)

25095

In [35]:
def word_count(li):
    """Count how many times each item occurs in ``li``.

    Args:
        li: Iterable of hashable items (here, the tokens of one sentence).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in order of first appearance. An empty input
        gives an empty dict.
    """
    # Counter does the tallying; convert back so callers keep getting a dict.
    return dict(Counter(li))
            
# Per-sentence token frequencies: one dict per row of `text`.
text_freq = text.apply(word_count)
text_freq.head()

0    {'seemed': 1, 'perfectly': 1, 'wall': 1, 'to':...
1    {'never': 1, 'It': 1, 'mistake': 1, 'to': 1, '...
2    {'hill': 1, 'all': 1, 'self': 1, 'steps': 1, '...
3    {'spread': 1, 'towns': 1, 'lovely': 1, 'former...
4    {'occasionally': 1, 'abandoned': 1, 'countenan...
Name: text, dtype: object

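`X` is the word-count matrix: one row per sentence and one column per vocabulary word, so each row is that sentence's word vector.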

In [None]:
# Zero-filled count matrix: one row per sentence, one column per vocabulary word.
X = pd.DataFrame(0, index=text.index, columns=words)
X.head()

In [None]:
# Fill the count matrix from the per-sentence frequency dicts.
# The columns of X are lower-cased, while the keys of text_freq keep their
# original case, so each key is lower-cased before the lookup. Without that,
# X.loc[row, 'It'] = n would silently append a new 'It' column instead of
# counting towards 'it'.
for row_i in X.index:
    sentence_counts = text_freq.loc[row_i]
    for word, n in sentence_counts.items():
        # += so that case variants of one word ('It', 'it') accumulate;
        # reading the cell first also means an unknown word raises KeyError
        # rather than growing the frame.
        X.at[row_i, word.lower()] += n

X.head()

`str.split()` 与 `str.split(' ')`

In [13]:
# With no argument, split() treats a whole run of whitespace as one separator.
'a   b c'.split()

['a', 'b', 'c']

In [14]:
# With an explicit ' ', every single space separates, so consecutive spaces yield empty strings.
'a   b c'.split(' ')

['a', '', '', 'b', 'c']

In [63]:
# Read the stop-word file and flatten its cells into a 1-D array.
stopwords = pd.read_csv('stopwords.csv').values.ravel()
stopwords

array(['hers', 'between', 'yourself', 'but', 'again', 'there', 'about',
       'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own',
       'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
       'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or',
       'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until',
       'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don',
       'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down',
       'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to',
       'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before',
       'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does',
       'yourselves', 'then', 'that', 'because', 'what', 'over', 'why',
       'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself',
       'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those',
       'i', 'after', 'few', 'whom', 't', 'being', 'if', 'their