PRE720.py
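"""Bag-of-words topic classification baseline (PRE720).

Tokenizes titles and bodies, removes stopwords, builds a binary
word-presence feature matrix, and evaluates scikit-learn classifiers
on a TRAIN/TEST split.
"""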
from collections import Counter
import re

import nltk
import numpy as np
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from corpus import filtered_corpus
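
# NOTE: filtered_corpus is project-local; it is assumed to yield
# (split, topic, title, text) tuples, with split either "TRAIN" or "TEST".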

def preprocess_docs():
    """Tokenize each document and drop English stopwords.

    Yields (split, topic, tokens), where the tokens are the body words
    followed by the title words.
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))
    for split, topic, title, text in filtered_corpus():
        text = [w for w in nltk.word_tokenize(text) if w.lower() not in stopwords]
        title = [w for w in nltk.word_tokenize(title) if w.lower() not in stopwords]
        yield split, topic, text + title

def get_features(mode, attributes=None, topics=None):
    """Build a binary term-presence matrix and label vector for one split.

    On the TRAIN split, leave `attributes` and `topics` as None so the
    vocabulary and topic mappings are built from the data; on the TEST
    split, pass in the mappings returned by the TRAIN call so that the
    feature columns and class indices line up.
    """
    assert mode in ["TRAIN", "TEST"]
    # Read the preprocessed documents for the requested split.
    corpus = [(topic, words) for split, topic, words in preprocess_docs() if split == mode]

    if attributes is None:
        # Count word occurrences across the split.
        attr_counter = Counter()
        for _, words in corpus:
            attr_counter.update(words)
        # Keep purely alphabetic words that occur at least 3 times,
        # excluding the 50 most frequent (a crude stopword filter).
        most_common = set(word for word, _ in attr_counter.most_common(50))
        non_alpha = re.compile(r'[^a-zA-Z]')
        selected = set()
        for _, words in corpus:
            for word in words:
                if non_alpha.search(word) is not None:
                    continue
                if attr_counter[word] < 3:
                    continue
                if word in most_common:
                    continue
                selected.add(word)
        # Map each attribute word to a stable column index.
        attributes = {word: col for col, word in enumerate(sorted(selected))}

    if topics is None:
        # Map each topic label to a class index.
        topics = {topic: i for i, topic in enumerate(sorted(set(t for t, _ in corpus)))}

    # Fill the binary document-term matrix and the label vector.
    X = np.zeros((len(corpus), len(attributes)), dtype='int')
    Y = [0 for _ in range(len(corpus))]
    for row, (topic, words) in enumerate(corpus):
        if topic not in topics:
            # Topic unseen at training time: skip, leaving the default label.
            continue
        for word in words:
            if word in attributes:
                X[row, attributes[word]] = 1
        Y[row] = topics[topic]
    return X, Y, attributes, topics

if __name__ == "__main__":
    print("Creating input...")
    Xtrain, Ytrain, attributes, topics = get_features("TRAIN")
    # Reuse the training-time mappings so the test matrix is aligned.
    Xtest, Ytest, _, _ = get_features("TEST", attributes, topics)
    print("Testing...")
    # MultinomialNB, DecisionTreeClassifier and RandomForestClassifier can be
    # dropped into this list to compare classifiers.
    for clf_class in [LinearSVC]:
        print("**classification_report**")
        print(clf_class)
        clf = clf_class()
        clf.fit(Xtrain, Ytrain)
        Ypred = clf.predict(Xtest)
        print(metrics.accuracy_score(Ytest, Ypred))
        print(metrics.confusion_matrix(Ytest, Ypred))
        print(metrics.classification_report(
            Ytest, Ypred,
            labels=list(topics.values()),
            target_names=list(topics.keys())))