# prediction.py
from keras.preprocessing import text, sequence
from keras.models import load_model
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer

# Tokenizer used during cleaning; named so it is not shadowed by the Keras
# Tokenizer created further down.
tweet_tokenizer = TweetTokenizer()

# English stopword list (NLTK's), inlined so the script has no nltk.download dependency
stopwords = ['a','about','above','after','again','against','ain','all','am','an','and','any','are','aren',"aren't",'as','at','be','because','been','before','being','below','between','both','but','by','can','couldn',"couldn't",'d','did','didn',"didn't",'do','does','doesn',"doesn't",'doing','don',"don't",'down','during','each','few','for','from','further','had','hadn',"hadn't",'has','hasn',"hasn't",'have','haven',"haven't",'having','he','her','here','hers','herself','him','himself','his','how','i','if','in','into','is','isn',"isn't",'it',"it's",'its','itself','just','ll','m','ma','me','mightn',"mightn't",'more','most','mustn',"mustn't",'my','myself','needn',"needn't",'no','nor','not','now','o','of','off','on','once','only','or','other','our','ours','ourselves','out','over','own','re','s','same','shan',"shan't",'she',"she's",'should',"should've",'shouldn',"shouldn't",'so','some','such','t','than','that',"that'll",'the','their','theirs','them','themselves','then','there','these','they','this','those','through','to','too','under','until','up','ve','very','was','wasn',"wasn't",'we','were','weren',"weren't",'what','when','where','which','while','who','whom','why','will','with','won',"won't",'wouldn',"wouldn't",'y','you',"you'd","you'll","you're","you've",'your','yours','yourself','yourselves']
# Contraction -> expansion map applied before stopword filtering
APPO = {"aren't": "are not", "can't": "cannot", "couldn't": "could not", "didn't": "did not",
        "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
        "i'd": "I would", "i'll": "I will", "i'm": "I am", "isn't": "is not", "it's": "it is",
        "it'll": "it will", "i've": "I have", "let's": "let us", "mightn't": "might not",
        "mustn't": "must not", "shan't": "shall not", "she'd": "she would", "she'll": "she will",
        "she's": "she is", "shouldn't": "should not", "that's": "that is", "there's": "there is",
        "they'd": "they would", "they'll": "they will", "they're": "they are", "they've": "they have",
        "we'd": "we would", "we're": "we are", "weren't": "were not", "we've": "we have",
        "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have",
        "where's": "where is", "who'd": "who would", "who'll": "who will", "who're": "who are",
        "who's": "who is", "who've": "who have", "won't": "will not", "wouldn't": "would not",
        "you'd": "you would", "you'll": "you will", "you're": "you are", "you've": "you have",
        "'re": " are", "wasn't": "was not", "we'll": "we will", "tryin'": "trying"}
def clean(comment):
    """
    Receive a raw comment and return a cleaned, stopword-free string.
    """
    # Lowercase so that "Hi" and "hi" are treated the same
    comment = comment.lower()
    # Replace newlines with spaces
    comment = re.sub(r"\n", " ", comment)
    # Remove leaky elements such as IP addresses
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # Remove [[User:...]]-style usernames
    comment = re.sub(r"\[\[.*\]", "", comment)
    # Split the sentence into words
    words = tweet_tokenizer.tokenize(comment)
    # Apostrophe/contraction replacement, e.g. you're --> you are
    words = [APPO[word] if word in APPO else word for word in words]
    # Drop stopwords
    words = [w for w in words if w not in stopwords]
    return " ".join(words)
def clean_with_stops(comment):
    """
    Receive a raw comment and return a cleaned string, keeping stopwords.
    """
    # Lowercase so that "Hi" and "hi" are treated the same
    comment = comment.lower()
    # Replace newlines with spaces
    comment = re.sub(r"\n", " ", comment)
    # Remove leaky elements such as IP addresses
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # Remove [[User:...]]-style usernames
    comment = re.sub(r"\[\[.*\]", "", comment)
    # Split the sentence into words
    words = tweet_tokenizer.tokenize(comment)
    # Apostrophe/contraction replacement, e.g. you're --> you are
    words = [APPO[word] if word in APPO else word for word in words]
    return " ".join(words)
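# Quick illustration of the two cleaners on a made-up comment (output worked
# out by hand from the rules above, not taken from the original script):
#   clean("You're not gonna believe this!\n")            -> "you are gonna believe !"
#   clean_with_stops("You're not gonna believe this!\n") -> "you are not gonna believe this !"
# Note that "you're" expands to the single token "you are", which therefore
# survives the stopword filter.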
# Alternative path: clean the raw test set on the fly instead of loading the
# pre-cleaned CSV below.
#test = pd.read_csv('data/test.csv')
#test['comment_text'] = test['comment_text'].fillna(' ').apply(clean)
#test = test['comment_text'].str.lower()
test = pd.read_csv('data/clean_test.csv')
test['comment_text'] = test['comment_text'].fillna(' ')
test = test['comment_text'].str.lower()

# The training text is loaded only so the Keras tokenizer can be re-fitted
# with the same vocabulary the model saw at training time.
train = pd.read_csv('data/clean_train.csv')
train['comment_text'] = train['comment_text'].fillna(' ')
train_x = train['comment_text'].str.lower()
# Vectorize the text: map words to the integer indices the embedding layer expects.
# num_words and maxlen must match the values used when the model was trained.
tokenizer = text.Tokenizer(num_words=100000, lower=True)
tokenizer.fit_on_texts(train_x.values)
test = tokenizer.texts_to_sequences(test)
test = sequence.pad_sequences(test, maxlen=150)
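# `test` is now an integer matrix of shape (num_comments, 150): words outside
# the 100,000-word vocabulary are dropped by texts_to_sequences, and shorter
# comments are left-padded with zeros (the pad_sequences default).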
# Load the trained model and predict scores for the six labels
model = load_model('lstm_fasttext_epoch_5_150d.h5')
predictions = model.predict(test, batch_size=32, verbose=1)

# Write the predictions into the sample submission layout
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
output = pd.read_csv('data/sample_submission.csv')
output[label_cols] = predictions
output.to_csv('output.csv', index=False)
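# Optional sanity check (illustrative, not part of the original pipeline):
# assuming the model ends in sigmoid outputs, every entry should be a
# probability in [0, 1].
# assert ((output[label_cols].values >= 0.0) & (output[label_cols].values <= 1.0)).all()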