In [74]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences   
from tqdm import tqdm
import torch.nn as nn

In [75]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# **2. Data cleanning** 

In [76]:
def remove_URL(text):
    text_new = re.sub(r'https?://\S+|www\.\S+', r'', text)
    # url = re.compile(r'https?://\S+|www\.\S+')
    return text_new

train_data['text'] = train_data['text'].apply(lambda x : remove_URL(x))
# print(train_data['text'])

In [77]:
def remove_html(text):
    # html = re.compile(r'<.*?>')
    text_new = re.sub(r'<.*?>', r'', text)
    return text_new

train_data['text'] = train_data['text'].apply(lambda x : remove_html(x))

In [78]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

train_data['text'] = train_data['text'].apply(lambda x : remove_punct(x))

In [79]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"  
                           u"\U0001F1E0-\U0001F1FF"  
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

train_data['text'] = train_data['text'].apply(lambda x: remove_emoji(x))

# **3. Data preprocessing**

In [80]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()              
tokenizer.fit_on_texts(train_data.text)   
word_index = tokenizer.word_index 

In [81]:
# print('Number of unique words:',len(word_index))

In [82]:
training_sequences = tokenizer.texts_to_sequences(train_data.text)  
MAX_LEN=20    
training_padded = pad_sequences(training_sequences, 
                                   maxlen=MAX_LEN,          
                                   padding='post',       
                                   truncating='post')      

In [83]:
embedding_dict={}

with open('./glove.6B.100d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [84]:
num_words = len(word_index)+1
embedding_dim = 100
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in tqdm(word_index.items()):
    if i < num_words:
        embedding_vector = embedding_dict.get(word)  
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

embedding_matrix.shape

100%|██████████| 18104/18104 [00:00<00:00, 457804.12it/s]


(18105, 100)

In [85]:
print(training_padded)

[[ 109 4493   20 ...    0    0    0]
 [ 179   41  218 ...    0    0    0]
 [  38 1694 1570 ...    3  651 1351]
 ...
 [3301 4485 6707 ...    0    0    0]
 [  75 1102   37 ... 2563  296    0]
 [   1  199   51 ...    0    0    0]]


## **4. Build my model**

In [86]:
from keras.models import Sequential 
from keras.layers import Embedding,Dense,Dropout,LSTM
from keras import optimizers,initializers

In [87]:
def create_model():
    model = Sequential() 
    model.add(Embedding(input_dim=num_words,
                        output_dim=100,
                        embeddings_initializer=initializers.Constant(embedding_matrix), 
                        input_length=MAX_LEN,trainable=False))
    model.add(Dropout(0.2)) 
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(128, activation='sigmoid')) 
    model.add(Dropout(0.2)) 
    model.add(Dense(64, activation='sigmoid')) 
    model.add(Dropout(0.2)) 
    model.add(Dense(1, activation='sigmoid')) 
    model.compile(loss='binary_crossentropy',  
                  optimizer='adam',              
                  metrics=['accuracy'])          
    return model

In [88]:
model=create_model()
model.summary()

Model: "sequential_246"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_246 (Embedding)    (None, 20, 100)           1810500   
_________________________________________________________________
dropout_410 (Dropout)        (None, 20, 100)           0         
_________________________________________________________________
lstm_246 (LSTM)              (None, 64)                42240     
_________________________________________________________________
dense_574 (Dense)            (None, 128)               8320      
_________________________________________________________________
dropout_411 (Dropout)        (None, 128)               0         
_________________________________________________________________
dense_575 (Dense)            (None, 64)                8256      
_________________________________________________________________
dropout_412 (Dropout)        (None, 64)             

In [92]:
from keras.wrappers.scikit_learn import KerasClassifier 
from sklearn.model_selection import GridSearchCV 
model = KerasClassifier(build_fn=create_model, verbose=0) 
batch_size = [5, 10, 20, 50, 100] 
epochs = [5, 10, 15, 20, 35, 50] 
param_grid = dict(batch_size=batch_size, nb_epoch=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=1)

In [93]:
grid_result = grid.fit(training_padded, train_data['target'].values)
results= pd.DataFrame(grid_result.cv_results_)

In [94]:
print("Cross accuracy：\n", grid_result.best_score_)
print("best parameter\n", grid_result.best_params_)

Cross accuracy：
 0.7928552389144897
best parameter
 {'batch_size': 20, 'nb_epoch': 35}


# **5. Baeline models**

In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(training_padded, train_data['target'].values, test_size=0.2)

In [96]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm,neighbors,neural_network,naive_bayes, ensemble
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import *


model1= svm.SVC()
model1.fit(X_train,y_train)
predict1=model1.predict(X_test)

model2=neighbors.KNeighborsClassifier()
model2.fit(X_train,y_train)

model3=neural_network.MLPClassifier(max_iter=1000)
model3.fit(X_train,y_train)

model4=naive_bayes.GaussianNB()
model4.fit(X_train,y_train)

model5=ensemble.RandomForestClassifier()
model5.fit(X_train,y_train)

model6=ensemble.GradientBoostingClassifier()
model6.fit(X_train,y_train)

print('Model 1 Accuracy:',model1.score(X_test, y_test))
print('Model 2 Accuracy:',model2.score(X_test, y_test))
print('Model 3 Accuracy:',model3.score(X_test, y_test))
print('Model 4 Accuracy:',model4.score(X_test, y_test))
print('Model 5 Accuracy:',model5.score(X_test, y_test))
print('Model 6 Accuracy:',model6.score(X_test, y_test))

Model 1 Accuracy: 0.5673013788575181
Model 2 Accuracy: 0.582403151674327
Model 3 Accuracy: 0.5384110308601444
Model 4 Accuracy: 0.5331582403151675
Model 5 Accuracy: 0.6815495732107683
Model 6 Accuracy: 0.6631648063033486


In [97]:
submission = {}
testing_sequences = tokenizer.texts_to_sequences(test_data.text)
testing_padded = pad_sequences(testing_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
predictions = grid_result.predict(testing_padded)
# submission = (predictions > 0.5).astype(int)
test_data['target'] = pd.Series(predictions.reshape(1, -1)[0])
test_data['target']

0       1
1       1
2       1
3       1
4       1
       ..
3258    1
3259    1
3260    0
3261    0
3262    0
Name: target, Length: 3263, dtype: int64

In [98]:
submission = pd.concat([test_data['id'], test_data['target']], axis=1)
submission.to_csv("submission.csv", index=False, header=True)

In [99]:
print(submission)

         id  target
0         0       1
1         2       1
2         3       1
3         9       1
4        11       1
...     ...     ...
3258  10861       1
3259  10865       1
3260  10868       0
3261  10874       0
3262  10875       0

[3263 rows x 2 columns]
