# MULTICLASS SUPERVISE LEARNING

In [1]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.layers import Flatten, LSTM
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input

import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
import string
import nltk
from nltk.tokenize import word_tokenize

In [4]:
from numpy import array
from numpy import asarray
from numpy import zeros

# 1. Get data

In [50]:
df = pd.read_csv("C:/Users/raymo/code/pin-lpt/uncancelled_master/data/wiki_edit_content_contro_cat.csv")

In [51]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [52]:
df.head()

Unnamed: 0,page_title,content,controversy,rank,total,month,clean_text,category
0,"""Polish death camp"" controversy","""Polish death camp"" and ""Polish concentration ...",1,29,529,2/1/2018,polish death camp polish concentr camp controv...,Politics
1,'Ubadah ibn al-Samit,`Ubadah ibn al-Samit (Arabic: عبادة بن الصامت‎...,1,79,372,1/1/2020,companion chieftain ansar confeder battl offic...,Other
2,116th United States Congress,The 116th United States Congress is the curren...,1,17,651,11/1/2018,unit current meet legisl branch unit feder gov...,Politics
3,13 Reasons Why,13 Reasons Why (stylized onscreen as TH1RTEEN ...,1,22,641,4/1/2017,teen drama web televis seri thirteen seri high...,Entertainment
4,1721 Boston smallpox outbreak,"In 1721, Boston experienced its worst outbreak...",1,57,360,6/1/2019,boston outbreak smallpox variola outbreak moti...,Pandemic


In [53]:
data = df.copy()

# 2. Preprocessing

#### 2.1 encoding

In [62]:
data["category"].unique()

array(['Politics', 'Other', 'Entertainment', 'Pandemic', 'Sports',
       'Weather', 'Terrorism', 'Tech'], dtype=object)

In [65]:
encoder = OneHotEncoder(sparse=False)
encoder.fit(df[['category']])
cat_oh = encoder.transform(df[['category']])
encoder.categories_

[array(['Entertainment', 'Other', 'Pandemic', 'Politics', 'Sports', 'Tech',
        'Terrorism', 'Weather'], dtype=object)]

In [66]:
data['cat_entertainment'], data['cat_other'],data['cat_pandemic'],data['cat_politics'],data['cat_sports'],data['cat_tech'],data['cat_terrorism'],data['cat_weather'] = cat_oh.T

In [67]:
data.head()

Unnamed: 0,page_title,content,controversy,rank,total,month,clean_text,category,cat_entertainment,cat_other,cat_pandemic,cat_politics,cat_sports,cat_tech,cat_terrorism,cat_weather
0,"""Polish death camp"" controversy","""Polish death camp"" and ""Polish concentration ...",1,29,529,2/1/2018,polish death camp polish concentr camp controv...,Politics,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,'Ubadah ibn al-Samit,`Ubadah ibn al-Samit (Arabic: عبادة بن الصامت‎...,1,79,372,1/1/2020,companion chieftain ansar confeder battl offic...,Other,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,116th United States Congress,The 116th United States Congress is the curren...,1,17,651,11/1/2018,unit current meet legisl branch unit feder gov...,Politics,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,13 Reasons Why,13 Reasons Why (stylized onscreen as TH1RTEEN ...,1,22,641,4/1/2017,teen drama web televis seri thirteen seri high...,Entertainment,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1721 Boston smallpox outbreak,"In 1721, Boston experienced its worst outbreak...",1,57,360,6/1/2019,boston outbreak smallpox variola outbreak moti...,Pandemic,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### 2.2 remove punctuation

In [68]:
# remove punctuation
def rev_punc(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation,'')
    return text

In [69]:
data["text"] = data["text"].apply(rev_punc)

KeyError: 'text'

In [None]:
data["text"].head()

#### 2.3 lower case

In [4]:
# lower case
def cre_low(text):
    word_tokens = word_tokenize(text)
    text = [w.lower() for w in word_tokens]
    return ' '.join(text)

In [None]:
data["text"] = data["text"].apply(cre_low)

In [None]:
data["text"].head()

#### 2.4 remove number

In [None]:
# remove number
def rev_num(text):
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w.isdigit()]
    return ' '.join(text)

In [None]:
data["text"] = data["text"].apply(rev_num)

In [None]:
data["text"].head()

#### 2.5 remove not english text

In [None]:
#remove not english
def rev_not_en(text):
    words = set(nltk.corpus.words.words())
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if w in words]
    return ' '.join(text)

In [None]:
data["text"] = data["text"].apply(rev_not_en)

In [None]:
data["text"].head()

#### 2.6 remove stop word

In [None]:
# remove stopword
def rev_stop(text):
    with open('english_stopwords.txt', 'r') as f:
        stop_words = [line.strip() for line in f]
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words]
    return ' '.join(text)

In [None]:
data["text"] = data["text"].apply(rev_stop)

In [None]:
data["text"].head()

#### 2.7 Stemmtize

In [None]:
# stem
def word_lem(text):
    stemmer = SnowballStemmer("english")
    word_tokens = word_tokenize(text)
    text = [stemmer.stem(w) for w in word_tokens]
    return ' '.join(text)

In [None]:
data["text"] = data["text"].apply(word_lem)

In [None]:
data["text"].head()

#### 2.8 Drop row which doesn't contain text

In [None]:
final_data = data.drop(data[data["topic"]==""].index)

# 3. Get train and test data

#### 3.1 Get X and y

In [70]:
X = data["clean_text"]
y = data[['cat_entertainment', 'cat_other','cat_pandemic','cat_politics','cat_sports','cat_tech','cat_terrorism','cat_weather']]

#### 3.2 train test split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# 4. Embedding the text

In [73]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 500

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [5]:
embeddings_dictionary = dict()

glove_file = open('/content/drive/My Drive/Colab Datasets/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

'wget' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


# 5. MODELING

In [6]:
def create_model(maxlen, vocab_size, embedding_matrix):
    deep_inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
    LSTM_Layer_1 = LSTM(128)(embedding_layer)
    dense_layer_1 = Dense(6, activation='sigmoid')(LSTM_Layer_1)
    
    model = Model(inputs=deep_inputs, outputs=dense_layer_1)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

SyntaxError: unexpected EOF while parsing (<ipython-input-6-3d8654a2041a>, line 1)

In [None]:
model = create_model()

In [None]:
history = model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

# 6. Evaluate

In [None]:
results = model.evaluate(X_train, y_train, verbose=1)
print('Train loss: {} - Train accuracy (MAE): {}'.format(results[0], results[1]))