#### Ze względu na znaczące trudności w doprowadzeniu do końca oraz otrzymania jakkkolwiek wartościowych wyników w poprzedniej wersji projektu (rozpoznawanie mowy - temat numer 5 z PolEval 2019) postanowiłem zmienić temat na analizę prześladowań w internecie we wpisach na portalu tweeter (temat numer 6 PolEval 2019) ze względu na to, że już mam doświadczenie w podobnych zadaniach dzięki czemu jestem w stanie bardziej świadomie przeprowadzić proces oraz prawidłowo zinterpretować wyniki

In [1]:
import wget
import os
import zipfile
import string
import spacy
import numpy as np
from stop_words import get_stop_words
import re
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import TextVectorization, Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score
import keras


zip_file_name = 'task_6-1.zip'
tweets_contents_file = 'training_set_clean_only_text.txt'
tweets_labels_file = 'training_set_clean_only_tags.txt'
url = 'https://raw.githubusercontent.com/PatrycyD/INL_2/master/task_6-1.zip'
if not os.path.isfile(zip_file_name):
    wget.download(url) 

def extract_from_zip(zip_file, content_to_extract):
    with zipfile.ZipFile(zip_file, encoding='utf-8') as z:
        with open(content_to_extract, 'wb', encoding='utf-8') as f:
            f.write(z.read(content_to_extract))
            print('Extracted', content_to_extract)
            f.close()
        z.close()
        
def load_to_variable(file_to_load, data_type):
    if data_type == 'np.array':
        contents = np.array([]).reshape(1, -1)
    else:
        contents = []
    with open(file_to_load, 'r', encoding='utf-8') as file:
        for row in file:
            if data_type == 'np.array':
                contents = np.append(contents, row)
            else:
                contents.append(row)
    file.close()
    return contents


if not os.path.isfile(tweets_labels_file):
    extract_from_zip(zip_file_name, tweets_labels_file)

if not os.path.isfile(tweets_contents_file):
    extract_from_zip(zip_file_name, tweets_contents_file)
    
labels = load_to_variable(tweets_labels_file, 'np.array')
labels = labels.astype(np.float32)
tweets = load_to_variable(tweets_contents_file, 'list')
# tweets = tweets[:10]

In [2]:
stop_words = get_stop_words('polish')
def remove_stop_words(text):
    return ''.join([word for word in text if word not in stop_words])

def clean_URLs(text):
    return re.sub(r'http\S+', '', text)

def remove_nicknames(text):
    return text.replace('@anonymized_account', '').strip().strip('\n')

emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
def remove_emojis(text):
    return emoji_pattern.sub(r'', text)

def remove_non_ascii_chars(text):
    return re.sub(r'[^\x00-\x7f]',r'', text) 

punctuations_list = string.punctuation
def clean_punctuation(text):
    translator = str.maketrans('', '', punctuations_list) # jest to mapowanie i zamiana, dwa pierwsze argumenty to dwa stringi, gdzie znaki w pierwszym są zamieniane na znaki w drugim stringu, zgodnie z indeksem, Trzeci argument to znaki, które są mapowane do None => zostaną po prostu usunięte
    return text.translate(translator)

def clean_repeating_chars(text):
    return re.sub(r'(.)1+', r'1', text)

def clean_numbers(text):
    return re.sub('[0-9]+', '', text)

# !pip install --upgrade spacy
# !python -m spacy download pl_core_news_sm
nlp = spacy.load('pl_core_news_sm')

def lemmatize(text):
    doc = nlp(text)
    lemmatized_sentence = ''
    for token in doc:
        lemmatized_sentence = f'{lemmatized_sentence} {token.lemma_}'
        
    return lemmatized_sentence.strip()

print('Removing stop words')
tweets = [remove_stop_words(tweet) for tweet in tweets]
print('Cleaning URLs')
tweets = [clean_URLs(tweet) for tweet in tweets]
print('Removing nicknames')
tweets = [remove_nicknames(tweet) for tweet in tweets]
print('Removing emojis')
tweets = [remove_emojis(tweet) for tweet in tweets]
print('Removing non ascii characters')
tweets = [remove_non_ascii_chars(tweet) for tweet in tweets]
print('Removing punctuation')
tweets = [clean_punctuation(tweet) for tweet in tweets] 
print('Removing repeating characters')
tweets = [clean_repeating_chars(tweet) for tweet in tweets]
print('Removing numbers')
tweets = [clean_numbers(tweet) for tweet in tweets]
print('Lemmatizing')
tweets = [lemmatize(tweet) for tweet in tweets]

Removing stop words
Cleaning URLs
Removing nicknames
Removing emojis
Removing non ascii characters
Removing punctuation
Removing repeating characters
Removing numbers
Lemmatizing


### Jeżeli tutaj pojawi się błąd spacy związany z pobieraniem paczki polskiej proszę o zresetowanie środowiska i uruchomienie wszystkiego od nowa - paczka może pobrać się prawidłowo po zaktualizowaniu spacy i zresetowaniu środowiska

In [3]:
embedding_file = 'pl-embeddings-cbow.txt'

if not os.path.isfile(embedding_file):
    url = 'http://publications.ics.p.lodz.pl/2016/word_embeddings/pl-embeddings-cbow.txt'
    wget.download(url)
    
embeddings_index = {}
with open(embedding_file, encoding='utf-8') as file:
    for line in file:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs
        
embeddings_index.pop('933198') #na początku pliku jest notatka o liczbie wektorów i liczbie punktów w wektorach

print("Found %s word vectors." % len(embeddings_index))

Found 933198 word vectors.


In [4]:
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorizer.adapt(tweets)

# output = vectorizer([['kibic legia mioduski poznań wygrana przegrana']])
# print(output.numpy()[0, :6])

voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

test = ['lipa', 'mecz', 'wyjazd', 'sezon']
[word_index[w] for w in test]

[15057, 43, 1043, 139]

In [5]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, idx in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[idx] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 9936 words (8480 misses)


In [8]:
X_train = vectorizer(np.array([[s] for s in tweets])).numpy()
y_train = np.array(labels)

rf = RandomForestClassifier(n_estimators=15, max_depth=5)
knn = KNeighborsClassifier(n_neighbors=20)
svc = SVC()

rf.fit(X_train, y_train)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

# rf_preds = rf.predict(X_test)
# knn_preds = knn.predict(X_test)
# svc_preds = svc.predict(X_test)

rf_train_preds = rf.predict(X_train)
knn_train_preds = knn.predict(X_train)
svc_train_preds = svc.predict(X_train)

In [17]:
test_file = 'task6_test.zip'

if not os.path.isfile(test_file):
    url = 'http://publications.ics.p.lodz.pl/2016/word_embeddings/pl-embeddings-cbow.txt'
    wget.download(url)

def extract_test_files(zip_file, file_name_to_extract):
    with zipfile.ZipFile(zip_file) as z:
        with open(file_name_to_extract, 'wb') as f:
            f.write(z.read('Task6\task 01\test_set_clean_only_tags.txt'))
            print('Extracted', file_name_to_extract)
            f.close()
        z.close()

test_labels_file = 'test_set_clean_only_tags.txt'
test_text_file = 'test_set_clean_only_text.txt'
extract_test_files(test_file, test_labels_file)

KeyError: "There is no item named 'Task6\\task 01\\test_set_clean_only_tags.txt' in the archive"

In [7]:
print('Random Forest test predictions')
print(confusion_matrix(y_test, rf_preds))
print('\n\nK Nearest Neighbors test predictions')
print(confusion_matrix(y_test, knn_preds))
print('\n\nSupport Vector Machines test predictions')
print(confusion_matrix(y_test, svc_preds))

Random Forest test predictions


NameError: name 'y_test' is not defined

In [None]:
print('Random Forest train predictions')
print(confusion_matrix(y_train, rf_train_preds))
print('\n\nK Nearest Neighbors train predictions')
print(confusion_matrix(y_train, knn_train_preds))
print('\n\nSupport Vector Machines train predictions')
print(confusion_matrix(y_train, svc_train_preds))

In [None]:
print(f'Ranom Forest test f1: {round(f1_score(y_test, rf_preds)2)}')
print(f'Ranom Forest train f1: {round(f1_score(y_train, rf_train_preds), 2)}')

print(f'\nK Nearest Neighbors test f1: {round(f1_score(y_test, knn_preds), 2)}')
print(f'K Nearest Neighbors train f1: {round(f1_score(y_train, knn_train_preds), 2)}')

print(f'\nSupport Vector Machines test f1: {round(f1_score(y_test, svc_preds), 2)}')
print(f'Support Vector Machines train f1: {round(f1_score(y_train, svc_train_preds), 2)}')

In [None]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = keras.layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = keras.layers.MaxPooling1D(5)(x)
x = keras.layers.Conv1D(128, 5, activation="relu")(x)
x = keras.layers.MaxPooling1D(5)(x)
x = keras.layers.Conv1D(128, 5, activation="relu")(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.5)(x)
preds = keras.layers.Dense(1, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

In [None]:
model.compile(
    loss='binary_crossentropy', optimizer='adam', metrics=['acc']
)
model.fit(X_train, y_train, batch_size=128, epochs=20)

In [39]:
embedding_size=32

model = Sequential(
                [
                Embedding(vocabulary_size, embedding_size, input_length=max_words)
                LSTM(150)
                LSTM(100)
                Dense(1, activation='sigmoid')
                print(model.summary())
                ]
)

model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

model.fit(X_train, y_train, epochs=20)

SyntaxError: invalid syntax (<ipython-input-39-42c4d9658a62>, line 6)

In [40]:
X_train.shape

(10041, 200)

In [14]:
len(tweets)

10041

In [16]:
embedding_matrix.shape

(18418, 100)

In [17]:
num_tokens

18418

In [44]:
from sklearn.metrics import confusion_matrix, f1_score