In [24]:
# Python ‚â•3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ‚â•0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [25]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "D:/ML/"
print(DOWNLOAD_ROOT)
DATASET_PATH = os.path.join("datasets", "spam")
DIR_PATH = {"Africa":os.path.join(DATASET_PATH, "Africa"),
            "China":os.path.join(DATASET_PATH, "China"),
            "MiddleEAST":os.path.join(DATASET_PATH, "MiddleEAST"),
           "LatAmerica":os.path.join(DATASET_PATH, "LatAmerica"),
           "IranPakistanAfganistan":os.path.join(DATASET_PATH, "IranPakistanAfganistan"),}
filenames = {}
for items in DIR_PATH.keys():
    print(items)
    filenames[items] = [name for name in sorted(os.listdir(DIR_PATH[items])) if len(name) > 20] 

D:/ML/
Africa
China
MiddleEAST
LatAmerica
IranPakistanAfganistan


In [26]:
from charset_normalizer import from_path
import string

def remove_chars_from_text(text, chars):
    # return "".join([ch for ch in text if ch not in chars])
    content = ''
    for ch in text:
        if ch not in chars:
            content = content + ''.join(ch)
        else:
            content = content + ''.join(' ')
    return content

import re

def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)
spec_chars = string.punctuation + '\r' + '\n\xa0¬´¬ª\t‚Äî‚Ä¶' 

x_temp = []
y_temp = []
for item in filenames.keys():
    post_temp=[]
    for names in filenames[item]:
        content = str(from_path(DIR_PATH[item]+"\\"+names).best()).lower()  
        content = remove_emojis(content)
        content = re.sub(r'(\\u[0-9A-Fa-f]+)', lambda matchobj: chr(int(matchobj.group(0)[2:], 16)), content)
        content = remove_chars_from_text(content, spec_chars)
        content = remove_chars_from_text(content, string.digits)
        for i in range(1, 5):
            content = content.replace('  ', ' ')
        x_temp.append(content)
        y_temp.append(item)

In [27]:
import numpy as np
from sklearn.model_selection import train_test_split


X = np.array(x_temp, dtype=object)
y = np.array(y_temp)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
print(X_train[:7])
print(y_train[:7])

[' –∫–æ–Ω–≥–æ —Å –ø–æ –∏—é–ª—è –ø—Ä–∏–º–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –ª–∏–≤–∏–π—Å–∫–∏—Ö –ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–∏—Ö —Å—Ç–æ—Ä–æ–Ω –Ω–∞—Ü–µ–ª–µ–Ω–Ω—ã–µ –Ω–∞ –∑–∞–≤–µ—Ä—à–µ–Ω–∏–µ –ø—É—Ç–∏ –ø–æ –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–º—É –ø—Ä–∏–º–∏—Ä–µ–Ω–∏—é –≤ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–∞—Ö –∫–æ—Ç–æ—Ä—ã–µ –ø—Ä–æ–π–¥—É—Ç –≤ —Å—Ç–æ–ª–∏—Ü–µ –∫–æ–Ω–≥–æ –±—Ä–∞–∑–∑–∞–≤–∏–ª–µ –ø—Ä–∏–º—É—Ç —É—á–∞—Å—Ç–∏–µ –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ–ª–∏ –ª–∏–≤–∏–π—Å–∫–æ–≥–æ –≥–ª–∞–≤–Ω–æ–∫–æ–º–∞–Ω–¥–æ–≤–∞–Ω–∏—è –ø–∞–ª–∞—Ç—ã –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ–ª–µ–π –≤–µ—Ä—Ö–æ–≤–Ω–æ–≥–æ –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ —Å–æ–≤–µ—Ç–∞ –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—Å–∫–æ–≥–æ —Å–æ–≤–µ—Ç–∞ –ø–Ω–µ –∞ —Ç–∞–∫–∂–µ –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ–ª–∏ —Å–µ–π—Ñ–∞ –∏—Å–ª–∞–º–∞ –∫–∞–¥–¥–∞—Ñ–∏ –∏ –≥—Ä—É–ø–ø—ã –∏–∑ –≥–µ–¥–∞–º–µ—Å–∞ –º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –ø–æ–π–¥—É—Ç –ø–æ–¥ —ç–≥–∏–¥–æ–π –∞—Ñ—Ä–∏–∫–∞–Ω—Å–∫–æ–≥–æ —Å–æ—é–∑–∞ –ø–ª–∞–Ω–∏—Ä—É–µ—Ç—Å—è –∏–∑—É—á–µ–Ω–∏–µ –∞–Ω–∞–ª–æ–≥–∏—á–Ω–æ–≥–æ –æ–ø—ã—Ç–∞ —Ç–µ—Ö —Å—Ç—Ä–∞–Ω –≤ –∫–æ—Ç–æ—Ä—ã—Ö –ø—Ä–æ—Ü–µ—Å—Å –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–≥–æ –ø—Ä–∏–º–∏—Ä–µ–Ω–∏—è –æ–∫–∞–

In [29]:
len(X_train)

5524

In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
import urlextract # may require an Internet connection to download root domain names
import re
from collections import Counter
import nltk
from tokenizer_exceptions import normalizer_exc_rus
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

stemmer = nltk.PorterStemmer()
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True, remove_stopwords=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
        self.remove_stopwords = remove_stopwords
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email or ""
            text = normalizer_exc_rus(text).lower()
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.remove_stopwords:
                filtered_words = [token.text for token in text if not russian_stopwords]
 
                # Join the filtered words to form a clean text
                text = ' '.join(filtered_words)
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [31]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
except ImportError:
    url_extractor = None

In [32]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=4000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [33]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=5000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[CV] END ................................ score: (test=0.885) total time=   2.0s
[CV] END ................................ score: (test=0.882) total time=   2.2s
[CV] END ................................ score: (test=0.873) total time=   2.2s


0.8803394078223471

In [35]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", max_iter=6000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred, average='micro')))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred, average='micro')))

Precision: 90.30%
Recall: 90.30%


In [36]:
new_text = ['–ù–∏–≥–µ—Ä –ø–æ—Å–ª–µ–¥–æ–≤–∞–ª –ø—Ä–∏–º–µ—Ä—É –ú–∞–ª–∏, —Ä–∞–∑–æ—Ä–≤–∞–≤ —Å–≤—è–∑–∏ —Å –£–∫—Ä–∞–∏–Ω–æ–π –ø–æ—Å–ª–µ —Ç–æ–≥–æ, –∫–∞–∫ –ø—Ä–µ—Å—Å-—Å–µ–∫—Ä–µ—Ç–∞—Ä—å —Ä–∞–∑–≤–µ–¥—ã–≤–∞—Ç–µ–ª—å–Ω–æ–≥–æ –∞–≥–µ–Ω—Ç—Å—Ç–≤–∞ –ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–∞ –æ–±–æ—Ä–æ–Ω—ã –£–∫—Ä–∞–∏–Ω—ã –ø—Ä–∏–∑–Ω–∞–ª, —á—Ç–æ –£–∫—Ä–∞–∏–Ω–∞ –æ–∫–∞–∑—ã–≤–∞–ª–∞ –ø–æ–¥–¥–µ—Ä–∂–∫—É –ø–æ–≤—Å—Ç–∞–Ω—Ü–∞–º, –∫–æ—Ç–æ—Ä—ã–µ —É–±–∏–ª–∏ –º–∞–ª–∏–π—Å–∫–∏—Ö –≤–æ–æ—Ä—É–∂–µ–Ω–Ω—ã—Ö —Å–∏–ª –∏ —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö –∞–≥–µ–Ω—Ç–æ–≤ 25-27 –∏—é–ª—è –≤ –¢–∏–Ω–∑–∞—É–∞—Ç–µ–Ω–µ, –Ω–∞ —Å–µ–≤–µ—Ä–µ –ú–∞–ª–∏, –Ω–µ–¥–∞–ª–µ–∫–æ –æ—Ç –≥—Ä–∞–Ω–∏—Ü—ã —Å –ê–ª–∂–∏—Ä–æ–º.–ü–æ—Å–æ–ª –£–∫—Ä–∞–∏–Ω—ã –≤ –°–µ–Ω–µ–≥–∞–ª–µ —Ç–∞–∫–∂–µ –≤—ã—Ä–∞–∑–∏–ª –±–µ–∑–æ–≥–æ–≤–æ—Ä–æ—á–Ω—É—é –ø–æ–¥–¥–µ—Ä–∂–∫—É –º–∞–ª–∏–π—Å–∫–∏–º –ø–æ–≤—Å—Ç–∞–Ω—Ü–∞–º, —á—Ç–æ –ø—Ä–∏–≤–µ–ª–æ –∫ –≤—ã–∑–æ–≤—É –ø–æ—Å–ª–∞ –≤ –ú–ò–î –°–µ–Ω–µ–≥–∞–ª–∞.',
            '–ï—â–µ –Ω–æ–≤–æ—Å—Ç–∏ –∞—Ñ–≥–∞–Ω—Å–∫–æ–π –ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω–æ—Å—Ç–∏. –í –ö–∞–Ω–¥–∞–≥–∞—Ä–µ —Å–ø—É—Å—Ç—è 18 –ª–µ—Ç –≤–æ–∑–æ–±–Ω–æ–≤–∏–ª–∞ —Ä–∞–±–æ—Ç—É —Ç–µ–∫—Å—Ç–∏–ª—å–Ω–∞—è —Ñ–∞–±—Ä–∏–∫–∞. –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–µ –ª–∏—Ü–∞ –≥–æ–≤–æ—Ä—è—Ç, —á—Ç–æ –æ–Ω–∏ –æ—Ç—Ä–µ–º–æ–Ω—Ç–∏—Ä–æ–≤–∞–ª–∏ –æ–±–æ—Ä—É–¥–æ–≤–∞–Ω–∏–µ –Ω–∞ —Ñ–∞–±—Ä–∏–∫–µ –∑–∞ —à–µ—Å—Ç—å –º–µ—Å—è—Ü–µ–≤, —Å–æ–æ–±—â–∞—é—Ç –∞—Ñ–≥–∞–Ω—Å–∫–æ–µ –°–ú–ò. –ü–æ —Å–ª–æ–≤–∞–º —Ä–∞–±–æ—Ç–Ω–∏–∫–æ–≤ —Ñ–∞–±—Ä–∏–∫–∏, —Å –≤–æ–∑–æ–±–Ω–æ–≤–ª–µ–Ω–∏–µ–º —Ä–∞–±–æ—Ç—ã —Ñ–∞–±—Ä–∏–∫–∏ —Å–æ–∑–¥–∞–Ω—ã —Ä–∞–±–æ—á–∏–µ –º–µ—Å—Ç–∞. 69-–ª–µ—Ç–Ω–∏–π –ú–æ—Ö–∞–º–º–∞–¥, –ø—Ä–æ—Ä–∞–±–æ—Ç–∞–≤—à–∏–π –Ω–∞ —Ç–µ–∫—Å—Ç–∏–ª—å–Ω–æ–π —Ñ–∞–±—Ä–∏–∫–µ –≤ –ö–∞–Ω–¥–∞–≥–∞—Ä–µ –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ–µ –≤—Ä–µ–º—è, —Ä–∞–¥ –≤–µ—Ä–Ω—É—Ç—å—Å—è –Ω–∞ —Ä–∞–±–æ—Ç—É.',
           '–ù–∞–∫–æ–ø–ª–µ–Ω–Ω–æ–µ –Ω–µ–ø–æ–Ω–∏–º–∞–Ω–∏–µ –º–µ–∂–¥—É –ö–∏—Ç–∞–µ–º –∏ –°–®–ê –Ω–µ –º–æ–∂–µ—Ç –±—ã—Ç—å —Ä–µ—à–µ–Ω–æ –ª–∏—à—å –æ–¥–Ω–∏–º –ø–æ–¥–æ–±–Ω—ã–º –∏–∑—è—â–Ω—ã–º –∂–µ—Å—Ç–æ–º. –°—Ç—Ä–∞–Ω—ã —Ä–∞—Å—Ö–æ–¥—è—Ç—Å—è –ø–æ —à–∏—Ä–æ–∫–æ–º—É –∫—Ä—É–≥—É –≤–æ–ø—Ä–æ—Å–æ–≤, —Å–∞–Ω–∫—Ü–∏–æ–Ω–Ω—ã–π –º–µ—Ö–∞–Ω–∏–∑–º –ø—Ä–æ—Ç–∏–≤ –ö–∏—Ç–∞—è –Ω–µ –æ—Å–ª–∞–±–ª–µ–Ω. –ü–æ—ç—Ç–æ–º—É –æ—Ç–ø—Ä–∞–≤–∫–∞ –ø–∞–Ω–¥ ‚Äì —ç—Ç–æ —Å–∫–æ—Ä–µ–µ –¥–µ–º–æ–Ω—Å—Ç—Ä–∞—Ü–∏—è –º–∏—Ä–æ–ª—é–±–∏–≤–æ–π –ø–æ–∑–∏—Ü–∏–∏ –ö–∏—Ç–∞—è –≤ –ø—Ä–æ—Ç–∏–≤–æ–≤–µ—Å –°–®–ê.',
            '–í –ê–ª–∂–∏—Ä–µ, –∫–∞–∫ –ø—Ä–∞–≤–∏–ª–æ, –Ω–∞ –ø–æ—Ö–æ—Ä–æ–Ω–∞—Ö –º—É–∂—á–∏–Ω—ã —Å–∏–¥—è—Ç –æ–∫–æ–ª–æ –¥–æ–º–∞, —Ö–æ—Ä–æ—à–æ, –µ—Å–ª–∏ –µ—Å—Ç—å —Å–∞–¥ –∏–ª–∏ —Ç–µ—Ä—Ä–∞—Å–∞, –∞ –∂–µ–Ω—â–∏–Ω—ã –≤ –¥–æ–º–µ, –Ω–∞–¥–µ—Ç—å –ø–ª–∞—Ç–æ–∫ –∂–µ–Ω—â–∏–Ω–∞ –¥–æ–ª–∂–Ω–∞ –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ, –¥–∞–∂–µ –µ—Å–ª–∏ –≤ –æ–±—ã—á–Ω–æ–π –∂–∏–∑–Ω–∏ –æ–Ω–∞ –Ω–µ –ø–æ–∫—Ä—ã–≤–∞–µ—Ç –≥–æ–ª–æ–≤—É. –ï—Å–ª–∏ –ø–æ–∫–æ–π–Ω—ã–π –∂–∏–ª –≤ –∫–≤–∞—Ä—Ç–∏—Ä–µ, —Ç–æ –ø—Ä—è–º–æ –≤–æ –¥–≤–æ—Ä–µ —Å—Ç–∞–≤—è—Ç —Å—Ç—É–ª—å—è –∏ —à–∞—Ç—Ä—ã —Å–æ —Å—Ç–æ–ª–∞–º–∏ –¥–ª—è –º—É–∂—á–∏–Ω.–ì–æ—Ç–æ–≤—è—Ç –ª–∏–±–æ –Ω–∞–Ω—è—Ç—ã–µ –∫—É—Ö–∞—Ä–∫–∏, –ª–∏–±–æ —Ä–æ–¥—Å—Ç–≤–µ–Ω–Ω–∏—Ü—ã –ø–æ–∫–æ–π–Ω–æ–≥–æ. –ü–æ–∫–∞ —Å–æ—Å—Ç–∞–≤–ª—è–ª–∞ –ø–æ—Å—Ç –æ—Ç –ø–æ–¥—Ä—É–≥–∏ —É—Å–ª—ã—à–∞–ª–∞,—á—Ç–æ –≤ –∏—Ö —Å–µ–º—å–µ —á–∞—Å—Ç–æ –µ–¥—É –ø—Ä–∏–Ω–æ—Å—è—Ç —Ç–µ, –∫—Ç–æ –ø—Ä–∏—Ö–æ–¥–∏—Ç –≤ –¥–æ–º, –∞ –≥–æ—Ç–æ–≤–∏—Ç—å –¥–æ–ª–∂–Ω—ã –Ω–µ–≤–µ—Å—Ç–∫–∏, –∞ –Ω–µ –¥–æ—á–µ—Ä–∏ —É–º–µ—Ä—à–µ–≥–æ. –ú–æ–π –º—É–∂ —Å–∫–∞–∑–∞–ª, —á—Ç–æ —ç—Ç–æ —Å–æ–≤–µ—Ä—à–µ–Ω–Ω–æ –Ω–µ–æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ, –Ω–∞ –ø–æ—Ö–æ—Ä–æ–Ω–∞—Ö –µ–≥–æ –±–∞–±—É—à–∫–∏ –≥–æ—Ç–æ–≤–∏–ª–∏ —Ç–æ–ª—å–∫–æ –µ—ë –¥–æ—á–µ—Ä–∏, —ç—Ç–æ –±—ã–ª–æ –∏—Ö –∂–µ–ª–∞–Ω–∏–µ –∏ –Ω–∏–∫–∞–∫–∏—Ö –æ—Å–æ–±—ã—Ö –ø—Ä–∞–≤–∏–ª –Ω–∞ —Å—á—ë—Ç —ç—Ç–æ–≥–æ –Ω–µ—Ç. –û–±—è–∑–∞—Ç–µ–ª—å–Ω–æ–≥–æ –±–ª—é–¥–∞ –Ω–∞ –ø–æ–º–∏–Ω–∫–∏ –≤ –ê–ª–∂–∏—Ä–µ –Ω–µ—Ç. –ù–∞—Ä–æ–¥ –ø—Ä–∏—Ö–æ–¥–∏—Ç –ø–æ–º—è–Ω—É—Ç—å–æ—Ç —Ç—Ä–µ—Ö –¥–Ω–µ–π –¥–æ –Ω–µ–¥–µ–ª–∏, –µ—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –±—ã–ª–∞ –±–æ–ª—å—à–∞—è —Å–µ–º—å—è –∏ –º–Ω–æ–≥–æ –∑–Ω–∞–∫–æ–º—ã—Ö!–ö–æ—Ä–º—è—Ç —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω—ã–º–∏ –±–ª—é–¥–∞–º–∏: —Å—É–ø-—à–æ—Ä–±–∞, –∫—É—Å-–∫—É—Å/–±–µ—Ä–∫—É–∫–µ—Å/—Ç–ª–∏—Ç–ª–∏ —Å –º—è—Å–æ–º –∏ —Ç.–¥.–ú–æ–≥–∏–ª—É –ø–æ—Å–µ—â–∞—é—Ç –∫–∞–∂–¥–æ–µ —É—Ç—Ä–æ, –≤ —Ç–µ—á–µ–Ω–∏–∏ —Ç—Ä–µ—Ö –¥–Ω–µ–π. –ß–µ—Ä–µ–∑ 40 –¥–Ω–µ–π —É—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞—é—Ç –Ω–µ–±–æ–ª—å—à–æ–µ –Ω–∞–¥–≥—Ä–æ–±–∏–µ, –Ω–∏–∫–∞–∫–∏—Ö –ø–∞–º—è—Ç–Ω–∏–∫–æ–≤,–≤—ã—á—É—Ä–Ω—ã—Ö —ç–ª–µ–º–µ–Ω—Ç–æ–≤. –ù–∞ –Ω–∞–¥–≥—Ä–æ–±–∏–∏ –∏–º—è, —Ñ–∞–º–∏–ª–∏—è, –¥–∞—Ç—ã —Ä–æ–∂–¥–µ–Ω–∏—è –∏ —Å–º–µ—Ä—Ç–∏, —Ñ–æ—Ç–æ–≥—Ä–∞—Ñ–∏–π –Ω–µ—Ç. –í—Ä–æ–¥–µ –±—ã –Ω–∞–ø–∏—Å–∞–ª–∞ –≤—Å—ë, —á—Ç–æ –∑–Ω–∞–ª–∞. –ó–∞–¥–∞–≤–∞–π—Ç–µ –≤–æ–ø—Ä–æ—Å—ã –≤ –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏—è—Ö, –µ—Å–ª–∏ —Ç–∞–∫–æ–≤—ã–µ –∏–º–µ—é—Ç—Å—è.',
            '–°–µ–≥–æ–¥–Ω—è –≤ –í–µ–Ω–µ—Å—É—ç–ª–µ –ø—Ä–æ—Ö–æ–¥—è—Ç –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—Å–∫–∏–µ –≤—ã–±–æ—Ä—ã. –ú–æ–π –∫–æ–ª–ª–µ–≥–∞ –î–º–∏—Ç—Ä–∏–π –ú–æ—Ä–æ–∑–æ–≤ —Ä–∞—Å—Å—É–∂–¥–∞–µ—Ç, –∫–∞–∫ –∏—Ö –∏—Ç–æ–≥–∏ –º–æ–≥—É—Ç –ø–æ–≤–ª–∏—è—Ç—å –Ω–∞ –¥–∞–ª—å–Ω–µ–π—à–∏–π –ø—É—Ç—å —Å—Ç—Ä–∞–Ω—ã. –í–æ—Ç —Å–∞–º–æ–µ –≤–∞–∂–Ω–æ–µ –∏–∑ –µ–≥–æ —Å—Ç–∞—Ç—å–∏.üìç–í –≤—ã–±–æ—Ä–∞—Ö –æ–¥–∏–Ω —Ç—É—Ä, —É—á–∞—Å—Ç–≤—É—é—Ç –¥–µ—Å—è—Ç—å –∫–∞–Ω–¥–∏–¥–∞—Ç–æ–≤, –æ–¥–Ω–∞–∫–æ —Ç–æ–ª—å–∫–æ –¥–≤–æ–µ –∏–º–µ—é—Ç —Ä–µ–∞–ª—å–Ω—ã–µ —à–∞–Ω—Å—ã –Ω–∞ –ø–æ–±–µ–¥—É: –¥–µ–π—Å—Ç–≤—É—é—â–∏–π –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç –ù–∏–∫–æ–ª–∞—Å –ú–∞–¥—É—Ä–æ –∏ –≠–¥–º—É–Ω–¥–æ –ì–æ–Ω—Å–∞–ª–µ—Å –£—Ä—Ä—É—Ç–∏—è, –∫–∞–Ω–¥–∏–¥–∞—Ç –æ—Ç –ï–¥–∏–Ω–æ–π –¥–µ–º–æ–∫—Ä–∞—Ç–∏—á–µ—Å–∫–æ–π –ø–ª–∞—Ç—Ñ–æ—Ä–º—ã, –æ–±—ä–µ–¥–∏–Ω—è—é—â–µ–π –Ω–∞–∏–±–æ–ª–µ–µ –∑–Ω–∞—á–∏–º—ã–µ –æ–ø–ø–æ–∑–∏—Ü–∏–æ–Ω–Ω—ã–µ –ø–∞—Ä—Ç–∏–∏.'
           ]
accuracy = log_clf.predict(preprocess_pipeline.transform(new_text))

print (accuracy)

–∏ —Ç.–¥. - –∏ —Ç–∞–∫ –¥–∞–ª–µ–µ
–ª—é–¥–∞–º–∏: —Å—É–ø-—à–æ—Ä–±–∞, –∫—É—Å-–∫—É—Å/–±–µ—Ä–∫—É–∫–µ—Å/—Ç–ª–∏—Ç–ª–∏ —Å –º—è—Å–æ–º –∏ —Ç.–¥.–ú–æ–≥–∏–ª—É –ø–æ—Å–µ—â–∞—é—Ç –∫–∞–∂–¥–æ–µ —É—Ç—Ä–æ, –≤ —Ç–µ—á–µ–Ω–∏–∏ —Ç—Ä–µ—Ö –¥–Ω–µ–π. –ß–µ—Ä–µ–∑ 40 –¥–Ω–µ–π —É—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞—é—Ç –Ω–µ–±–æ–ª—å—à–æ–µ –Ω–∞–¥–≥—Ä
['Africa' 'Africa' 'China' 'MiddleEAST' 'LatAmerica']
