Рубежный контроль №2
Студент группы ИУ5-21М
Маматкулов Уткурбек

In [None]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from sklearn.datasets import load_iris, load_boston
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
%matplotlib inline 
sns.set(style="ticks")

In [None]:
def accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Compute the accuracy separately for every class present in ``y_true``.

    For a class label ``c`` the score is the fraction of the samples whose
    true label is ``c`` that were also predicted as ``c``.

    Both inputs are converted to plain NumPy arrays and compared by
    position, so pandas objects carrying different indexes are never
    re-aligned by label.

    Args:
        y_true: true class labels (array-like).
        y_pred: predicted class labels, same length and order as ``y_true``.

    Returns:
        Dictionary whose keys are the distinct labels of ``y_true`` (in the
        sorted order produced by ``np.unique``) and whose values are the
        accuracy for that label. Empty input gives an empty dictionary.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` differ in length.
    """
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            'y_true and y_pred must have the same length, got {} and {}'.format(
                true_labels.shape[0], pred_labels.shape[0]))

    # One element-wise comparison for the whole sample; each class then only
    # needs a boolean mask over it instead of a filtered copy of the data.
    hits = true_labels == pred_labels
    return {
        label: float(hits[true_labels == label].mean())
        for label in np.unique(true_labels)
    }

def print_accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray):
    """
    Print the per-class accuracy as a two-column table.

    A header line is printed only when at least one class is present,
    followed by one "label <tab> accuracy" line per class.

    y_true - true class labels
    y_pred - predicted class labels
    """
    per_class = accuracy_score_for_classes(y_true, y_pred)
    if per_class:
        print('Метка \t Accuracy')
    for label, acc in per_class.items():
        print('{} \t {}'.format(label, acc))

In [None]:
# Load the message dataset from a CSV file located in the working directory.
data = pd.read_csv('spam_classif.csv')

In [None]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

(5572, 2)

In [None]:
# Replace the text labels of every categorical column with integer codes,
# using a fresh LabelEncoder per column.
category_columns = ['Category']
for col in category_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
# Display the frame after encoding.
data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [None]:
# Collect all messages into one list; the vocabulary for the models is built
# from the whole corpus (the data later split into train and test parts).
vocab_list = data['Message'].tolist()
# Preview the messages at positions 1..9 (the slice skips the first message).
vocab_list[1:10]

['Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobil

In [None]:
# Fit a bag-of-words vectorizer on the whole corpus and keep its
# term -> column-index mapping for later use.
vocabVect = CountVectorizer().fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
n_features = len(corpusVocab)
print('Количество сформированных признаков - {}'.format(n_features))

Количество сформированных признаков - 8709


In [None]:
# Print nine "term=column index" pairs from the vocabulary; the slice starts
# at 1, so the first key of the dictionary is skipped.
for i in list(corpusVocab)[1:10]:
    print('{}={}'.format(i, corpusVocab[i]))

until=8080
jurong=4370
point=5954
crazy=2334
available=1313
only=5567
in=4110
bugis=1763
great=3651


# Векторизация текста на основе модели "мешка слов"

### Использование класса CountVectorizer

#### Подсчитывает количество слов словаря, входящих в данный текст.

In [None]:
# Encode every message as a row of term counts over the fitted vocabulary.
test_features = vocabVect.transform(vocab_list)
# Display the summary of the resulting matrix.
test_features

<5572x8709 sparse matrix of type '<class 'numpy.int64'>'
	with 74098 stored elements in Compressed Sparse Row format>

In [None]:
# Dense view of the count matrix, for illustration only.
# NOTE: this materialises every cell (5572 x 8709 int64 values, ~390 MB).
test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# Length of row 0, i.e. the number of columns (one per vocabulary term).
# Read from the matrix shape instead of converting the whole sparse matrix
# to a dense one just to measure a single row.
test_features.shape[1]

8709

In [None]:
# Non-zero values of row 0. Only that single row is converted to a dense
# array; the rest of the matrix stays sparse.
[i for i in test_features[0].toarray().ravel() if i > 0]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
# Feature names ordered by column index, taken from the fitted vocabulary.
# `get_feature_names()` is deprecated since scikit-learn 1.0 and removed in
# 1.2; sorting the vocabulary by its indices gives the same list on every
# version.
sorted(vocabVect.vocabulary_, key=vocabVect.vocabulary_.get)[1000:1020]

['aid',
 'aids',
 'aig',
 'aight',
 'ain',
 'aint',
 'air',
 'air1',
 'airport',
 'airtel',
 'aiya',
 'aiyah',
 'aiyar',
 'aiyo',
 'ajith',
 'ak',
 'aka',
 'akon',
 'al',
 'alaikkum']

### Использование N-грамм

In [None]:
# Count vectorizer whose features are word n-grams of length 1 to 3.
ncv = CountVectorizer(ngram_range=(1,3))
# Fit on the corpus and encode it in one step.
ngram_features = ncv.fit_transform(vocab_list)
# Display the summary of the resulting matrix.
ngram_features

<5572x104934 sparse matrix of type '<class 'numpy.int64'>'
	with 217339 stored elements in Compressed Sparse Row format>

In [None]:
# Number of n-gram features. The fitted vocabulary has one entry per feature,
# so its size is used instead of `get_feature_names()`, which is deprecated
# since scikit-learn 1.0 and removed in 1.2.
len(ncv.vocabulary_)

104934

In [None]:
# The features are now n-grams. Names are listed in column-index order from
# the fitted vocabulary; `get_feature_names()` is deprecated since
# scikit-learn 1.0 and removed in 1.2.
sorted(ncv.vocabulary_, key=ncv.vocabulary_.get)[10000:10020]

['at mine just',
 'at moment',
 'at moment evone',
 'at moment yeah',
 'at mp3',
 'at mp3 player',
 'at mrt',
 'at mrt station',
 'at mu',
 'at mu and',
 'at mu in',
 'at mu you',
 'at my',
 'at my great',
 'at my house',
 'at my moms',
 'at my mum',
 'at my parents',
 'at my phone',
 'at my place']

### Использование класса TfidfVectorizer

In [None]:
# TF-IDF vectorizer over word n-grams of length 1 to 3.
tfidfv = TfidfVectorizer(ngram_range=(1,3))
# Fit on the corpus and encode it in one step.
tfidf_ngram_features = tfidfv.fit_transform(vocab_list)
# Display the summary of the resulting matrix.
tfidf_ngram_features

<5572x104934 sparse matrix of type '<class 'numpy.float64'>'
	with 217339 stored elements in Compressed Sparse Row format>

In [None]:
# Dense view of the TF-IDF matrix, for illustration only.
# NOTE: this materialises every cell (5572 x 104934 float64 values, ~4.7 GB).
tfidf_ngram_features.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Length of row 0, i.e. the number of columns (one per n-gram feature).
# Read from the matrix shape: converting the whole sparse matrix to a dense
# one would allocate several gigabytes just to measure a single row.
tfidf_ngram_features.shape[1]

104934

In [None]:
# Non-zero values of row 0. Only that single row is converted to a dense
# array; the rest of the matrix stays sparse.
[i for i in tfidf_ngram_features[0].toarray().ravel() if i > 0]

[0.1537647471633528,
 0.1537647471633528,
 0.11501101620597226,
 0.1537647471633528,
 0.1537647471633528,
 0.14678507188449502,
 0.1537647471633528,
 0.1537647471633528,
 0.12990107976500484,
 0.1537647471633528,
 0.1537647471633528,
 0.12990107976500484,
 0.1537647471633528,
 0.1537647471633528,
 0.11908021199485914,
 0.1537647471633528,
 0.1537647471633528,
 0.06964712666694571,
 0.1537647471633528,
 0.1537647471633528,
 0.07208549828613442,
 0.1537647471633528,
 0.1537647471633528,
 0.08493973105393231,
 0.1537647471633528,
 0.1537647471633528,
 0.050435116339348454,
 0.1537647471633528,
 0.1537647471633528,
 0.1537647471633528,
 0.1537647471633528,
 0.1537647471633528,
 0.12990107976500484,
 0.1537647471633528,
 0.1537647471633528,
 0.07365148623113625,
 0.14678507188449502,
 0.1537647471633528,
 0.12026785509880393,
 0.1537647471633528,
 0.1537647471633528,
 0.07324643231209632,
 0.1537647471633528,
 0.1537647471633528,
 0.10833602139962993,
 0.1537647471633528,
 0.153764747163352

## Решение задачи классификации спама на основе модели "мешка слов"

С использованием кросс-валидации попробуем применить к корпусу текстов различные варианты векторизации и классификации.

In [None]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    """
    Cross-validate every vectorizer/classifier combination and print results.

    Each pair is wrapped in a Pipeline and scored with 3-fold
    cross-validation (accuracy) on the module-level ``data`` frame, using
    the 'Message' column as input and 'Category' as target. The mean score
    is printed together with the vectorizer and the classifier.

    vectorizers_list - vectorizers to try
    classifiers_list - classifiers to try
    """
    for vectorizer in vectorizers_list:
        for classifier in classifiers_list:
            steps = [("vectorizer", vectorizer), ("classifier", classifier)]
            fold_scores = cross_val_score(
                Pipeline(steps),
                data['Message'],
                data['Category'],
                scoring='accuracy',
                cv=3,
            )
            report_lines = (
                'Векторизация - {}'.format(vectorizer),
                'Модель для классификации - {}'.format(classifier),
                'Accuracy = {}'.format(fold_scores.mean()),
                '===========================',
            )
            for line in report_lines:
                print(line)

In [None]:
# Both vectorizers reuse the vocabulary fitted on the whole corpus.
vectorizers_list = [
    CountVectorizer(vocabulary=corpusVocab),
    TfidfVectorizer(vocabulary=corpusVocab),
]
classifiers_list = [
    RandomForestClassifier(),
    ComplementNB(),
]
# Cross-validate all four vectorizer/classifier combinations.
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '000pes': 2, '008704050406': 3,
                            '0089': 4, '0121': 5, '01223585236': 6,
                            '01223585334': 7, '0125698789': 8, '02': 9,
                            '0207': 10, '02072069400': 11, '02073162414': 12,
                            '02085076972': 13, '021': 14, '03': 15, '04': 16,
                            '0430': 17, '05': 18, '050703': 19, '0578': 20,
                            '06': 21, '07': 22, '07008009200': 23,
                            '07046744435': 24, '07090201529': 25,
                            '07090298926': 26, '07099833605': 27,
                            '07123456789': 28, '0721072': 29, ...})
Модель для классификации - RandomForestClassifier()
Accuracy = 0.9725405031708299
Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '000pes': 2, '008704050406': 3,
                            '0089': 4, '0121': 5, '01223585236': 6,
                       

## Разделим выборку на обучающую и тестовую и проверим решение для лучшей модели

In [None]:
# Hold out half of the messages for testing; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Category'],
    test_size=0.5,
    random_state=1,
)

In [None]:
def spam(v, c):
    """
    Train a vectorizer + classifier pipeline and print per-class accuracy.

    The pipeline is fitted on the module-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``.

    v - vectorizer used as the first pipeline step
    c - classifier used as the final pipeline step
    """
    steps = [("vectorizer", v), ("classifier", c)]
    fitted = Pipeline(steps).fit(X_train, y_train)
    print_accuracy_score_for_classes(y_test, fitted.predict(X_test))

In [None]:
# TF-IDF with the vectorizer's default settings + random forest.
spam(TfidfVectorizer(), RandomForestClassifier())

Метка 	 Accuracy
0 	 0.9983354140657511
1 	 0.7911227154046997


In [None]:
# TF-IDF over n-grams of length 1 to 3 + random forest.
spam(TfidfVectorizer(ngram_range=(1,3)), RandomForestClassifier())

Метка 	 Accuracy
0 	 1.0
1 	 0.6657963446475196


In [None]:
# TF-IDF over n-grams of length 2 to 3 (no single words) + random forest.
spam(TfidfVectorizer(ngram_range=(2,3)), RandomForestClassifier())

Метка 	 Accuracy
0 	 1.0
1 	 0.6161879895561357


In [None]:
# TF-IDF over n-grams of length 1 to 4 + random forest.
spam(TfidfVectorizer(ngram_range=(1,4)), RandomForestClassifier())

Метка 	 Accuracy
0 	 1.0
1 	 0.639686684073107


In [None]:
# TF-IDF over n-grams of length 2 to 4 (no single words) + random forest.
spam(TfidfVectorizer(ngram_range=(2,4)), RandomForestClassifier())

Метка 	 Accuracy
0 	 1.0
1 	 0.5509138381201044


Таким образом, вариант векторизации признаков TF-IDF в паре с классификатором RandomForestClassifier показал лучшее качество. Доля верных ответов (accuracy) составила 0.975412422357128.