In [5]:
import pandas as pd
import matplotlib
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import scipy.stats as st
import seaborn as sns
matplotlib.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.stats import ttest_ind
from pylab import rcParams
rcParams['figure.figsize'] = 11,7

**Задание 1**.
Реализация базовой модели логистической регрессии для классификации текстовых сообщений по признаку спама.

In [6]:
### Load the labelled SMS dataset (columns: Category, Message) from the course repository.
SPAM_CSV_URL = 'https://raw.githubusercontent.com/obulygin/pyda_homeworks/master/stat_case_study/spam.csv'
df = pd.read_csv(SPAM_CSV_URL)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
### Lower-case every message, replace each run of non-alphanumeric characters
### (punctuation, whitespace, underscores) with a single space, then split into tokens.
### The pattern is a raw string: in a plain literal '\W' is an invalid escape sequence
### (DeprecationWarning today, SyntaxWarning from Python 3.12).
df['Message'] = (
    df['Message']
    .str.lower()
    .str.replace(r'[\W_]+', ' ', regex=True)
    .str.split()
)
### Number of tokens in each message.
df['len'] = df['Message'].str.len()
df

Unnamed: 0,Category,Message,len
0,ham,"[go, until, jurong, point, crazy, available, o...",20
1,ham,"[ok, lar, joking, wif, u, oni]",6
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",33
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",11
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",14
...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",32
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]",8
5569,ham,"[pity, was, in, mood, for, that, so, any, othe...",10
5570,ham,"[the, guy, did, some, bitching, but, i, acted,...",27


In [8]:
### Fetch the NLTK resources used below. quiet=True keeps the downloader log
### (which prints the local nltk_data path) out of the saved notebook output.
for resource in ('omw-1.4', 'wordnet', 'stopwords'):
    nltk.download(resource, quiet=True)
stopwords_set = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

### Drop English stop words, then lemmatize the remaining tokens (one pass per message).
df['Message'] = df['Message'].apply(
    lambda tokens: [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopwords_set
    ]
)
### Refresh the token count so it excludes the removed stop words
### (lemmatization maps token to token, so it does not change the count).
df['len'] = df['Message'].str.len()

### Join the tokens back into one string per message, because the vectorizer does not accept lists.
df['Message'] = df['Message'].str.join(' ')
df

[nltk_data] Downloading package omw-1.4 to C:\Users\Владислав
[nltk_data]     Дзюба\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Владислав
[nltk_data]     Дзюба\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Владислав
[nltk_data]     Дзюба\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Category,Message,len
0,ham,go jurong point crazy available bugis n great ...,16
1,ham,ok lar joking wif u oni,6
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,25
3,ham,u dun say early hor u c already say,9
4,ham,nah think go usf life around though,7
...,...,...,...
5567,spam,2nd time tried 2 contact u u 750 pound prize 2...,22
5568,ham,ü b going esplanade fr home,6
5569,ham,pity mood suggestion,3
5570,ham,guy bitching acted like interested buying some...,13


In [9]:
### Vectorize the cleaned messages with TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer

### NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the train/test
### split that follows, so the IDF weights are also computed from the test messages.
tfidf = TfidfVectorizer()
tfidf_sparse = tfidf.fit_transform(df['Message'])
### get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
### get_feature_names_out() is the replacement. list() keeps `names` a plain list as before.
names = list(tfidf.get_feature_names_out())
### Dense frame with one row per message and one column per vocabulary term.
tfidf_matrix = pd.DataFrame(tfidf_sparse.toarray(), columns=names)
tfidf_matrix

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
### Build and train the logistic-regression model.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Hold out 30% of the messages for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    df.Category,
    test_size=0.30,
    random_state=42,
)
### fit() returns the estimator itself, so construction and training can be chained.
logisticRegr = LogisticRegression().fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)

### Check the model's accuracy on the held-out messages.
score = logisticRegr.score(X_test, y_test)
print(score)

0.958732057416268


In [11]:
### Build the confusion matrix (rows: actual class, columns: predicted class).
from sklearn import metrics

### Pin the class order explicitly so the hard-coded row/column captions below
### cannot drift from the order in which confusion_matrix arranges the classes.
class_labels = ['ham', 'spam']
cm = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=class_labels),
    columns=['Предсказан не спам', 'Предсказан спам'],
    index=['Не спам', 'Спам'],
)
cm

Unnamed: 0,Предсказан не спам,Предсказан спам
Не спам,1445,3
Спам,66,158


In [12]:
### Put the true test labels next to the model's predictions and keep only the rows
### where they disagree, i.e. the misclassified messages.
### 'index' holds each message's row label in the original frame.
### 'Unnamed 0' is the label pandas assigns to an unnamed row when a frame is built from a
### list of Series/arrays; it is spelled out here so lookups by that name keep working
### without relying on the auto-naming.
### `predictions` (computed right after the model was fitted) is reused instead of
### running predict() on X_test a second time.
result = pd.DataFrame({
    'index': y_test.index,
    'Category': y_test.to_numpy(),
    'Unnamed 0': predictions,
})
### With the two classes ham/spam, "ham not predicted as ham, or spam not predicted as spam"
### is simply "true label != predicted label".
relations_result = result[result['Category'] != result['Unnamed 0']]

relations_result

Unnamed: 0,index,Category,Unnamed 0
17,2952,ham,spam
40,881,spam,ham
47,1961,spam,ham
74,3864,spam,ham
84,2575,spam,ham
...,...,...,...
1525,4543,spam,ham
1567,752,spam,ham
1569,309,spam,ham
1576,495,ham,spam
