<a href="https://colab.research.google.com/github/Vakhranev/Compling/blob/master/%D0%94%D0%BE%D0%BC%D0%B0%D1%88%D0%BA%D0%B0_%E2%84%963.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
categories = ['comp.os.ms-windows.misc', 'misc.forsale', 'rec.autos', 'sci.space']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)
vectorizer = CountVectorizer()
vect = vectorizer.fit_transform(train.data)
print(vect.shape)

(2363, 63636)


In [78]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from statistics import mean

X = vect
y = train.target
print(X.shape, y.shape)
X, X_test, y, y_test = train_test_split(X, y, random_state=42)
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

(2363, 63636) (2363,)


In [0]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.linear_model import SGDClassifier
lr = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
sgd = SGDClassifier()
parameter_grid_lr = {'class_weight' : ['balanced', None],
                  'C' : [0.01, 0.1, 1.0, 10.0],
                  'max_iter': [300,1000]
                 }
parameters_RandomForest = {'n_estimators': [5, 10, 15],
                           'max_depth': [None, 5],
                           'min_samples_split': [0.5, 1.0, 5]                          
                }
parameters_SGD = {'penalty': ['l2', 'l1'], 
                  'alpha': [0.0001, 0.001, 0.01, 0.1], 
                  'max_iter': [10000, 15000, 20000]
                }

In [0]:
from sklearn.model_selection import GridSearchCV
def grid_search(model, parameter_grid):
    grid_search = GridSearchCV(model, param_grid=parameter_grid, cv=stratified_folds, scoring='f1_macro')
    grid_search.fit(X, y)
    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [6]:
grid_search(lr, parameter_grid_lr)

Best score: 0.9342816081032079
Best parameters: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 300}


In [7]:
grid_search(rf, parameters_RandomForest)

Best score: 0.8483865285404203
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 15}


In [8]:
grid_search(sgd, parameters_SGD)

Best score: 0.9436079335790936
Best parameters: {'alpha': 0.1, 'max_iter': 10000, 'penalty': 'l2'}


In [0]:
lr = linear_model.LogisticRegression(C=1.0, class_weight='balanced', max_iter=300)
rf = ensemble.RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=15)
sgd = SGDClassifier(alpha=0.1, max_iter=15000, penalty='l2')

In [0]:
def cv_train_model(model):
    valid_scores = []
    for fold_n, (train_index, valid_index) in enumerate(stratified_folds.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        model.fit(X_train, y_train)      
        y_pred_test = model.predict(X_test)
        y_pred_valid = model.predict(X_valid)
        valid_score = sklearn.metrics.f1_score(y_valid, y_pred_valid, average='macro')
        valid_scores.append(valid_score)
    test_score = sklearn.metrics.f1_score(y_test, y_pred_test, average='macro')
    print('Cross Validation mean score:', mean(valid_scores))
    print("Score on test data: {0:.4f}".format(test_score))

In [11]:
cv_train_model(lr)

Cross Validation mean score: 0.934281608103208
Score on test data: 0.9469


In [12]:
cv_train_model(rf)

Cross Validation mean score: 0.8512245741630762
Score on test data: 0.8542


In [13]:
cv_train_model(sgd)

Cross Validation mean score: 0.9403543517218882
Score on test data: 0.9570


In [14]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 18.2MB/s eta 0:00:01[K     |██████▏                         | 20kB 2.9MB/s eta 0:00:01[K     |█████████▎                      | 30kB 4.0MB/s eta 0:00:01[K     |████████████▍                   | 40kB 4.2MB/s eta 0:00:01[K     |███████████████▌                | 51kB 3.3MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 3.7MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 4.0MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 4.3MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 4.6MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 4.5MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 4.5MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
lr_fit = lr.fit(X,y)
rf_fit = rf.fit(X,y)
sgd_fit = sgd.fit(X,y)

In [46]:
df = eli5.formatters.as_dataframe.explain_weights_df(lr_fit)
df.head()

Unnamed: 0,target,feature,weight
0,0,x60354,1.28965
1,0,x26825,0.535892
2,0,x23870,0.461181
3,0,x57860,0.391514
4,0,x23869,0.388154


In [47]:
df = eli5.formatters.as_dataframe.explain_weights_df(rf_fit)
df.head()

Unnamed: 0,feature,weight,std
0,x60354,0.023315,0.047574
1,x51060,0.018894,0.025056
2,x19137,0.015663,0.033483
3,x55353,0.011818,0.018296
4,x19209,0.008473,0.016789


In [0]:
import eli5  
import pandas as pd
def analyze_features(model, n):
  index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
  df = eli5.formatters.as_dataframe.explain_weights_df(model)
  category1 = df[df['target']==0]
  category2 = df[df['target']==1]
  category3 = df[df['target']==2]
  category4 = df[df['target']==3]
  print(' comp.os.ms-windows.misc:')
  for word in category1.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])
  print('\n','misc.forsale:')
  for word in category2.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])
  print('\n','rec.autos:')
  for word in category3.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    if priznak == '<BIAS>':
      print(priznak)
    else:
      print(index_to_word[int(priznak)])
  print('\n','sci.space:')
  for word in category4.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])

In [26]:
analyze_features(lr_fit, 10)

 comp.os.ms-windows.misc:
windows
file
drivers
using
driver
win
files
help
microsoft
with

 misc.forsale:
sale
wanted
for
shipping
offer
forsale
interested
distribution
sell
state

 rec.autos:
car
cars
<BIAS>
honda
bmw
testing
integra
automotive
virginia
just

 sci.space:
space
orbit
nasa
planets
digex
ryukoku
sci
pat
jennise
dgi


In [28]:
analyze_features(sgd_fit, 10)

 comp.os.ms-windows.misc:
ax
windows
file
max
drivers
using
driver
files
ftp
win

 misc.forsale:
sale
wanted
for
offer
shipping
forsale
sell
call
new
interested

 rec.autos:
car
cars
bmw
honda
dealer
ford
engine
automotive
toyota
re

 sci.space:
space
orbit
nasa
pat
digex
alaska
spacecraft
launch
sci
earth


In [43]:
import statistics
priznaki = rf_fit.feature_importances_
real = [i for i in priznaki if i]
real_mean = statistics.mean(real)
print(len([i for i in real if i > 9*real_mean]))

40


In [45]:
rf_priznaki = []
index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
for i in range(len(priznaki)):
    if priznaki[i] > 9*real_mean:
        rf_priznaki.append(index_to_word[i])
rf_priznaki

['anyone',
 'article',
 'asking',
 'be',
 'by',
 'car',
 'card',
 'cars',
 'condition',
 'dos',
 'engine',
 'file',
 'forsale',
 'gov',
 'how',
 'in',
 'is',
 'moon',
 'ms',
 'my',
 'of',
 'offer',
 'on',
 'out',
 'pat',
 'please',
 're',
 'road',
 'sale',
 'sci',
 'shipping',
 'shuttle',
 'space',
 'spencer',
 'thanks',
 'that',
 'the',
 'use',
 'windows',
 'writes']

In [79]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = stopwords.words('english')
best_vectorizer = sklearn.feature_extraction.text.CountVectorizer(stop_words=stops, min_df=0.005, max_df=0.4, max_features=5000)
best_vect = best_vectorizer.fit_transform(train.data)
X = best_vect
y = train.target
print(X.shape, y.shape)
X, X_test, y, y_test = train_test_split(X, y, random_state=42)
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
(2363, 3692) (2363,)


In [0]:
lr = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
sgd = SGDClassifier()
parameter_grid_lr = {'class_weight' : ['balanced', None],
                  'C' : [0.01, 0.1, 1.0, 10.0],
                  'max_iter': [300,1000]
                 }
parameters_RandomForest = {'n_estimators': [5, 10, 15],
                           'max_depth': [None, 5],
                           'min_samples_split': [0.5, 1.0, 5]                          
                }
parameters_SGD = {'penalty': ['l2', 'l1'], 
                  'alpha': [0.0001, 0.001, 0.01, 0.1], 
                  'max_iter': [10000, 15000, 20000]
                }

In [81]:
grid_search(lr, parameter_grid_lr)
lr = linear_model.LogisticRegression(C=1.0, class_weight='balanced', max_iter=300)
cv_train_model(lr)

Best score: 0.9330196158349308
Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 300}
Cross Validation mean score: 0.9324015413998885
Score on test data: 0.9555


In [82]:
grid_search(rf, parameters_RandomForest)
rf = ensemble.RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=15)
cv_train_model(rf)

Best score: 0.9032599218411395
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 15}
Cross Validation mean score: 0.898468554738821
Score on test data: 0.9218


In [83]:
grid_search(sgd, parameters_SGD)
sgd = SGDClassifier(alpha=0.1, max_iter=15000, penalty='l2')
cv_train_model(sgd)

Best score: 0.9397463161567285
Best parameters: {'alpha': 0.1, 'max_iter': 15000, 'penalty': 'l2'}
Cross Validation mean score: 0.9419530456228178
Score on test data: 0.9472
