<a href="https://colab.research.google.com/github/Vakhranev/Compling/blob/master/%D0%94%D0%BE%D0%BC%D0%B0%D1%88%D0%BA%D0%B0_%E2%84%963.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
categories = ['comp.os.ms-windows.misc', 'misc.forsale', 'rec.autos', 'sci.space']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)
vectorizer = CountVectorizer()
vect = vectorizer.fit_transform(train.data)
t_vect = vectorizer.fit_transform(test.data)

In [0]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from statistics import mean

X = vect
y = train.target
X, X_test, y, y_test = train_test_split(X, y, random_state=42)
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [0]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.linear_model import SGDClassifier
lr = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
sgd = SGDClassifier()
parameter_grid_lr = {'class_weight' : ['balanced', None],
                  'C' : [0.01, 0.1, 1.0, 10.0],
                  'max_iter': [300,1000]
                 }
parameters_RandomForest = {'n_estimators': [5, 10, 15],
                           'max_depth': [None, 5],
                           'min_samples_split': [0.5, 1.0, 5]                          
                }
parameters_SGD = {'penalty': ['l2', 'l1'], 
                  'alpha': [0.0001, 0.001, 0.01, 0.1], 
                  'max_iter': [10000, 15000, 20000]
                }

In [0]:
from sklearn.model_selection import GridSearchCV
def grid_search(model, parameter_grid):
    grid_search = GridSearchCV(model, param_grid=parameter_grid, cv=stratified_folds, scoring='f1_macro')
    grid_search.fit(X, y)
    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [22]:
grid_search(lr, parameter_grid_lr)

Best score: 0.9342816081032079
Best parameters: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 300}


In [23]:
grid_search(rf, parameters_RandomForest)

Best score: 0.8502862697811426
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 15}


In [24]:
grid_search(sgd, parameters_SGD)

Best score: 0.9442028847338488
Best parameters: {'alpha': 0.1, 'max_iter': 15000, 'penalty': 'l2'}


In [0]:
lr = linear_model.LogisticRegression(C=1.0, class_weight='balanced', max_iter=300)
rf = ensemble.RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=15)
sgd = SGDClassifier(alpha=0.1, max_iter=15000, penalty='l2')

In [0]:
def cv_train_model(model):
    valid_scores = []
    for fold_n, (train_index, valid_index) in enumerate(stratified_folds.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        model.fit(X_train, y_train)      
        y_pred_test = model.predict(X_test)
        y_pred_valid = model.predict(X_valid)
        valid_score = sklearn.metrics.f1_score(y_valid, y_pred_valid, average='macro')
        valid_scores.append(valid_score)
    test_score = sklearn.metrics.f1_score(y_test, y_pred_test, average='macro')
    print('Cross Validation mean score:', mean(valid_scores))
    print("Score on test data: {0:.4f}".format(test_score))

In [27]:
cv_train_model(lr)

Cross Validation mean score: 0.934281608103208
Score on test data: 0.9469


In [28]:
cv_train_model(rf)

Cross Validation mean score: 0.8492182100374583
Score on test data: 0.8560


In [29]:
cv_train_model(sgd)

Cross Validation mean score: 0.9397395907518953
Score on test data: 0.9589


In [30]:
!pip install eli5



In [0]:
lr_fit = lr.fit(X,y)
rf_fit = rf.fit(X,y)
sgd_fit = sgd.fit(X,y)

In [0]:
import eli5  
import pandas as pd
def analyze_features(model, n):
  count_vect = CountVectorizer()
  index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
  df = eli5.formatters.as_dataframe.explain_weights_df(model)
  category1 = df[df['target']==0]
  category2 = df[df['target']==1]
  category3 = df[df['target']==2]
  category4 = df[df['target']==3]
  print('comp.os.ms-windows.misc:')
  for word in category1.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])
  print('misc.forsale:')
  for word in category2.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])
  print('rec.autos')
  for word in category3.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])
  print('sci.space:')
  for word in category4.feature[:n]:
    #print(word)
    priznak = word.strip('x')
    print(index_to_word[int(priznak)])

In [72]:
analyze_features(lr_fit, 10)

comp.os.ms-windows.misc:


KeyError: ignored