In [None]:
! pip install eli5
import eli5

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re

In [None]:
import certifi
import ssl
import urllib.request
from sklearn.datasets import fetch_20newsgroups

# Install a global urllib opener whose HTTPS handler trusts certifi's CA bundle.
# Presumably this is needed so the dataset download below passes certificate
# verification on this machine - confirm if the default store is sufficient.
ca_bundle_path = certifi.where()
ssl_context = ssl.create_default_context(cafile=ca_bundle_path)
https_handler = urllib.request.HTTPSHandler(context=ssl_context)
urllib.request.install_opener(urllib.request.build_opener(https_handler))

In [None]:
# Raw posts with headers, footers and quoted replies left in place.
train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

In [None]:
# Inspect one raw training post (index 95) to see what the unprocessed text contains.
train.data[95]

"From: coburnn@spot.Colorado.EDU (Nicholas S. Coburn)\nSubject: Re: bikes with big dogs\nNntp-Posting-Host: spot.colorado.edu\nOrganization: University of Colorado, Boulder\nLines: 19\n\nIn article <1993Apr14.234835.1@cua.edu> 84wendel@cua.edu writes:\n>Has anyone ever heard of a rider giving a big dog such as a great dane a ride \n>on the back of his bike.  My dog would love it if I could ever make it work.\n>\tThanks\n>\t\t\t84wendel@cua.edu\n>\n\nOn the back might be tricky, but here in Boulder, there is a guy \nthat can always be seen with his Golden Retriever in the sidecar.\nOf course, the dog is always wearing WWII style goggles (no joke)\n\n\n________________________________________________________________________\nNick Coburn                     DoD#6425                      AMA#679817\n                  '88CBR1000              '89CBR600\n                       coburnn@spot.colorado.edu\n________________________________________________________________________\n\n\n"

In [None]:
# Number of documents in the training split.
len(train.data)

11314

# pre-processing

In [None]:
import numpy as np
from sklearn.utils import Bunch

In [None]:
# Variant 1: drop headers, footers and quoted replies.
REMOVE_HEADERS_FOOTERS_QUOTES = ('headers', 'footers', 'quotes')
train_1 = fetch_20newsgroups(subset="train", remove=REMOVE_HEADERS_FOOTERS_QUOTES)
test_1 = fetch_20newsgroups(subset="test", remove=REMOVE_HEADERS_FOOTERS_QUOTES)

In [None]:
# Variant 2: keep the headers, drop footers and quoted replies.
REMOVE_FOOTERS_QUOTES = ('footers', 'quotes')
train_2 = fetch_20newsgroups(subset="train", remove=REMOVE_FOOTERS_QUOTES)
test_2 = fetch_20newsgroups(subset="test", remove=REMOVE_FOOTERS_QUOTES)

In [None]:
# Variant 3: drop only the footers.
# `remove` expects a tuple. ('footers') without a trailing comma is just the
# string 'footers', which only worked by accident through substring matching
# and is rejected by scikit-learn versions that validate the parameter type.
train_3 = fetch_20newsgroups(remove=('footers',))
test_3 = fetch_20newsgroups(subset="test", remove=('footers',))

In [None]:
def clean_text(news_data):
    """Normalize the text of every document in a newsgroups bunch.

    Each document goes through these steps, in order:
      1. every non-word character and every underscore becomes a space;
      2. the text is lower-cased;
      3. any remaining non-ASCII character is removed;
      4. empty pieces produced by repeated spaces are dropped, so words end
         up separated by exactly one space;
      5. stand-alone runs of digits are replaced by the token ``[number]``
         to reduce the vocabulary size.

    Args:
        news_data: object exposing ``data`` (an iterable of text strings)
            together with ``target``, ``target_names``, ``DESCR`` and
            ``filenames`` attributes.
    Returns:
        A new ``Bunch`` whose ``data`` is a NumPy array of the cleaned
        strings; the other fields are passed through from ``news_data``.
    """
    # Raw-string patterns: the previous '[^\w]|_' literal relied on an invalid
    # escape sequence, which newer Python versions warn about.
    non_word_or_underscore = re.compile(r'[^\w]|_')
    non_ascii = re.compile(r'[^\x00-\x7f]')
    standalone_number = re.compile(r'\b\d+\b')

    cleaned_text = []
    for text in news_data.data:
        x = non_word_or_underscore.sub(' ', text)  # only keep numbers, letters and spaces
        x = x.lower()
        x = non_ascii.sub('', x)  # remove non-ASCII characters
        x = ' '.join(y for y in x.split(' ') if y)  # remove empty words
        x = standalone_number.sub('[number]', x)
        cleaned_text.append(x)

    return Bunch(
        data=np.array(cleaned_text),
        target=news_data.target,
        target_names=news_data.target_names,
        DESCR=news_data.DESCR,
        filenames=news_data.filenames,
    )

In [None]:
# Cleaned copies of the raw (unstripped) splits.
cleaned_train, cleaned_test = (clean_text(split) for split in (train, test))

In [None]:
# Cleaned copies of variant 1 (headers, footers and quotes removed).
cleaned_train_1, cleaned_test_1 = (clean_text(split) for split in (train_1, test_1))

In [None]:
# Cleaned copies of variant 2 (footers and quotes removed).
cleaned_train_2, cleaned_test_2 = (clean_text(split) for split in (train_2, test_2))

In [None]:
# Cleaned copies of variant 3 (footers removed).
cleaned_train_3, cleaned_test_3 = (clean_text(split) for split in (train_3, test_3))

In [None]:
# Post 95 after removing headers, footers and quotes.
train_1.data[95]

'\nOn the back might be tricky, but here in Boulder, there is a guy \nthat can always be seen with his Golden Retriever in the sidecar.\nOf course, the dog is always wearing WWII style goggles (no joke)\n'

In [None]:
# Post 95 after removing footers and quotes (headers are still present).
train_2.data[95]

'From: coburnn@spot.Colorado.EDU (Nicholas S. Coburn)\nSubject: Re: bikes with big dogs\nNntp-Posting-Host: spot.colorado.edu\nOrganization: University of Colorado, Boulder\nLines: 19\n\n\nOn the back might be tricky, but here in Boulder, there is a guy \nthat can always be seen with his Golden Retriever in the sidecar.\nOf course, the dog is always wearing WWII style goggles (no joke)\n'

In [None]:
# Post 95 from the unstripped split after clean_text.
cleaned_train.data[95]

In [None]:
# Post 95 from variant 1 (no headers, footers or quotes) after clean_text.
cleaned_train_1.data[95]

'on the back might be tricky but here in boulder there is a guy that can always be seen with his golden retriever in the sidecar of course the dog is always wearing wwii style goggles no joke'

In [None]:
# Post 95 from variant 2 (no footers or quotes) after clean_text.
cleaned_train_2.data[95]

'from coburnn spot colorado edu nicholas s coburn subject re bikes with big dogs nntp posting host spot colorado edu organization university of colorado boulder lines [number] on the back might be tricky but here in boulder there is a guy that can always be seen with his golden retriever in the sidecar of course the dog is always wearing wwii style goggles no joke'

Перевіряю на бейзлайні вплив різних типів препроцесингу даних.

In [None]:
# Baseline model: bag-of-words counts fed into a logistic regression with C=1.
baseline_steps = [
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression(C=1)),
]
pipeline = Pipeline(baseline_steps)

In [None]:
def fit_and_predict(model, train_set, test_set):
    """Fit ``model`` on ``train_set`` and return its predictions for ``test_set``.

    Args:
        model: estimator with ``fit(X, y)`` and ``predict(X)``; it is refitted
            in place, so any previously learned state is replaced.
        train_set: mapping with "data" and "target" entries used for fitting.
        test_set: mapping with a "data" entry to predict on.
    Returns:
        Whatever ``model.predict`` returns for ``test_set["data"]``.
    """
    model.fit(train_set["data"], train_set["target"])
    return model.predict(test_set["data"])


# Every preprocessing variant is trained and evaluated on its own matching
# train/test pair. Fitting all variants first and predicting afterwards would
# score every test set with the last-fitted model only, because each call to
# `fit` on the same pipeline object replaces the previous model.
predictions = fit_and_predict(pipeline, train, test)
predictions_1 = fit_and_predict(pipeline, train_1, test_1)
predictions_2 = fit_and_predict(pipeline, train_2, test_2)
predictions_3 = fit_and_predict(pipeline, cleaned_train, cleaned_test)
predictions_4 = fit_and_predict(pipeline, cleaned_train_1, cleaned_test_1)
predictions_5 = fit_and_predict(pipeline, cleaned_train_2, cleaned_test_2)
predictions_6 = fit_and_predict(pipeline, train_3, test_3)


In [None]:
# Accuracy per variant, listed in the row order used for the results table.
evaluated_pairs = [
    (test, predictions),
    (test_1, predictions_1),
    (test_2, predictions_2),
    (test_3, predictions_6),
    (cleaned_test, predictions_3),
    (cleaned_test_1, predictions_4),
    (cleaned_test_2, predictions_5),
]
scores = [accuracy_score(split["target"], predicted) for split, predicted in evaluated_pairs]

In [None]:
# One row per preprocessing variant; the labels follow the order of `scores`.
test_set_labels = [
    "Test",
    "Test 1 (without h,f,q)",
    "Test 2 (without f,q)",
    "Test 3 (without f)",
    "Cleaned Test",
    "Cleaned Test 1 (without h,f,q)",
    "Cleaned Test 2 (without f,q)",  # this label previously lacked its closing parenthesis
]
df = pd.DataFrame({"Test Set": test_set_labels, "Accuracy Score": scores})

In [None]:
# Display the accuracy comparison table.
df

Unnamed: 0,Test Set,Accuracy Score
0,Test,0.729687
1,"Test 1 (without h,f,q)",0.545273
2,"Test 2 (without f,q)",0.674588
3,Test 3 (without f),0.719331
4,Cleaned Test,0.735661
5,"Cleaned Test 1 (without h,f,q)",0.546999
6,"Cleaned Test 2 (without f,q",0.679501


Висновки:
- наявність елементів ‘headers’, ‘footers’, ‘quotes’ сприяє кращому результату класифікації (за результатами експерименту є сенс перевірити дані без футерів);
- застосування "технічних" поправок до даних (функція clean_text) дало мінімальні покращення — на рівні сотих.

# experiments with different models and hyperparameters

In [None]:
# Shuffled 3-fold stratified splitter with a fixed seed, passed as `cv` to the grid searches.
skf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=1,
)

In [None]:
# Bag-of-words + logistic regression; vectorizer settings and the
# regularisation strength C are searched jointly.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

params = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__max_features=[5000, 10000],
    vect__min_df=[1, 2, 5],
    vect__max_df=[1.0, 0.8, 0.5],
    clf__C=[10, 1, 0.1, 0.01],
)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="accuracy",
    cv=skf,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the raw training split.
grid_search.fit(train.data, train.target)

In [None]:
# Best cross-validated accuracy and the pipeline configuration that achieved it.
grid_search.best_score_, grid_search.best_estimator_

(0.8542509068340657,
 Pipeline(steps=[('vect',
                  CountVectorizer(max_df=0.5, max_features=10000, min_df=5)),
                 ('clf', LogisticRegression(C=0.1))]))

In [None]:
# Rebuild the pipeline with the best grid-search settings and train it on the
# raw training split.
# max_iter is raised above the default because the solver otherwise stops with
# a ConvergenceWarning ("TOTAL NO. of ITERATIONS REACHED LIMIT"); 1000 is the
# value the other logistic-regression pipelines in this notebook use.
pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, max_features=10000, min_df=5)),
    ('clf', LogisticRegression(C=0.1, max_iter=1000)),
])
pipeline.fit(train["data"], train["target"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Accuracy of the tuned bag-of-words model on the raw test split.
predictions = pipeline.predict(test.data)
accuracy_score(y_true=test.target, y_pred=predictions)

0.780801911842804

In [None]:
# Same model (trained on raw text) scored on the cleaned test split.
predictions = pipeline.predict(cleaned_test.data)
accuracy_score(y_true=cleaned_test.target, y_pred=predictions)

0.7092405735528412

In [None]:
# Same model scored on the footers-removed test split.
predictions = pipeline.predict(test_3.data)
accuracy_score(y_true=test_3.target, y_pred=predictions)

0.7618162506638343

In [None]:
# Per-class metrics on the raw test split. The predictions are recomputed here
# because the shared `predictions` variable is reassigned by the evaluations on
# other test variants and would no longer correspond to `test`.
report = classification_report(test["target"], pipeline.predict(test["data"]), output_dict=True)

In [None]:
# Summary of the CountVectorizer + LogisticRegression experiment.
results = {
        # Class names rather than the pipeline step names 'vect'/'clf', so the
        # entry reads the same way as the summaries of the other experiments.
        'Vectorizer': 'CountVectorizer',
        'Hyperparameters': grid_search.best_params_,
        'Algorithm': 'LogisticRegression',
        'Cross-validation score': grid_search.best_score_,
        # Store the accuracy value itself; the bare name `accuracy_score` is
        # the metric function, not a score.
        'Test score': accuracy_score(test["target"], pipeline.predict(test["data"])),
        'Classification report': report

    }

In [None]:
# TF-IDF features + logistic regression, tuned with the shared CV splitter.
pipeline_1 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

params_1 = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_features': [1000, 5000, 10000],
    'vect__min_df': [1, 2, 5],
    # 1.0 (float) means "no upper document-frequency cut-off". The integer 1
    # is an absolute count - "keep only terms found in at most one document" -
    # which is not a meaningful candidate and conflicts with min_df > 1.
    'vect__max_df': [1.0, 0.8, 0.5],
    'clf__C': [10, 1, 0.1, 0.01]
}


grid_search_1 = GridSearchCV(pipeline_1, params_1, scoring="accuracy", cv=skf, n_jobs=-1)

In [None]:
# Run the TF-IDF grid search, then show the best CV accuracy and best pipeline.
grid_search_1.fit(train.data, train.target)
(grid_search_1.best_score_, grid_search_1.best_estimator_)

In [None]:
# Best cross-validated accuracy and the corresponding TF-IDF pipeline.
grid_search_1.best_score_, grid_search_1.best_estimator_

(0.8908427405328324,
 Pipeline(steps=[('vect', TfidfVectorizer(max_df=0.5, max_features=10000)),
                 ('clf', LogisticRegression(C=10))]))

In [None]:
# Rebuild the TF-IDF pipeline with the selected settings and train it on the
# raw training split.
tuned_tfidf = TfidfVectorizer(max_df=0.5, max_features=10000)
tuned_logreg = LogisticRegression(C=10, max_iter=1000)
pipeline_1 = Pipeline([('vect', tuned_tfidf), ('clf', tuned_logreg)])
pipeline_1.fit(train["data"], train["target"])

In [None]:
# Accuracy of the tuned TF-IDF model on the raw test split; the value is kept
# for the results summary and displayed.
predictions_1 = pipeline_1.predict(test["data"])
accuracy_score_1 = accuracy_score(test["target"], predictions_1)
accuracy_score_1

0.8210302708443973

In [None]:
# Explain the TF-IDF model's prediction for test document 1500.
tfidf_step, logreg_step = pipeline_1[0], pipeline_1[1]
eli5.show_prediction(
    logreg_step,
    test["data"][1500],
    vec=tfidf_step,
    feature_names=tfidf_step.get_feature_names_out(),
    target_names=test.target_names,
)

Contribution?,Feature
-0.478,Highlighted in text (sum)
-0.884,<BIAS>

Contribution?,Feature
0.463,<BIAS>
-1.136,Highlighted in text (sum)

Contribution?,Feature
0.348,<BIAS>
-1.094,Highlighted in text (sum)

Contribution?,Feature
0.199,<BIAS>
-1.52,Highlighted in text (sum)

Contribution?,Feature
0.156,<BIAS>
-1.32,Highlighted in text (sum)

Contribution?,Feature
0.55,<BIAS>
-1.145,Highlighted in text (sum)

Contribution?,Feature
0.647,<BIAS>
-0.074,Highlighted in text (sum)

Contribution?,Feature
0.583,Highlighted in text (sum)
0.025,<BIAS>

Contribution?,Feature
0.049,Highlighted in text (sum)
-0.067,<BIAS>

Contribution?,Feature
0.385,<BIAS>
-0.45,Highlighted in text (sum)

Contribution?,Feature
0.228,<BIAS>
-0.419,Highlighted in text (sum)

Contribution?,Feature
-0.245,Highlighted in text (sum)
-0.733,<BIAS>

Contribution?,Feature
0.455,<BIAS>
-1.032,Highlighted in text (sum)

Contribution?,Feature
0.05,<BIAS>
-0.831,Highlighted in text (sum)

Contribution?,Feature
1.021,Highlighted in text (sum)
0.053,<BIAS>

Contribution?,Feature
0.003,<BIAS>
-0.611,Highlighted in text (sum)

Contribution?,Feature
-0.485,<BIAS>
-0.715,Highlighted in text (sum)

Contribution?,Feature
0.594,Highlighted in text (sum)
-0.283,<BIAS>

Contribution?,Feature
7.273,Highlighted in text (sum)
-0.537,<BIAS>

Contribution?,Feature
1.551,Highlighted in text (sum)
-0.571,<BIAS>


In [None]:
# Per-class metrics of the TF-IDF model on the raw test split, as a dict.
report_1 = classification_report(
    y_true=test["target"],
    y_pred=predictions_1,
    output_dict=True,
)

In [None]:
# Summary of the TfidfVectorizer + LogisticRegression experiment.
results_1 = {
    "Vectorizer": "TfidfVectorizer",
    "Hyperparameters": grid_search_1.best_params_,
    "Algorithm": "LogisticRegression",
    "Cross-validation score": grid_search_1.best_score_,
    "Test score": accuracy_score_1,
    "Classification report": report_1,
}

In [None]:
# TF-IDF features + gradient boosting, tuned with the shared CV splitter.
pipeline_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', GradientBoostingClassifier())
])

params_2 = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_features': [1000, 5000, 10000],
    'vect__min_df': [1, 2, 5],
    # 1.0 (float) means "no upper document-frequency cut-off". The integer 1
    # is an absolute count - "keep only terms found in at most one document" -
    # which is not a meaningful candidate and conflicts with min_df > 1.
    'vect__max_df': [1.0, 0.8, 0.5],
    'clf__learning_rate': [0.01, 0.1, 0.5],
    'clf__n_estimators': [50, 100]

}


grid_search_2 = GridSearchCV(pipeline_2, params_2, scoring="accuracy", cv=skf, n_jobs=-1)

In [None]:
# Run the gradient-boosting grid search on the raw training split.
grid_search_2.fit(train.data, train.target)

In [None]:
# Best cross-validated accuracy and the corresponding gradient-boosting pipeline.
grid_search_2.best_score_, grid_search_2.best_estimator_

(0.8064354402666852,
 Pipeline(steps=[('vect',
                  TfidfVectorizer(max_df=0.8, max_features=10000, min_df=2)),
                 ('clf', GradientBoostingClassifier())]))

In [None]:
# Rebuild the gradient-boosting pipeline with the selected vectorizer settings
# and train it on the raw training split.
tuned_gb_tfidf = TfidfVectorizer(max_df=0.8, max_features=10000, min_df=2)
pipeline_2 = Pipeline([
    ('vect', tuned_gb_tfidf),
    ('clf', GradientBoostingClassifier()),
])
pipeline_2.fit(train["data"], train["target"])

In [None]:
# Accuracy of the gradient-boosting model on the raw test split.
predictions_2 = pipeline_2.predict(test.data)
accuracy_score_2 = accuracy_score(y_true=test.target, y_pred=predictions_2)
print(accuracy_score_2)

0.747344662772172


In [None]:
# Per-class metrics of the gradient-boosting model on the raw test split, as a dict.
report_2 = classification_report(
    y_true=test["target"],
    y_pred=predictions_2,
    output_dict=True,
)

In [None]:
# Display the per-class precision / recall / F1 / support dictionary.
report_2

{'0': {'precision': 0.7786259541984732,
  'recall': 0.6394984326018809,
  'f1-score': 0.7022375215146299,
  'support': 319},
 '1': {'precision': 0.6997455470737913,
  'recall': 0.7069408740359897,
  'f1-score': 0.7033248081841432,
  'support': 389},
 '2': {'precision': 0.6965174129353234,
  'recall': 0.7106598984771574,
  'f1-score': 0.7035175879396985,
  'support': 394},
 '3': {'precision': 0.6162528216704289,
  'recall': 0.6964285714285714,
  'f1-score': 0.6538922155688622,
  'support': 392},
 '4': {'precision': 0.7377892030848329,
  'recall': 0.7454545454545455,
  'f1-score': 0.7416020671834624,
  'support': 385},
 '5': {'precision': 0.7955974842767296,
  'recall': 0.640506329113924,
  'f1-score': 0.7096774193548387,
  'support': 395},
 '6': {'precision': 0.7908653846153846,
  'recall': 0.8435897435897436,
  'f1-score': 0.8163771712158808,
  'support': 390},
 '7': {'precision': 0.84375,
  'recall': 0.75,
  'f1-score': 0.7941176470588235,
  'support': 396},
 '8': {'precision': 0.8837

In [None]:
# Summary of the TfidfVectorizer + GradientBoostingClassifier experiment.
results_2 = {
    "Vectorizer": "TfidfVectorizer",
    "Hyperparameters": grid_search_2.best_params_,
    "Algorithm": "GradientBoostingClassifier",
    "Cross-validation score": grid_search_2.best_score_,
    "Test score": accuracy_score_2,
    "Classification report": report_2,
}

Логістична регресія виявилась ефективнішим та швидшим (за часовими затратами) методом.
Логістична регресія менш схильна до перенавчання на невеликих наборах даних (в аналізованому — близько 19 000 дописів: 11314 у трейні та приблизно 7500 у тесті).
Також гірша результативність бустингу може бути повʼязана з розрідженістю даних та великою кількістю фіч.
Бустинг — не найкращий варіант для роботи з високорозмірними даними, такими як тексти, зображення й звук.


In [None]:
from sklearn.pipeline import FeatureUnion

In [None]:
# Word n-grams and character n-grams, each TF-IDF weighted and combined by a
# FeatureUnion, feeding a logistic regression.
word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=30000, max_df=0.2, min_df=1)
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 7), max_features=30000, max_df=0.2, min_df=1)
combined_features = FeatureUnion([
    ('word', word_tfidf),
    ('char', char_tfidf),
])

pipeline_3 = Pipeline([
    ('features', combined_features),
    ('clf', LogisticRegression(C=10, max_iter=1000)),
])


In [None]:
# Train the combined word+char model on the raw training split.
pipeline_3.fit(train.data, train.target)

In [None]:
# Accuracy of the combined word+char model on the raw test split.
predictions_3 = pipeline_3.predict(test.data)
accuracy_score(y_true=test.target, y_pred=predictions_3)

0.8266064790228359

In [None]:
# Explain the combined model's prediction for raw test document 1500,
# listing up to 100 features per class.
union_step, union_clf = pipeline_3[0], pipeline_3[1]
eli5.show_prediction(
    union_clf,
    test["data"][1500],
    top=100,
    vec=union_step,
    feature_names=union_step.get_feature_names_out(),
    target_names=test.target_names,
)

Contribution?,Feature
+0.067,char: Highlighted in text (sum)
… 455 more positive …,… 455 more positive …
… 345 more negative …,… 345 more negative …
-0.375,word: Highlighted in text (sum)
-0.858,<BIAS>

Contribution?,Feature
+0.476,<BIAS>
… 262 more positive …,… 262 more positive …
… 538 more negative …,… 538 more negative …
-0.197,word: Highlighted in text (sum)
-0.388,char: Highlighted in text (sum)

Contribution?,Feature
+0.345,<BIAS>
… 294 more positive …,… 294 more positive …
… 506 more negative …,… 506 more negative …
-0.327,char: Highlighted in text (sum)
-0.487,word: Highlighted in text (sum)

Contribution?,Feature
+0.277,<BIAS>
… 301 more positive …,… 301 more positive …
… 499 more negative …,… 499 more negative …
-0.533,word: Highlighted in text (sum)
-0.709,char: Highlighted in text (sum)

Contribution?,Feature
+0.324,<BIAS>
… 300 more positive …,… 300 more positive …
… 500 more negative …,… 500 more negative …
-0.367,char: Highlighted in text (sum)
-0.463,word: Highlighted in text (sum)

Contribution?,Feature
+0.478,<BIAS>
+0.104,char: Highlighted in text (sum)
… 308 more positive …,… 308 more positive …
… 492 more negative …,… 492 more negative …
-0.467,word: Highlighted in text (sum)

Contribution?,Feature
+0.858,char: Highlighted in text (sum)
+0.492,<BIAS>
+0.319,word: Highlighted in text (sum)
… 299 more positive …,… 299 more positive …
… 501 more negative …,… 501 more negative …

Contribution?,Feature
+0.340,word: Highlighted in text (sum)
+0.319,<BIAS>
+0.107,char: Highlighted in text (sum)
… 330 more positive …,… 330 more positive …
… 470 more negative …,… 470 more negative …

Contribution?,Feature
+0.136,<BIAS>
+0.049,word: Highlighted in text (sum)
… 298 more positive …,… 298 more positive …
… 502 more negative …,… 502 more negative …
-0.310,char: Highlighted in text (sum)

Contribution?,Feature
+0.421,<BIAS>
… 288 more positive …,… 288 more positive …
… 512 more negative …,… 512 more negative …
-0.170,word: Highlighted in text (sum)
-0.198,char: Highlighted in text (sum)

Contribution?,Feature
+0.144,<BIAS>
+0.105,char: Highlighted in text (sum)
… 308 more positive …,… 308 more positive …
… 492 more negative …,… 492 more negative …
-0.180,word: Highlighted in text (sum)

Contribution?,Feature
… 321 more positive …,… 321 more positive …
… 479 more negative …,… 479 more negative …
-0.029,word: Highlighted in text (sum)
-0.255,char: Highlighted in text (sum)
-0.737,<BIAS>

Contribution?,Feature
+0.355,<BIAS>
+0.019,char: Highlighted in text (sum)
… 279 more positive …,… 279 more positive …
… 521 more negative …,… 521 more negative …
-0.411,word: Highlighted in text (sum)

Contribution?,Feature
… 364 more positive …,… 364 more positive …
… 436 more negative …,… 436 more negative …
-0.050,<BIAS>
-0.352,char: Highlighted in text (sum)
-0.412,word: Highlighted in text (sum)

Contribution?,Feature
+0.553,word: Highlighted in text (sum)
+0.322,char: Highlighted in text (sum)
… 366 more positive …,… 366 more positive …
… 434 more negative …,… 434 more negative …
-0.229,<BIAS>

Contribution?,Feature
+0.787,char: Highlighted in text (sum)
+0.102,<BIAS>
… 373 more positive …,… 373 more positive …
… 427 more negative …,… 427 more negative …
-0.169,word: Highlighted in text (sum)

Contribution?,Feature
… 337 more positive …,… 337 more positive …
… 463 more negative …,… 463 more negative …
-0.333,<BIAS>
-0.564,word: Highlighted in text (sum)
-0.876,char: Highlighted in text (sum)

Contribution?,Feature
+0.546,char: Highlighted in text (sum)
+0.151,word: Highlighted in text (sum)
… 404 more positive …,… 404 more positive …
… 396 more negative …,… 396 more negative …
-0.394,<BIAS>

Contribution?,Feature
+4.380,word: Highlighted in text (sum)
+2.374,char: Highlighted in text (sum)
… 534 more positive …,… 534 more positive …
… 266 more negative …,… 266 more negative …
-0.681,<BIAS>

Contribution?,Feature
+2.275,char: Highlighted in text (sum)
+0.503,word: Highlighted in text (sum)
… 376 more positive …,… 376 more positive …
… 424 more negative …,… 424 more negative …
-0.589,<BIAS>


In [None]:
# Refit the same pipeline_3 object on the cleaned training split. This replaces
# the model that was fitted on the raw text, so everything below that uses
# pipeline_3 refers to the cleaned-data model.
pipeline_3.fit(cleaned_train['data'], cleaned_train['target'])

In [None]:
# Accuracy of the combined model on the cleaned test split.
predictions_4 = pipeline_3.predict(cleaned_test["data"])
accuracy_score(y_true=cleaned_test["target"], y_pred=predictions_4)

0.8311205523101434

In [None]:
# Show the highest-weighted features per class for the combined word+char model.
weights_vectorizer, weights_clf = pipeline_3[0], pipeline_3[1]
eli5.explain_weights(
    weights_clf,
    feature_names=weights_vectorizer.get_feature_names_out(),
    target_names=test.target_names,
)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+2.565,word__keith,,,,,,,,,,,,,,,,,,
+2.546,word__atheism,,,,,,,,,,,,,,,,,,
+2.216,char__theis,,,,,,,,,,,,,,,,,,
+2.209,char__heis,,,,,,,,,,,,,,,,,,
+2.151,word__enviroleague,,,,,,,,,,,,,,,,,,
+2.117,word__god,,,,,,,,,,,,,,,,,,
+2.076,word__rushdie,,,,,,,,,,,,,,,,,,
+2.072,word__islamic,,,,,,,,,,,,,,,,,,
+2.031,word__mathew,,,,,,,,,,,,,,,,,,
+2.001,word__wingate,,,,,,,,,,,,,,,,,,

Weight?,Feature
+2.565,word__keith
+2.546,word__atheism
+2.216,char__theis
+2.209,char__heis
+2.151,word__enviroleague
+2.117,word__god
+2.076,word__rushdie
+2.072,word__islamic
+2.031,word__mathew
+2.001,word__wingate

Weight?,Feature
+3.778,word__3d
+3.451,word__graphics
+3.317,word__pov
+3.199,word__tiff
+3.142,word__3do
+3.059,char__ 3d
+3.030,word__cview
+2.847,word__image
+2.778,word__polygon
+2.681,word__mpeg

Weight?,Feature
+5.637,word__windows
+4.321,char__ win
+3.091,word__cica
+2.715,word__file
+2.470,char__ndows
+2.470,char__indows
+2.470,char__windows
+2.470,char__indows
+2.470,char__ndows
+2.431,char__dows

Weight?,Feature
+3.326,word__gateway
+3.301,word__pc
+2.833,word__card
+2.739,word__ide
+2.422,word__vlb
+2.411,word__port
+2.364,word__scsi
+2.271,char__os
+2.248,word__cpu
+2.157,word__floppy

Weight?,Feature
+6.751,word__mac
+4.644,word__apple
+4.092,char__ mac
+3.940,word__quadra
+3.934,word__duo
+3.924,char__mac
+3.606,word__powerbook
+3.521,word__centris
+3.434,word__se
+3.176,char__mac

Weight?,Feature
+5.061,char__ x
+3.994,char__e x
+3.955,word__motif
+3.304,word__window
+3.054,word__server
+2.947,char__ xt
+2.910,word__x11r5
+2.733,char__n x
+2.628,char__t x
+2.615,word__xterm

Weight?,Feature
+4.111,word__sale
+3.602,word__for sale
+2.986,word__forsale
+2.726,char__sale
+2.660,word__wanted
+2.474,word__sell
+2.400,char__sale
+2.369,word__shipping
+2.312,word__offer
+2.231,word__subject wanted

Weight?,Feature
+9.145,word__car
+5.108,word__cars
+4.201,char__ car
+4.047,char__car
+3.085,word__toyota
+2.857,word__integra
+2.809,word__ford
+2.562,char__ auto
+2.557,word__oil
+2.551,char__auto

Weight?,Feature
+5.637,word__dod
+5.507,word__bike
+4.420,word__dod number
+4.160,char__bik
+4.082,char__bike
+4.035,char__ bik
+4.033,char__ bike
+3.721,word__bmw
+3.378,word__bikes
+2.824,char__ rid

Weight?,Feature
+3.203,word__phillies
+3.052,word__baseball
+3.043,word__cubs
+2.507,word__stadium
+2.440,word__sox
+2.406,word__tigers
+2.373,char__ball
+2.347,char__bal
+2.185,word__uniforms
+2.162,word__team

Weight?,Feature
+4.423,word__hockey
+3.472,word__nhl
+3.167,char__ play
+3.002,word__team
+2.818,char__play
+2.720,char__lay
+2.678,word__game
+2.590,word__espn
+2.573,word__playoff
+2.489,word__ca

Weight?,Feature
+3.967,word__key
+3.433,word__clipper
+3.024,char__cryp
+3.020,char__crypt
+2.960,char__rypt
+2.935,char__ryp
+2.848,char__cry
+2.832,char__ypt
+2.417,word__pgp
+2.156,word__nsa

Weight?,Feature
+2.840,word__tv
+2.491,word__circuit
+2.461,word__electronics
+2.439,word__radar
+2.419,word__scope
+2.384,word__ee
+2.327,word__power
+2.266,word__voltage
+2.228,word__motorola
+2.134,word__pcb

Weight?,Feature
+3.741,word__msg
+3.099,word__doctor
+3.046,word__disease
+2.737,word__cancer
+2.300,word__dyer
+2.247,word__treatment
+2.206,word__photography
+2.047,word__pain
+1.954,word__eye
+1.952,word__medical

Weight?,Feature
+5.903,word__space
+3.348,char__space
+3.290,char__pace
+3.289,char__spac
+3.257,word__orbit
+3.235,char__ space
+3.179,char__ spac
+2.943,word__dc
+2.884,word__moon
+2.775,char__space

Weight?,Feature
+3.648,word__god
+3.292,word__clh
+2.568,word__church
+2.412,word__rutgers edu
+2.272,word__hell
+2.232,word__number athos
+2.201,word__easter
+2.195,word__athos rutgers
+2.195,word__athos
+2.192,word__petch

Weight?,Feature
+4.696,word__gun
+4.611,char__gun
+3.833,char__ gun
+3.656,word__guns
+3.090,word__waco
+2.694,char__gun
+2.427,word__atf
+2.378,char__guns
+2.369,word__fbi
+2.216,char__ gun

Weight?,Feature
+3.003,word__israel
+2.781,word__israeli
+2.431,char__ isr
+2.399,char__ isra
+2.397,char__ israe
+2.397,char__ israel
+2.395,char__srae
+2.395,char__israe
+2.395,char__srael
+2.395,char__israel

Weight?,Feature
+2.913,word__kaldis
+2.495,word__clinton
+2.489,word__tax
+2.346,word__drieux
+2.316,word__top ten
+2.288,word__drugs
+2.202,word__gay
+2.185,word__optilink
+2.114,word__cramer
+1.817,word__wetware

Weight?,Feature
+3.091,word__beast
+2.726,word__koresh
+2.466,word__morality
+2.287,word__hudson
+2.279,word__god
+2.231,word__the beast
+2.218,word__tyre
+2.208,word__christian
+2.164,word__sandvik
+2.090,word__thyagi


In [None]:
# Explain how the pipeline_3 estimator (step 1) scores cleaned test document
# 1500, using the pipeline's first step as the vectorizer. `top=100` caps the
# number of features eli5 lists.
eli5.show_prediction(
    pipeline_3[1],
    cleaned_test["data"][1500],
    vec=pipeline_3[0],
    feature_names=pipeline_3[0].get_feature_names_out(),
    target_names=cleaned_test.target_names,
    top=100,
)

Contribution?,Feature
… 429 more positive …,… 429 more positive …
… 334 more negative …,… 334 more negative …
-0.101,char: Highlighted in text (sum)
-0.426,word: Highlighted in text (sum)
-0.858,<BIAS>

Contribution?,Feature
+0.476,<BIAS>
… 273 more positive …,… 273 more positive …
… 490 more negative …,… 490 more negative …
-0.106,word: Highlighted in text (sum)
-0.240,char: Highlighted in text (sum)

Contribution?,Feature
+0.345,<BIAS>
… 261 more positive …,… 261 more positive …
… 502 more negative …,… 502 more negative …
-0.160,char: Highlighted in text (sum)
-0.354,word: Highlighted in text (sum)

Contribution?,Feature
+0.277,<BIAS>
… 275 more positive …,… 275 more positive …
… 488 more negative …,… 488 more negative …
-0.500,char: Highlighted in text (sum)
-0.537,word: Highlighted in text (sum)

Contribution?,Feature
+0.324,<BIAS>
… 292 more positive …,… 292 more positive …
… 471 more negative …,… 471 more negative …
-0.259,word: Highlighted in text (sum)
-0.328,char: Highlighted in text (sum)

Contribution?,Feature
+0.478,<BIAS>
… 289 more positive …,… 289 more positive …
… 474 more negative …,… 474 more negative …
-0.033,char: Highlighted in text (sum)
-0.515,word: Highlighted in text (sum)

Contribution?,Feature
+0.721,char: Highlighted in text (sum)
+0.492,<BIAS>
+0.153,word: Highlighted in text (sum)
… 296 more positive …,… 296 more positive …
… 467 more negative …,… 467 more negative …

Contribution?,Feature
+0.319,<BIAS>
+0.195,word: Highlighted in text (sum)
+0.077,char: Highlighted in text (sum)
… 298 more positive …,… 298 more positive …
… 465 more negative …,… 465 more negative …

Contribution?,Feature
+0.136,<BIAS>
+0.042,word: Highlighted in text (sum)
… 282 more positive …,… 282 more positive …
… 481 more negative …,… 481 more negative …
-0.444,char: Highlighted in text (sum)

Contribution?,Feature
+0.421,<BIAS>
… 277 more positive …,… 277 more positive …
… 486 more negative …,… 486 more negative …
-0.185,char: Highlighted in text (sum)
-0.343,word: Highlighted in text (sum)

Contribution?,Feature
+0.144,<BIAS>
… 282 more positive …,… 282 more positive …
… 481 more negative …,… 481 more negative …
-0.100,char: Highlighted in text (sum)
-0.414,word: Highlighted in text (sum)

Contribution?,Feature
+0.095,word: Highlighted in text (sum)
… 318 more positive …,… 318 more positive …
… 445 more negative …,… 445 more negative …
-0.118,char: Highlighted in text (sum)
-0.737,<BIAS>

Contribution?,Feature
+0.355,<BIAS>
+0.346,char: Highlighted in text (sum)
+0.064,word: Highlighted in text (sum)
… 257 more positive …,… 257 more positive …
… 506 more negative …,… 506 more negative …

Contribution?,Feature
… 358 more positive …,… 358 more positive …
… 405 more negative …,… 405 more negative …
-0.050,<BIAS>
-0.253,char: Highlighted in text (sum)
-0.288,word: Highlighted in text (sum)

Contribution?,Feature
+0.237,word: Highlighted in text (sum)
+0.138,char: Highlighted in text (sum)
… 341 more positive …,… 341 more positive …
… 422 more negative …,… 422 more negative …
-0.229,<BIAS>

Contribution?,Feature
+0.634,char: Highlighted in text (sum)
+0.102,<BIAS>
… 352 more positive …,… 352 more positive …
… 411 more negative …,… 411 more negative …
-0.231,word: Highlighted in text (sum)

Contribution?,Feature
… 308 more positive …,… 308 more positive …
… 455 more negative …,… 455 more negative …
-0.333,<BIAS>
-0.340,word: Highlighted in text (sum)
-0.732,char: Highlighted in text (sum)

Contribution?,Feature
+0.307,char: Highlighted in text (sum)
+0.132,word: Highlighted in text (sum)
… 401 more positive …,… 401 more positive …
… 362 more negative …,… 362 more negative …
-0.394,<BIAS>

Contribution?,Feature
+3.994,word: Highlighted in text (sum)
+2.365,char: Highlighted in text (sum)
… 498 more positive …,… 498 more positive …
… 265 more negative …,… 265 more negative …
-0.681,<BIAS>

Contribution?,Feature
+2.160,char: Highlighted in text (sum)
+0.537,word: Highlighted in text (sum)
… 359 more positive …,… 359 more positive …
… 404 more negative …,… 404 more negative …
-0.589,<BIAS>


In [None]:
# pipeline_4: word-level (1-2 gram) and character-level (3-7 gram) TF-IDF
# features are concatenated by a FeatureUnion and passed to a
# logistic-regression classifier. Both vectorizers are capped at 40000
# features with max_df = 0.95 and min_df = 1.
pipeline_4 = Pipeline(steps=[
    ("features", FeatureUnion(transformer_list=[
        ("word", TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            max_features=40000,
            max_df=0.95,
            min_df=1,
        )),
        ("char", TfidfVectorizer(
            analyzer="char",
            ngram_range=(3, 7),
            max_features=40000,
            max_df=0.95,
            min_df=1,
        )),
    ])),
    ("clf", LogisticRegression(C=10, max_iter=1000)),
])


In [None]:
# Fit the feature union and the classifier on the `train` texts and labels.
pipeline_4.fit(X=train["data"], y=train["target"])

In [None]:
# Settings recorded for this run: max_features = 40000, max_df = 0.95, min_df = 1
# Predict on the test texts; the cell's output is the resulting accuracy.
predictions_9 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_9)

0.8276686139139671

In [None]:
# Explain pipeline_4's prediction for test document 1500 ("clf" is the
# estimator, "features" the vectorizer union).
# NOTE(review): the same call is repeated in the next cell; one of the two
# cells can probably be dropped.
eli5.show_prediction(
    pipeline_4.named_steps["clf"],
    test["data"][1500],
    vec=pipeline_4.named_steps["features"],
    feature_names=pipeline_4.named_steps["features"].get_feature_names_out(),
    target_names=test.target_names,
    top=100,
)

In [None]:
# Explain pipeline_4's prediction for test document 1500: the "clf" step is
# the estimator being explained and the "features" step supplies both the
# vectorizer and the feature names. `top=100` caps the listed features.
eli5.show_prediction(
    pipeline_4.named_steps["clf"],
    test["data"][1500],
    vec=pipeline_4.named_steps["features"],
    feature_names=pipeline_4.named_steps["features"].get_feature_names_out(),
    target_names=test.target_names,
    top=100,
)

Contribution?,Feature
+0.078,word: Highlighted in text (sum)
… 833 more positive …,… 833 more positive …
… 668 more negative …,… 668 more negative …
-0.101,char: Highlighted in text (sum)
-1.160,<BIAS>

Contribution?,Feature
+0.688,<BIAS>
… 497 more positive …,… 497 more positive …
… 1004 more negative …,… 1004 more negative …
-0.180,word: Highlighted in text (sum)
-0.260,char: Highlighted in text (sum)

Contribution?,Feature
+0.558,<BIAS>
… 572 more positive …,… 572 more positive …
… 929 more negative …,… 929 more negative …
-0.275,char: Highlighted in text (sum)
-0.537,word: Highlighted in text (sum)

Contribution?,Feature
+0.417,<BIAS>
… 567 more positive …,… 567 more positive …
… 934 more negative …,… 934 more negative …
-0.524,char: Highlighted in text (sum)
-0.644,word: Highlighted in text (sum)

Contribution?,Feature
+0.346,<BIAS>
… 618 more positive …,… 618 more positive …
… 883 more negative …,… 883 more negative …
-0.202,char: Highlighted in text (sum)
-0.597,word: Highlighted in text (sum)

Contribution?,Feature
+0.800,<BIAS>
… 540 more positive …,… 540 more positive …
… 961 more negative …,… 961 more negative …
-0.040,char: Highlighted in text (sum)
-0.797,word: Highlighted in text (sum)

Contribution?,Feature
+0.735,<BIAS>
+0.390,char: Highlighted in text (sum)
… 614 more positive …,… 614 more positive …
… 887 more negative …,… 887 more negative …
-0.038,word: Highlighted in text (sum)

Contribution?,Feature
+0.407,word: Highlighted in text (sum)
+0.144,char: Highlighted in text (sum)
+0.127,<BIAS>
… 698 more positive …,… 698 more positive …
… 803 more negative …,… 803 more negative …

Contribution?,Feature
+0.129,word: Highlighted in text (sum)
… 658 more positive …,… 658 more positive …
… 843 more negative …,… 843 more negative …
-0.037,<BIAS>
-0.151,char: Highlighted in text (sum)

Contribution?,Feature
+0.506,<BIAS>
+0.063,char: Highlighted in text (sum)
… 642 more positive …,… 642 more positive …
… 859 more negative …,… 859 more negative …
-0.269,word: Highlighted in text (sum)

Contribution?,Feature
+0.437,<BIAS>
+0.127,char: Highlighted in text (sum)
… 604 more positive …,… 604 more positive …
… 897 more negative …,… 897 more negative …
-0.393,word: Highlighted in text (sum)

Contribution?,Feature
… 688 more positive …,… 688 more positive …
… 813 more negative …,… 813 more negative …
-0.020,word: Highlighted in text (sum)
-0.125,char: Highlighted in text (sum)
-0.872,<BIAS>

Contribution?,Feature
+0.398,<BIAS>
+0.111,char: Highlighted in text (sum)
… 557 more positive …,… 557 more positive …
… 944 more negative …,… 944 more negative …
-0.424,word: Highlighted in text (sum)

Contribution?,Feature
… 659 more positive …,… 659 more positive …
… 842 more negative …,… 842 more negative …
-0.180,<BIAS>
-0.186,char: Highlighted in text (sum)
-0.205,word: Highlighted in text (sum)

Contribution?,Feature
+0.518,word: Highlighted in text (sum)
+0.317,char: Highlighted in text (sum)
… 729 more positive …,… 729 more positive …
… 772 more negative …,… 772 more negative …
-0.313,<BIAS>

Contribution?,Feature
+0.611,char: Highlighted in text (sum)
+0.234,<BIAS>
+0.089,word: Highlighted in text (sum)
… 639 more positive …,… 639 more positive …
… 862 more negative …,… 862 more negative …

Contribution?,Feature
… 693 more positive …,… 693 more positive …
… 808 more negative …,… 808 more negative …
-0.433,word: Highlighted in text (sum)
-0.497,<BIAS>
-0.591,char: Highlighted in text (sum)

Contribution?,Feature
+0.646,char: Highlighted in text (sum)
+0.274,word: Highlighted in text (sum)
… 782 more positive …,… 782 more positive …
… 719 more negative …,… 719 more negative …
-0.421,<BIAS>

Contribution?,Feature
+4.357,word: Highlighted in text (sum)
+1.862,char: Highlighted in text (sum)
… 1032 more positive …,… 1032 more positive …
… 469 more negative …,… 469 more negative …
-0.987,<BIAS>

Contribution?,Feature
+1.879,char: Highlighted in text (sum)
+0.754,word: Highlighted in text (sum)
… 764 more positive …,… 764 more positive …
… 737 more negative …,… 737 more negative …
-0.783,<BIAS>


In [None]:
# Settings recorded for this run: max_features = 40000, max_df = 0.85, min_df = 1
# NOTE(review): the pipeline_4 definition cell in this notebook uses
# max_df = 0.95, so pipeline_4 was presumably edited and refit before this
# cell ran — confirm; the result is not reproducible from the cells as saved.
predictions_8 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_8)

0.8287307488050982

In [None]:
# Show the classifier's learned weights, labelled with the feature names
# produced by the "features" union and the newsgroup names from `test`.
eli5.explain_weights(
    pipeline_4.named_steps["clf"],
    feature_names=pipeline_4.named_steps["features"].get_feature_names_out(),
    target_names=test.target_names,
)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+3.262,word__atheism,,,,,,,,,,,,,,,,,,
+3.209,word__keith,,,,,,,,,,,,,,,,,,
+3.044,word__enviroleague,,,,,,,,,,,,,,,,,,
+2.762,word__god,,,,,,,,,,,,,,,,,,
+2.553,word__atheists,,,,,,,,,,,,,,,,,,
+2.399,word__islamic,,,,,,,,,,,,,,,,,,
+2.395,word__cobb,,,,,,,,,,,,,,,,,,
+2.268,word__mathew,,,,,,,,,,,,,,,,,,
+2.142,word__rushdie,,,,,,,,,,,,,,,,,,
+2.113,word__bible,,,,,,,,,,,,,,,,,,

Weight?,Feature
+3.262,word__atheism
+3.209,word__keith
+3.044,word__enviroleague
+2.762,word__god
+2.553,word__atheists
+2.399,word__islamic
+2.395,word__cobb
+2.268,word__mathew
+2.142,word__rushdie
+2.113,word__bible

Weight?,Feature
+5.950,word__graphics
+4.749,word__3d
+4.089,word__pov
+3.914,word__image
+3.569,word__tiff
+3.513,word__3do
+3.332,word__cview
+3.270,word__animation
+3.218,word__polygon
+3.060,word__images

Weight?,Feature
+9.866,word__windows
+3.995,word__file
+3.863,word__cica
+3.363,char__ win
+3.210,word__files
+3.005,word__win
+2.918,char__win
+2.870,word__nt
+2.829,word__win3
+2.826,word__for windows

Weight?,Feature
+4.412,word__pc
+3.682,word__scsi
+3.681,word__ide
+3.364,word__card
+3.303,word__drive
+3.274,word__486
+3.243,word__gateway
+3.098,word__disk
+2.966,word__bus
+2.852,word__controller

Weight?,Feature
+9.196,word__mac
+6.439,word__apple
+4.495,word__duo
+4.471,word__quadra
+4.302,word__powerbook
+3.872,word__lc
+3.739,word__se
+3.718,word__centris
+3.427,char__mac
+3.346,char__ mac

Weight?,Feature
+5.879,word__motif
+5.126,word__window
+4.026,word__server
+3.865,word__x11r5
+3.813,word__widget
+3.785,word__xterm
+3.345,char__ x
+2.812,word__sun
+2.812,word__x11
+2.701,char__: x

Weight?,Feature
+6.239,word__sale
+5.615,word__for sale
+3.819,word__forsale
+3.748,word__for
+3.218,word__00
+3.155,word__sell
+3.138,word__offer
+2.952,word__wanted
+2.827,word__shipping
+2.494,word__asking

Weight?,Feature
+11.117,word__car
+5.949,word__cars
+3.442,word__toyota
+3.137,word__ford
+2.997,word__engine
+2.955,word__integra
+2.939,word__the car
+2.676,word__oil
+2.546,char__ car
+2.472,char__car

Weight?,Feature
+9.591,word__dod
+7.879,word__bike
+4.823,word__bmw
+4.228,word__bikes
+3.596,word__ride
+3.564,char__bik
+3.486,char__bike
+3.379,char__ bik
+3.379,char__ bike
+3.213,word__riding

Weight?,Feature
+5.335,word__baseball
+3.626,word__he
+3.491,word__phillies
+3.265,word__cubs
+2.843,word__stadium
+2.833,word__team
+2.797,word__sox
+2.638,word__runs
+2.604,word__tigers
+2.360,word__year

Weight?,Feature
+6.743,word__hockey
+4.661,word__nhl
+4.301,word__team
+3.575,word__game
+3.053,word__ca
+2.968,word__season
+2.718,word__espn
+2.696,word__go
+2.664,word__playoff
+2.596,word__cup

Weight?,Feature
+5.058,word__key
+5.047,word__clipper
+3.290,word__pgp
+3.225,word__chip
+2.916,char__cryp
+2.912,char__crypt
+2.910,word__encryption
+2.870,char__rypt
+2.838,char__ryp
+2.759,char__ypt

Weight?,Feature
+3.780,word__electronics
+3.631,word__circuit
+3.115,word__voltage
+3.026,word__audio
+2.994,word__radar
+2.908,word__tv
+2.766,word__motorola
+2.753,word__power
+2.708,word__scope
+2.706,word__ee

Weight?,Feature
+4.443,word__msg
+4.269,word__doctor
+3.051,word__pitt
+2.962,word__cancer
+2.774,word__treatment
+2.767,word__pain
+2.717,word__medical
+2.693,word__disease
+2.597,word__pitt edu
+2.585,word__gordon banks

Weight?,Feature
+8.751,word__space
+3.937,word__orbit
+3.664,word__dc
+3.539,word__moon
+3.182,char__space
+3.133,char__spac
+3.132,char__pace
+2.707,word__nasa
+2.705,word__launch
+2.685,char__spa

Weight?,Feature
+4.871,word__god
+3.903,word__church
+3.184,word__clh
+3.018,word__rutgers edu
+2.788,word__christ
+2.705,word__article apr
+2.648,word__athos
+2.648,word__athos rutgers
+2.545,word__jesus
+2.530,word__rutgers

Weight?,Feature
+6.730,word__gun
+5.144,word__guns
+4.293,char__gun
+3.492,word__waco
+3.095,word__fbi
+3.060,word__atf
+2.842,word__firearms
+2.818,word__batf
+2.749,char__ gun
+2.681,word__nra

Weight?,Feature
+4.182,word__israel
+4.101,word__israeli
+2.651,word__fourd com
+2.651,word__fourd
+2.623,word__jews
+2.372,char__israe
+2.372,char__srae
+2.372,char__israel
+2.372,char__srael
+2.370,char__isra

Weight?,Feature
+3.550,word__clinton
+3.195,word__tax
+2.938,word__kaldis
+2.704,word__gay
+2.569,word__top ten
+2.545,word__drugs
+2.504,word__deane
+2.493,word__cramer
+2.408,word__jobs
+2.395,word__sexual

Weight?,Feature
+3.261,word__beast
+2.892,word__koresh
+2.760,word__christian
+2.715,word__god
+2.693,word__hare
+2.598,word__the beast
+2.581,word__666
+2.404,word__thyagi
+2.385,word__tyre
+2.373,word__morality


In [None]:
# Settings recorded for this run: max_df = 0.25 and 0.25, max_features = 40000, 30000
# NOTE(review): presumably the values for the word and char vectorizers, in
# that order — confirm. They differ from the pipeline_4 definition cell
# (max_df = 0.95, max_features = 40000 for both), so pipeline_4 was presumably
# edited and refit before this cell ran.
predictions_7 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_7)

0.8276686139139671

In [None]:
# Explain pipeline_4's prediction for test document 1500 with whatever
# configuration pipeline_4 currently holds; `top=100` caps the listed features.
eli5.show_prediction(
    pipeline_4.named_steps["clf"],
    test["data"][1500],
    vec=pipeline_4.named_steps["features"],
    feature_names=pipeline_4.named_steps["features"].get_feature_names_out(),
    target_names=test.target_names,
    top=100,
)

Contribution?,Feature
+0.064,char: Highlighted in text (sum)
… 486 more positive …,… 486 more positive …
… 396 more negative …,… 396 more negative …
-0.313,word: Highlighted in text (sum)
-0.867,<BIAS>

Contribution?,Feature
+0.528,<BIAS>
… 282 more positive …,… 282 more positive …
… 600 more negative …,… 600 more negative …
-0.058,word: Highlighted in text (sum)
-0.442,char: Highlighted in text (sum)

Contribution?,Feature
+0.366,<BIAS>
… 338 more positive …,… 338 more positive …
… 544 more negative …,… 544 more negative …
-0.383,char: Highlighted in text (sum)
-0.436,word: Highlighted in text (sum)

Contribution?,Feature
+0.283,<BIAS>
… 344 more positive …,… 344 more positive …
… 538 more negative …,… 538 more negative …
-0.472,word: Highlighted in text (sum)
-0.733,char: Highlighted in text (sum)

Contribution?,Feature
+0.345,<BIAS>
… 330 more positive …,… 330 more positive …
… 552 more negative …,… 552 more negative …
-0.345,char: Highlighted in text (sum)
-0.450,word: Highlighted in text (sum)

Contribution?,Feature
+0.522,<BIAS>
+0.136,char: Highlighted in text (sum)
… 335 more positive …,… 335 more positive …
… 547 more negative …,… 547 more negative …
-0.482,word: Highlighted in text (sum)

Contribution?,Feature
+0.909,char: Highlighted in text (sum)
+0.476,<BIAS>
+0.314,word: Highlighted in text (sum)
… 331 more positive …,… 331 more positive …
… 551 more negative …,… 551 more negative …

Contribution?,Feature
+0.392,word: Highlighted in text (sum)
+0.271,<BIAS>
+0.139,char: Highlighted in text (sum)
… 379 more positive …,… 379 more positive …
… 503 more negative …,… 503 more negative …

Contribution?,Feature
+0.176,<BIAS>
+0.021,word: Highlighted in text (sum)
… 357 more positive …,… 357 more positive …
… 525 more negative …,… 525 more negative …
-0.268,char: Highlighted in text (sum)

Contribution?,Feature
+0.422,<BIAS>
… 336 more positive …,… 336 more positive …
… 546 more negative …,… 546 more negative …
-0.178,char: Highlighted in text (sum)
-0.198,word: Highlighted in text (sum)

Contribution?,Feature
+0.152,<BIAS>
+0.064,char: Highlighted in text (sum)
… 333 more positive …,… 333 more positive …
… 549 more negative …,… 549 more negative …
-0.128,word: Highlighted in text (sum)

Contribution?,Feature
… 353 more positive …,… 353 more positive …
… 529 more negative …,… 529 more negative …
-0.054,word: Highlighted in text (sum)
-0.215,char: Highlighted in text (sum)
-0.755,<BIAS>

Contribution?,Feature
+0.357,<BIAS>
+0.056,char: Highlighted in text (sum)
… 313 more positive …,… 313 more positive …
… 569 more negative …,… 569 more negative …
-0.382,word: Highlighted in text (sum)

Contribution?,Feature
… 402 more positive …,… 402 more positive …
… 480 more negative …,… 480 more negative …
-0.099,<BIAS>
-0.377,word: Highlighted in text (sum)
-0.386,char: Highlighted in text (sum)

Contribution?,Feature
+0.539,word: Highlighted in text (sum)
+0.349,char: Highlighted in text (sum)
… 418 more positive …,… 418 more positive …
… 464 more negative …,… 464 more negative …
-0.218,<BIAS>

Contribution?,Feature
+0.762,char: Highlighted in text (sum)
+0.097,<BIAS>
… 400 more positive …,… 400 more positive …
… 482 more negative …,… 482 more negative …
-0.076,word: Highlighted in text (sum)

Contribution?,Feature
… 379 more positive …,… 379 more positive …
… 503 more negative …,… 503 more negative …
-0.352,<BIAS>
-0.507,word: Highlighted in text (sum)
-0.826,char: Highlighted in text (sum)

Contribution?,Feature
+0.532,char: Highlighted in text (sum)
+0.213,word: Highlighted in text (sum)
… 425 more positive …,… 425 more positive …
… 457 more negative …,… 457 more negative …
-0.383,<BIAS>

Contribution?,Feature
+4.152,word: Highlighted in text (sum)
+2.376,char: Highlighted in text (sum)
… 590 more positive …,… 590 more positive …
… 292 more negative …,… 292 more negative …
-0.733,<BIAS>

Contribution?,Feature
+2.240,char: Highlighted in text (sum)
+0.489,word: Highlighted in text (sum)
… 411 more positive …,… 411 more positive …
… 471 more negative …,… 471 more negative …
-0.590,<BIAS>


In [None]:
# Settings recorded for this run:
#   max_features = 40000, max_df = 0.85, min_df = 2
#   max_features = 30000, max_df = 0.85, min_df = 2
# NOTE(review): presumably the word vectorizer first and the char vectorizer
# second — confirm. These differ from the pipeline_4 definition cell, so
# pipeline_4 was presumably edited and refit before this cell ran.
predictions_6 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_6)

0.8252788104089219

In [None]:
# Per-class explanation of pipeline_4's prediction for test document 1500,
# using the pipeline's own "features" step to vectorize and name features.
eli5.show_prediction(
    pipeline_4.named_steps["clf"],
    test["data"][1500],
    vec=pipeline_4.named_steps["features"],
    feature_names=pipeline_4.named_steps["features"].get_feature_names_out(),
    target_names=test.target_names,
    top=100,
)

Contribution?,Feature
+0.062,word: Highlighted in text (sum)
… 753 more positive …,… 753 more positive …
… 606 more negative …,… 606 more negative …
-0.109,char: Highlighted in text (sum)
-1.180,<BIAS>

Contribution?,Feature
+0.713,<BIAS>
… 463 more positive …,… 463 more positive …
… 896 more negative …,… 896 more negative …
-0.166,word: Highlighted in text (sum)
-0.402,char: Highlighted in text (sum)

Contribution?,Feature
+0.598,<BIAS>
… 524 more positive …,… 524 more positive …
… 835 more negative …,… 835 more negative …
-0.337,char: Highlighted in text (sum)
-0.534,word: Highlighted in text (sum)

Contribution?,Feature
+0.470,<BIAS>
… 530 more positive …,… 530 more positive …
… 829 more negative …,… 829 more negative …
-0.627,word: Highlighted in text (sum)
-0.639,char: Highlighted in text (sum)

Contribution?,Feature
+0.357,<BIAS>
… 572 more positive …,… 572 more positive …
… 787 more negative …,… 787 more negative …
-0.271,char: Highlighted in text (sum)
-0.584,word: Highlighted in text (sum)

Contribution?,Feature
+0.842,<BIAS>
+0.088,char: Highlighted in text (sum)
… 502 more positive …,… 502 more positive …
… 857 more negative …,… 857 more negative …
-0.729,word: Highlighted in text (sum)

Contribution?,Feature
+0.733,<BIAS>
+0.533,char: Highlighted in text (sum)
+0.039,word: Highlighted in text (sum)
… 562 more positive …,… 562 more positive …
… 797 more negative …,… 797 more negative …

Contribution?,Feature
+0.446,word: Highlighted in text (sum)
+0.143,<BIAS>
+0.105,char: Highlighted in text (sum)
… 639 more positive …,… 639 more positive …
… 720 more negative …,… 720 more negative …

Contribution?,Feature
+0.127,word: Highlighted in text (sum)
… 632 more positive …,… 632 more positive …
… 727 more negative …,… 727 more negative …
-0.032,<BIAS>
-0.211,char: Highlighted in text (sum)

Contribution?,Feature
+0.525,<BIAS>
… 598 more positive …,… 598 more positive …
… 761 more negative …,… 761 more negative …
-0.133,char: Highlighted in text (sum)
-0.295,word: Highlighted in text (sum)

Contribution?,Feature
+0.434,<BIAS>
+0.079,char: Highlighted in text (sum)
… 546 more positive …,… 546 more positive …
… 813 more negative …,… 813 more negative …
-0.382,word: Highlighted in text (sum)

Contribution?,Feature
… 647 more positive …,… 647 more positive …
… 712 more negative …,… 712 more negative …
-0.026,word: Highlighted in text (sum)
-0.181,char: Highlighted in text (sum)
-0.904,<BIAS>

Contribution?,Feature
+0.385,<BIAS>
+0.057,char: Highlighted in text (sum)
… 522 more positive …,… 522 more positive …
… 837 more negative …,… 837 more negative …
-0.332,word: Highlighted in text (sum)

Contribution?,Feature
… 609 more positive …,… 609 more positive …
… 750 more negative …,… 750 more negative …
-0.168,word: Highlighted in text (sum)
-0.238,<BIAS>
-0.311,char: Highlighted in text (sum)

Contribution?,Feature
+0.524,word: Highlighted in text (sum)
+0.295,char: Highlighted in text (sum)
… 705 more positive …,… 705 more positive …
… 654 more negative …,… 654 more negative …
-0.338,<BIAS>

Contribution?,Feature
+0.651,char: Highlighted in text (sum)
+0.281,<BIAS>
+0.069,word: Highlighted in text (sum)
… 566 more positive …,… 566 more positive …
… 793 more negative …,… 793 more negative …

Contribution?,Feature
… 656 more positive …,… 656 more positive …
… 703 more negative …,… 703 more negative …
-0.463,word: Highlighted in text (sum)
-0.503,<BIAS>
-0.766,char: Highlighted in text (sum)

Contribution?,Feature
+0.671,char: Highlighted in text (sum)
+0.311,word: Highlighted in text (sum)
… 703 more positive …,… 703 more positive …
… 656 more negative …,… 656 more negative …
-0.446,<BIAS>

Contribution?,Feature
+4.441,word: Highlighted in text (sum)
+2.141,char: Highlighted in text (sum)
… 926 more positive …,… 926 more positive …
… 433 more negative …,… 433 more negative …
-1.058,<BIAS>

Contribution?,Feature
+1.965,char: Highlighted in text (sum)
+0.751,word: Highlighted in text (sum)
… 687 more positive …,… 687 more positive …
… 672 more negative …,… 672 more negative …
-0.783,<BIAS>


In [None]:
# Accuracy of pipeline_4 on the 20 newsgroups test split for the run the
# author labelled "max_df = 0.8".
# NOTE(review): presumably the vectorizer setting in effect when pipeline_4
# was fitted — confirm against the cell that builds the pipeline.
predictions_5 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_5)

0.8243494423791822

In [None]:
# Accuracy of pipeline_4 on the 20 newsgroups test split for the run the
# author labelled with two parameter sets:
#   max_features = 15000, max_df = 0.6,  min_df = 2
#   max_features = 20000, max_df = 0.65, min_df = 2
# NOTE(review): presumably one set per vectorizer inside pipeline_4 — confirm
# against the cell that builds and fits the pipeline.
predictions_5 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_5)

0.8113382899628253

In [None]:
# Accuracy of pipeline_4 on the 20 newsgroups test split for the run the
# author labelled with two parameter sets:
#   max_features = 30000, max_df = 0.6,  min_df = 2
#   max_features = 30000, max_df = 0.65, min_df = 2
# NOTE(review): presumably one set per vectorizer inside pipeline_4 — confirm
# against the cell that builds and fits the pipeline.
predictions_5 = pipeline_4.predict(test["data"])
accuracy_score(y_true=test["target"], y_pred=predictions_5)

0.8244822092405736

In [None]:
# Render an eli5 explanation for test document 1500: the second pipeline step
# is passed as the estimator and the first step as the vectorizer, whose
# feature names label the contributions. Up to 100 features are requested and
# the newsgroup names are used as target labels.
eli5.show_prediction(
    pipeline_4[1],
    test["data"][1500],
    vec=pipeline_4[0],
    top=100,
    feature_names=pipeline_4[0].get_feature_names_out(),
    target_names=test.target_names,
)

Contribution?,Feature
… 699 more positive …,… 699 more positive …
… 565 more negative …,… 565 more negative …
-0.078,char: Highlighted in text (sum)
-0.155,word: Highlighted in text (sum)
-1.032,<BIAS>

Contribution?,Feature
+0.664,<BIAS>
… 431 more positive …,… 431 more positive …
… 833 more negative …,… 833 more negative …
-0.139,word: Highlighted in text (sum)
-0.436,char: Highlighted in text (sum)

Contribution?,Feature
+0.579,<BIAS>
… 490 more positive …,… 490 more positive …
… 774 more negative …,… 774 more negative …
-0.359,char: Highlighted in text (sum)
-0.591,word: Highlighted in text (sum)

Contribution?,Feature
+0.386,<BIAS>
… 499 more positive …,… 499 more positive …
… 765 more negative …,… 765 more negative …
-0.661,word: Highlighted in text (sum)
-0.678,char: Highlighted in text (sum)

Contribution?,Feature
+0.333,<BIAS>
… 516 more positive …,… 516 more positive …
… 748 more negative …,… 748 more negative …
-0.285,char: Highlighted in text (sum)
-0.657,word: Highlighted in text (sum)

Contribution?,Feature
+0.719,<BIAS>
+0.097,char: Highlighted in text (sum)
… 473 more positive …,… 473 more positive …
… 791 more negative …,… 791 more negative …
-0.655,word: Highlighted in text (sum)

Contribution?,Feature
+0.803,char: Highlighted in text (sum)
+0.641,<BIAS>
+0.237,word: Highlighted in text (sum)
… 515 more positive …,… 515 more positive …
… 749 more negative …,… 749 more negative …

Contribution?,Feature
+0.555,word: Highlighted in text (sum)
+0.146,<BIAS>
+0.098,char: Highlighted in text (sum)
… 586 more positive …,… 586 more positive …
… 678 more negative …,… 678 more negative …

Contribution?,Feature
+0.210,word: Highlighted in text (sum)
… 582 more positive …,… 582 more positive …
… 682 more negative …,… 682 more negative …
-0.051,<BIAS>
-0.206,char: Highlighted in text (sum)

Contribution?,Feature
+0.552,<BIAS>
… 554 more positive …,… 554 more positive …
… 710 more negative …,… 710 more negative …
-0.137,char: Highlighted in text (sum)
-0.309,word: Highlighted in text (sum)

Contribution?,Feature
+0.342,<BIAS>
+0.054,char: Highlighted in text (sum)
… 505 more positive …,… 505 more positive …
… 759 more negative …,… 759 more negative …
-0.306,word: Highlighted in text (sum)

Contribution?,Feature
… 581 more positive …,… 581 more positive …
… 683 more negative …,… 683 more negative …
-0.094,word: Highlighted in text (sum)
-0.201,char: Highlighted in text (sum)
-0.895,<BIAS>

Contribution?,Feature
+0.373,<BIAS>
+0.054,char: Highlighted in text (sum)
… 473 more positive …,… 473 more positive …
… 791 more negative …,… 791 more negative …
-0.361,word: Highlighted in text (sum)

Contribution?,Feature
… 575 more positive …,… 575 more positive …
… 689 more negative …,… 689 more negative …
-0.183,<BIAS>
-0.279,char: Highlighted in text (sum)
-0.300,word: Highlighted in text (sum)

Contribution?,Feature
+0.520,word: Highlighted in text (sum)
+0.307,char: Highlighted in text (sum)
… 635 more positive …,… 635 more positive …
… 629 more negative …,… 629 more negative …
-0.274,<BIAS>

Contribution?,Feature
+0.606,char: Highlighted in text (sum)
+0.295,<BIAS>
… 533 more positive …,… 533 more positive …
… 731 more negative …,… 731 more negative …
-0.130,word: Highlighted in text (sum)

Contribution?,Feature
… 594 more positive …,… 594 more positive …
… 670 more negative …,… 670 more negative …
-0.449,<BIAS>
-0.553,word: Highlighted in text (sum)
-0.741,char: Highlighted in text (sum)

Contribution?,Feature
+0.607,char: Highlighted in text (sum)
+0.321,word: Highlighted in text (sum)
… 656 more positive …,… 656 more positive …
… 608 more negative …,… 608 more negative …
-0.431,<BIAS>

Contribution?,Feature
+4.588,word: Highlighted in text (sum)
+2.112,char: Highlighted in text (sum)
… 859 more positive …,… 859 more positive …
… 405 more negative …,… 405 more negative …
-0.975,<BIAS>

Contribution?,Feature
+1.919,char: Highlighted in text (sum)
+0.652,word: Highlighted in text (sum)
… 641 more positive …,… 641 more positive …
… 623 more negative …,… 623 more negative …
-0.739,<BIAS>
