In [1]:
import warnings
import nltk
from sklearn.datasets import fetch_20newsgroups
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
categories = ['comp.os.ms-windows.misc', 'sci.med', 'talk.politics.mideast']
remove = ('headers', 'footers', 'quotes')

twenty_train_full = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42, remove=remove)
twenty_test_full = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42, remove=remove)

In [3]:
twenty_train_full.data[0]

"(Amir Y Rosenblatt) writes\n   > Sam Zbib Writes\n   >>No one in his right mind would sell his freedom and dignity.\n   >>Palestinians are no exception. Perhaps you heard about\n   >>anti-trust in the business world.\n   >>\n   >>Since we are debating the legality of a commercial\n   >>transaction, we must use the laws governing the guidelines\n   >>and ethics of such transactions. Basic ANTI-TRUST law says\n   >>that, while you can purchase IBM stocks for the purpose of\n   >>investing, you can not acquire a large number of those\n   >>shares with the intent or controlling IBM. You can do so\n   >>only if you make your intentions CLEAR apriori . Clearly,\n   >>the Jews who purchased properties from palastenians had some\n   >>designs, they were not buying a dwelling or a real estate.\n   >They were establishing a bridgehead for the European Jews.\n   >>\n   >>The palastenians sold their properties to the Jews in the\n   >>old tradition of arab hospitality. Being a multi-ethnic /\n   

In [4]:
twenty_test_full.data[0]

"\n\n\n\n\n\tMillipedes, I understand, are vegetarian, and therefore almost\ncertainly will not bite and are not poisonous. Centipedes are\ncarnivorous, and although I don't have any absolute knowledge on this, I\nwould tend to think that you're in no danger from anything but a\nconcerted assault by several million of them."

In [5]:
import nltk
from nltk import word_tokenize
from nltk.stem import *

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
def stemming(data):
    porter_stemmer = PorterStemmer()
    stem = []
    for text in data:
        nltk_tokens = word_tokenize(text)
        line = ''.join([' ' + porter_stemmer.stem(word) for word in nltk_tokens])
        stem.append(line)
    return stem

In [7]:
stem_train = stemming(twenty_train_full.data)
stem_test = stemming(twenty_test_full.data)

In [8]:
stem_train[0]

" ( amir y rosenblatt ) write > sam zbib write > > no one in hi right mind would sell hi freedom and digniti . > > palestinian are no except . perhap you heard about > > anti-trust in the busi world . > > > > sinc we are debat the legal of a commerci > > transact , we must use the law govern the guidelin > > and ethic of such transact . basic anti-trust law say > > that , while you can purchas ibm stock for the purpos of > > invest , you can not acquir a larg number of those > > share with the intent or control ibm . you can do so > > onli if you make your intent clear apriori . clearli , > > the jew who purchas properti from palastenian had some > > design , they were not buy a dwell or a real estat . > they were establish a bridgehead for the european jew . > > > > the palastenian sold their properti to the jew in the > > old tradit of arab hospit . be a multi-ethn / > > multi-religi societi , accept the jew as neighbour > > wa no differ , just anoth religion . plu they paid fair > >

In [9]:
stem_test[0]

" milliped , i understand , are vegetarian , and therefor almost certainli will not bite and are not poison . centiped are carnivor , and although i do n't have ani absolut knowledg on thi , i would tend to think that you 're in no danger from anyth but a concert assault by sever million of them ."

In [10]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vect_without_stop = CountVectorizer(max_features=10000)

In [12]:
train_data = vect_without_stop.fit_transform(twenty_train_full.data)
test_data = vect_without_stop.transform(twenty_test_full.data)

In [13]:
def sort_by_tf(input_str):
    return input_str[1]

def top_terms(vector, data, count):
    x = list(zip(vector.get_feature_names_out(), np.ravel(data.sum(axis=0))))
    x.sort(key=sort_by_tf, reverse=True)
    return x[:count]

In [14]:
top_terms_without_stop = [{term[0]: term[1]} for term in top_terms(vect_without_stop, train_data, 20)]
top_terms_without_stop

top_terms_without_stop_test = [{term[0]: term[1]} for term in top_terms(vect_without_stop, test_data, 20)]
top_terms_without_stop_test

[{'the': 14539},
 {'of': 7431},
 {'to': 6719},
 {'and': 6387},
 {'in': 4807},
 {'that': 3273},
 {'is': 3208},
 {'it': 2537},
 {'for': 2385},
 {'you': 1870},
 {'on': 1825},
 {'this': 1659},
 {'with': 1630},
 {'as': 1550},
 {'was': 1516},
 {'have': 1515},
 {'are': 1505},
 {'not': 1498},
 {'be': 1363},
 {'by': 1356}]

In [15]:
vect_stop = CountVectorizer(max_features=10000, stop_words='english')

In [16]:
train_data_stop = vect_stop.fit_transform(twenty_train_full.data)
test_data_stop = vect_stop.transform(twenty_test_full.data)

In [17]:
top_terms_stop = [{term[0]: term[1]} for term in top_terms(vect_stop, train_data_stop, 20)]
top_terms_stop

top_terms_stop_test = [{term[0]: term[1]} for term in top_terms(vect_stop, test_data_stop, 20)]
top_terms_stop_test

[{'people': 585},
 {'windows': 528},
 {'like': 471},
 {'just': 410},
 {'don': 405},
 {'know': 396},
 {'armenian': 392},
 {'said': 354},
 {'time': 338},
 {'armenians': 314},
 {'use': 313},
 {'does': 307},
 {'israel': 304},
 {'new': 302},
 {'ve': 275},
 {'think': 267},
 {'jews': 256},
 {'did': 255},
 {'file': 251},
 {'dos': 248}]

In [18]:
vect_stem_without_stop = CountVectorizer(max_features=10000)

In [19]:
train_data_without_stop_stem = vect_stem_without_stop.fit_transform(stem_train)
test_data_without_stop_stem = vect_stem_without_stop.transform(stem_test)

In [20]:
top_terms_stem = [{term[0]: term[1]} for term in top_terms(vect_stem_without_stop, train_data_without_stop_stem, 20)]
top_terms_stem

top_terms_stem_test = [{term[0]: term[1]} for term in top_terms(vect_stem_without_stop, test_data_without_stop_stem, 20)]
top_terms_stem_test

[{'the': 14539},
 {'of': 7431},
 {'to': 6719},
 {'and': 6387},
 {'in': 4808},
 {'that': 3281},
 {'is': 3260},
 {'it': 2850},
 {'for': 2385},
 {'you': 1869},
 {'on': 1826},
 {'thi': 1659},
 {'have': 1641},
 {'with': 1630},
 {'be': 1560},
 {'not': 1556},
 {'wa': 1550},
 {'as': 1546},
 {'are': 1537},
 {'do': 1355}]

In [21]:
vect_stem = CountVectorizer(max_features=10000, stop_words='english')

In [22]:
train_data_stop_stem = vect_stem.fit_transform(stem_train)
test_data_stop_stem = vect_stem.transform(stem_test)

In [23]:
top_terms_stop_stem = [{term[0]: term[1]} for term in top_terms(vect_stem, train_data_stop_stem, 20)]
top_terms_stop_stem

top_terms_stop_stem_test = [{term[0]: term[1]} for term in top_terms(vect_stem, test_data_stop_stem, 20)]
top_terms_stop_stem_test

[{'thi': 1659},
 {'wa': 1550},
 {'use': 740},
 {'ha': 734},
 {'armenian': 706},
 {'peopl': 606},
 {'window': 561},
 {'ani': 530},
 {'like': 506},
 {'know': 442},
 {'hi': 436},
 {'time': 432},
 {'just': 410},
 {'file': 391},
 {'doe': 388},
 {'muslim': 376},
 {'did': 375},
 {'onli': 357},
 {'year': 355},
 {'said': 354}]

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
tf = TfidfTransformer(use_idf=False)
tfidf = TfidfTransformer(use_idf=True)

In [26]:
train_data_tf = tf.fit_transform(train_data)
test_data_tf = tf.transform(test_data)

train_data_tfidf = tfidf.fit_transform(train_data)
test_data_tfidf = tfidf.transform(test_data)

In [27]:
top_terms_tf = [{term[0]: term[1]} for term in top_terms(vect_without_stop, train_data_tf, 20)]
top_terms_tf

top_terms_tf_test = [{term[0]: term[1]} for term in top_terms(vect_without_stop, test_data_tf, 20)]
top_terms_tf_test

top_terms_tfidf = [{term[0]: term[1]} for term in top_terms(vect_without_stop, train_data_tfidf, 20)]
top_terms_tfidf

top_terms_tfidf_test = [{term[0]: term[1]} for term in top_terms(vect_without_stop, test_data_tfidf, 20)]
top_terms_tfidf_test

[{'the': 157.81600652977625},
 {'to': 95.76882725313655},
 {'of': 88.3511825497112},
 {'and': 76.27572679535747},
 {'in': 62.28808170697198},
 {'is': 59.99523519793167},
 {'that': 58.0275724174942},
 {'it': 51.405158372353476},
 {'you': 48.90835882309294},
 {'for': 45.02428709408522},
 {'this': 36.53119998360545},
 {'have': 34.9709106757255},
 {'on': 33.861034197852725},
 {'are': 33.446024082297626},
 {'not': 33.381554648365984},
 {'as': 31.470663554161135},
 {'be': 29.998314554593797},
 {'or': 28.536178873638235},
 {'with': 28.50341688678757},
 {'was': 26.767637478284687}]

In [28]:
tf = TfidfTransformer(use_idf=False)
tfidf = TfidfTransformer(use_idf=True)

In [29]:
train_data_stop_tf = tf.fit_transform(train_data_stop)
test_data_stop_tf = tf.transform(test_data_stop)

train_data_stop_tfidf = tfidf.fit_transform(train_data_stop)
test_data_stop_tfidf = tfidf.transform(test_data_stop)

In [30]:
top_terms_stop_tf = [{term[0]: term[1]} for term in top_terms(vect_stop, train_data_stop_tf, 20)]
top_terms_stop_tf

top_terms_stop_tf_test = [{term[0]: term[1]} for term in top_terms(vect_stop, test_data_stop_tf, 20)]
top_terms_stop_tf_test

top_terms_stop_tfidf = [{term[0]: term[1]} for term in top_terms(vect_stop, train_data_stop_tfidf, 20)]
top_terms_stop_tfidf

top_terms_stop_tfidf_test = [{term[0]: term[1]} for term in top_terms(vect_stop, test_data_stop_tfidf, 20)]
top_terms_stop_tfidf_test

[{'windows': 29.89251649051573},
 {'don': 20.52290818684906},
 {'people': 20.21588163348893},
 {'like': 19.83785627913987},
 {'know': 19.16691728099501},
 {'just': 18.379546721380088},
 {'does': 17.749860903200577},
 {'dos': 17.002378078926135},
 {'think': 15.645049018787713},
 {'os': 14.225986842543938},
 {'thanks': 13.702781562424326},
 {'israel': 13.572715158145664},
 {'use': 13.491902739171277},
 {'good': 13.427650870560498},
 {'ve': 13.407031746857905},
 {'way': 12.808019975593862},
 {'file': 12.503178767171308},
 {'time': 12.095394512813783},
 {'did': 12.014543207096631},
 {'jews': 11.689308836949936}]

In [31]:
tf = TfidfTransformer(use_idf=False)
tfidf = TfidfTransformer(use_idf=True)

In [32]:
train_data_stem_tf = tf.fit_transform(train_data_without_stop_stem)
test_data_stem_tf = tf.transform(test_data_without_stop_stem)

train_data_stem_tfidf = tfidf.fit_transform(train_data_without_stop_stem)
test_data_stem_tfidf = tfidf.transform(test_data_without_stop_stem)

In [33]:
top_terms_stem_tf = [{term[0]: term[1]} for term in top_terms(vect_stem_without_stop, train_data_stem_tf, 20)]
top_terms_stem_tf

top_terms_stem_tf_test = [{term[0]: term[1]} for term in top_terms(vect_stem_without_stop, test_data_stem_tf, 20)]
top_terms_stem_tf_test

top_terms_stem_tfidf = [{term[0]: term[1]} for term in top_terms(vect_stem_without_stop, train_data_stem_tfidf, 20)]
top_terms_stem_tfidf

top_terms_stem_tfidf_test = [{term[0]: term[1]} for term in top_terms(vect_stem_without_stop, test_data_stem_tfidf, 20)]
top_terms_stem_tfidf_test

[{'the': 155.17937196886095},
 {'to': 94.62983334647986},
 {'of': 86.6786435883915},
 {'and': 75.05980035123291},
 {'in': 61.25732116324329},
 {'is': 60.47443793184465},
 {'that': 57.56681854087221},
 {'it': 53.887881597239605},
 {'you': 48.53260429506063},
 {'for': 44.42326699464681},
 {'do': 39.206613260817264},
 {'have': 36.45202385352554},
 {'thi': 36.140426758521855},
 {'not': 33.85327299216085},
 {'are': 33.68305411470661},
 {'on': 33.305383486442175},
 {'be': 32.42981012200503},
 {'as': 31.091460789605534},
 {'or': 28.326119993701457},
 {'with': 27.992601514584756}]

In [34]:
tf = TfidfTransformer(use_idf=False)
tfidf = TfidfTransformer(use_idf=True)

In [35]:
train_data_stem_stop_tf = tf.fit_transform(train_data_stop_stem)
test_data_stem_stop_tf = tf.transform(test_data_stop_stem)

train_data_stem_stop_tfidf = tfidf.fit_transform(train_data_stop_stem)
test_data_stem_stop_tfidf = tfidf.transform(test_data_stop_stem)

In [36]:
top_terms_stem_stop_tf = [{term[0]: term[1]} for term in top_terms(vect_stem, train_data_stop_tf, 20)]
top_terms_stem_stop_tf

top_terms_stem_stop_tf_test = [{term[0]: term[1]} for term in top_terms(vect_stem, test_data_stop_tf, 20)]
top_terms_stem_stop_tf_test

top_terms_stem_stop_tfidf = [{term[0]: term[1]} for term in top_terms(vect_stem, train_data_stop_tf, 20)]
top_terms_stem_stop_tfidf

top_terms_stem_stop_tfidf_test = [{term[0]: term[1]} for term in top_terms(vect_stem, test_data_stop_tf, 20)]
top_terms_stem_stop_tfidf_test

[{'wmx': 49.928307001405756},
 {'leagu': 39.428873407818834},
 {'dens': 37.81290632375341},
 {'pe': 37.776238966326034},
 {'k80': 36.1173101578041},
 {'iy': 35.86977122072824},
 {'delusion': 30.415585871642556},
 {'transcript': 26.82596604853647},
 {'van': 22.760988077071403},
 {'troop': 20.852232534795434},
 {'depress': 20.600786271752604},
 {'fungu': 20.533030553887926},
 {'widespread': 20.393145581655908},
 {'tr': 20.12425798328614},
 {'virtu': 20.033861107196966},
 {'ink': 19.01196128715913},
 {'d1': 18.222145928587427},
 {'promin': 17.76147664514586},
 {'dissid': 16.82916991056713},
 {'explos': 16.788450275905518}]

In [37]:
import pandas as pd

In [38]:
columns = pd.MultiIndex.from_product([['Count', 'TF', 'TF-IDF'], ['Без стоп-слов', 'С стоп-словами']])

In [39]:
df1 = pd.DataFrame(columns=columns)

df1['Count', 'Без стоп-слов'] = top_terms_without_stop
df1['TF', 'Без стоп-слов'] = top_terms_tf
df1['TF-IDF', 'Без стоп-слов'] = top_terms_tfidf

df1['Count', 'С стоп-словами'] = top_terms_stop
df1['TF', 'С стоп-словами'] = top_terms_stop_tf
df1['TF-IDF', 'С стоп-словами'] = top_terms_stop_tfidf

df1

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,{'ax': 62375},{'ax': 62375},{'the': 572.5279808540205},{'windows': 66.67395477054924},{'the': 229.06126555168856},{'windows': 38.703929569272574}
1,{'the': 19976},{'max': 4475},{'to': 316.6842067333636},{'know': 55.58408988539479},{'to': 131.43055242905746},{'know': 29.26659456172411}
2,{'of': 10085},{'g9v': 1166},{'of': 276.5629209395146},{'just': 52.966420982934565},{'of': 125.15513289020626},{'people': 27.37639366571801}
3,{'to': 9555},{'b8f': 1111},{'and': 235.55936019732573},{'like': 52.62319604161889},{'and': 104.57880639093214},{'just': 27.126745276279078}
4,{'and': 8963},{'people': 1037},{'is': 207.93746375638847},{'people': 51.95889281516506},{'is': 92.18661169101847},{'don': 26.59188993585221}
5,{'in': 6603},{'a86': 916},{'in': 188.74633407506516},{'don': 49.042693087083144},{'in': 89.40058982207171},{'like': 26.07689275309705}
6,{'that': 5088},{'pl': 826},{'it': 186.8837907643837},{'does': 40.575102798866936},{'it': 87.57693261450338},{'israel': 25.409328160691086}
7,{'is': 4875},{'145': 758},{'that': 165.24581293629535},{'think': 39.41225512825948},{'that': 81.55691582302615},{'edu': 24.064161463125053}
8,{'max': 4475},{'don': 690},{'you': 133.22294711173362},{'edu': 38.150401375813914},{'you': 76.2185447021307},{'does': 23.835573473709914}
9,{'it': 4369},{'know': 684},{'for': 115.24346051918967},{'use': 37.00856397329775},{'for': 59.885117686309485},{'think': 22.53184825960246}


In [40]:
df2 = pd.DataFrame(columns=columns)

df2['Count', 'Без стоп-слов'] = top_terms_without_stop_test
df2['TF', 'Без стоп-слов'] = top_terms_tf_test
df2['TF-IDF', 'Без стоп-слов'] = top_terms_tfidf_test

df2['Count', 'С стоп-словами'] = top_terms_stop_test
df2['TF', 'С стоп-словами'] = top_terms_stop_tf_test
df2['TF-IDF', 'С стоп-словами'] = top_terms_stop_tfidf_test

df2

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,{'the': 14539},{'people': 585},{'the': 383.7572889224506},{'windows': 49.928307001405756},{'the': 157.81600652977625},{'windows': 29.89251649051573}
1,{'of': 7431},{'windows': 528},{'to': 226.24130548502015},{'like': 39.428873407818834},{'to': 95.76882725313655},{'don': 20.52290818684906}
2,{'to': 6719},{'like': 471},{'of': 191.5853597991806},{'don': 37.81290632375341},{'of': 88.3511825497112},{'people': 20.21588163348893}
3,{'and': 6387},{'just': 410},{'and': 169.1241503600053},{'people': 37.776238966326034},{'and': 76.27572679535747},{'like': 19.83785627913987}
4,{'in': 4807},{'don': 405},{'is': 133.4494756369226},{'know': 36.1173101578041},{'in': 62.28808170697198},{'know': 19.16691728099501}
5,{'that': 3273},{'know': 396},{'in': 128.51188330965388},{'just': 35.86977122072824},{'is': 59.99523519793167},{'just': 18.379546721380088}
6,{'is': 3208},{'armenian': 392},{'that': 115.54338090374101},{'does': 30.415585871642556},{'that': 58.0275724174942},{'does': 17.749860903200577}
7,{'it': 2537},{'said': 354},{'it': 108.9883845364941},{'think': 26.82596604853647},{'it': 51.405158372353476},{'dos': 17.002378078926135}
8,{'for': 2385},{'time': 338},{'for': 84.50121525814092},{'use': 22.760988077071403},{'you': 48.90835882309294},{'think': 15.645049018787713}
9,{'you': 1870},{'armenians': 314},{'you': 82.89183206317075},{'time': 20.852232534795434},{'for': 45.02428709408522},{'os': 14.225986842543938}


In [41]:
df3 = pd.DataFrame(columns=columns)

df3['Count', 'Без стоп-слов'] = top_terms_stem
df3['TF', 'Без стоп-слов'] = top_terms_stem_tf
df3['TF-IDF', 'Без стоп-слов'] = top_terms_stem_tfidf

df3['Count', 'С стоп-словами'] = top_terms_stop_stem
df3['TF', 'С стоп-словами'] = top_terms_stem_stop_tf
df3['TF-IDF', 'С стоп-словами'] = top_terms_stem_stop_tfidf

df3

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,{'ax': 62375},{'ax': 62375},{'the': 560.1897256671423},{'wmx': 66.67395477054924},{'the': 227.1417118032733},{'wmx': 66.67395477054924}
1,{'the': 19970},{'max': 4475},{'to': 309.55632785346563},{'k80': 55.58408988539479},{'to': 130.73030554017762},{'k80': 55.58408988539479}
2,{'of': 10085},{'wa': 2486},{'of': 270.5795792227874},{'iy': 52.966420982934565},{'of': 123.98778553507317},{'iy': 52.966420982934565}
3,{'to': 9555},{'thi': 2240},{'and': 230.1672066691976},{'leagu': 52.62319604161889},{'and': 103.61322213648639},{'leagu': 52.62319604161889}
4,{'and': 8964},{'g9v': 1166},{'is': 206.83912663494127},{'pe': 51.95889281516506},{'is': 92.91087071831504},{'pe': 51.95889281516506}
5,{'in': 6603},{'armenian': 1123},{'it': 194.11515982589904},{'dens': 49.042693087083144},{'it': 91.60531113863544},{'dens': 49.042693087083144}
6,{'that': 5094},{'b8f': 1111},{'in': 184.61455318184682},{'delusion': 40.575102798866936},{'in': 88.54739363212309},{'delusion': 40.575102798866936}
7,{'is': 4924},{'use': 1099},{'that': 162.16999219258358},{'transcript': 39.41225512825948},{'that': 81.40313291495075},{'transcript': 39.41225512825948}
8,{'it': 4751},{'peopl': 1050},{'you': 130.2003870591387},{'dissid': 38.150401375813914},{'you': 75.79667340323336},{'dissid': 38.150401375813914}
9,{'max': 4475},{'ha': 968},{'for': 112.4687466798475},{'van': 37.00856397329775},{'for': 59.512001531477},{'van': 37.00856397329775}


In [42]:
df4 = pd.DataFrame(columns=columns)

df4['Count', 'Без стоп-слов'] = top_terms_stem_test
df4['TF', 'Без стоп-слов'] = top_terms_stem_tf_test
df4['TF-IDF', 'Без стоп-слов'] = top_terms_stem_tfidf_test

df4['Count', 'С стоп-словами'] = top_terms_stop_stem_test
df4['TF', 'С стоп-словами'] = top_terms_stem_stop_tf_test
df4['TF-IDF', 'С стоп-словами'] = top_terms_stem_stop_tfidf_test

df4

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,{'the': 14539},{'thi': 1659},{'the': 374.24241916658707},{'wmx': 49.928307001405756},{'the': 155.17937196886095},{'wmx': 49.928307001405756}
1,{'of': 7431},{'wa': 1550},{'to': 220.49116629557315},{'leagu': 39.428873407818834},{'to': 94.62983334647986},{'leagu': 39.428873407818834}
2,{'to': 6719},{'use': 740},{'of': 186.69561702716828},{'dens': 37.81290632375341},{'of': 86.6786435883915},{'dens': 37.81290632375341}
3,{'and': 6387},{'ha': 734},{'and': 164.7050985151655},{'pe': 37.776238966326034},{'and': 75.05980035123291},{'pe': 37.776238966326034}
4,{'in': 4808},{'armenian': 706},{'is': 133.37237843660867},{'k80': 36.1173101578041},{'in': 61.25732116324329},{'k80': 36.1173101578041}
5,{'that': 3281},{'peopl': 606},{'in': 125.28657808406516},{'iy': 35.86977122072824},{'is': 60.47443793184465},{'iy': 35.86977122072824}
6,{'is': 3260},{'window': 561},{'it': 113.48811048113814},{'delusion': 30.415585871642556},{'that': 57.56681854087221},{'delusion': 30.415585871642556}
7,{'it': 2850},{'ani': 530},{'that': 113.0706694958328},{'transcript': 26.82596604853647},{'it': 53.887881597239605},{'transcript': 26.82596604853647}
8,{'for': 2385},{'like': 506},{'for': 82.20410445217483},{'van': 22.760988077071403},{'you': 48.53260429506063},{'van': 22.760988077071403}
9,{'you': 1869},{'know': 442},{'you': 80.82963860996323},{'troop': 20.852232534795434},{'for': 44.42326699464681},{'troop': 20.852232534795434}


In [43]:
import openpyxl

In [44]:
writer = pd.ExcelWriter('result.xlsx', engine='openpyxl')

df1.to_excel(writer, sheet_name='Train, wo stem')
df2.to_excel(writer, sheet_name='Test, wo stem')
df3.to_excel(writer, sheet_name='Train, with stem')
df4.to_excel(writer, sheet_name='Test, with stem')

writer.close()

In [45]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [46]:
stop_words = [None, 'english']
max_features_values = [100, 500, 1000, 2000, 3000, 4000, 5000]
use_tf = [True, False]
use_idf = [True, False]

In [47]:
def prepare(data, max_feature, stop_word, use_tf, use_idf):
    tf = None
    cv = CountVectorizer(max_features=max_feature, stop_words=stop_word)
    cv.fit(data)
    if use_tf:
        tf = TfidfTransformer(use_idf=use_idf)
        tf.fit(cv.transform(data))
    return cv, tf

In [48]:
result = []

for max_features_value in max_features_values:
    for stop_word in stop_words:
        for ut in use_tf:
            for ui in use_idf:
                options = {}
                cv, tf = prepare(twenty_train_full.data, max_features_value, stop_word, ut, ui)
                if tf:
                    clf = MultinomialNB()
                    clf.fit(tf.transform(cv.transform(twenty_train_full.data)), twenty_train_full.target)
                    prep_test = tf.transform(cv.transform(twenty_test_full.data))
                else:
                    clf = MultinomialNB()
                    clf.fit(cv.transform(twenty_train_full.data), twenty_train_full.target)
                    prep_test = cv.transform(twenty_test_full.data)

                options['features'] = max_features_value
                options['stop_words'] = stop_word
                options['use_tf'] = ut
                options['use_idf'] = ui

                result_data = classification_report(clf.predict(prep_test), twenty_test_full.target, output_dict=True)
                result_df = pd.DataFrame(result_data)
                result.append({
                    'df': result_df,
                    'options': options
                })

In [49]:
writer = pd.ExcelWriter('result_compare.xlsx', engine='openpyxl')

df = pd.DataFrame(columns=['Номер страницы', 'features', 'stop_words', 'use_tf', 'use_idf'])
for it, item in enumerate(result):
    for key, value in item['options'].items():
        df.at[it, key] = value
    df.at[it, 'Номер страницы'] = it

df.to_excel(writer, sheet_name='Оглавление')

for it, item in enumerate(result):
    df_new = pd.DataFrame(item['df'])
    df_new.to_excel(writer, sheet_name=f'Страница {it}')

writer.close()

In [50]:
from sklearn.pipeline import Pipeline

parameters = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf
}

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

In [51]:
from sklearn.model_selection import GridSearchCV

gscv = GridSearchCV(text_clf, param_grid=parameters)
gscv.fit(twenty_train_full.data, twenty_train_full.target)

In [52]:
print(classification_report(gscv.predict(twenty_test_full.data), twenty_test_full.target))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       358
           1       0.93      0.86      0.89       431
           2       0.93      0.93      0.93       377

    accuracy                           0.91      1166
   macro avg       0.91      0.91      0.91      1166
weighted avg       0.91      0.91      0.91      1166


In [53]:
gscv.best_params_

{'tfidf__use_idf': True, 'vect__max_features': 5000, 'vect__stop_words': None}