# Мультиклассовая классификация текстов постов Пикабу по тегам

по мотивам статьи https://proglib.io/p/multi-class-classification/

In [1]:
import pandas as pd

# Names of the input CSV files. Only `filename_hot` is read by the cells
# visible in this notebook; the other two are not referenced here.
filename = "pikabu_dataset_good.csv"
filename_2000 = "pikabu_clean_dataset_2000.csv"
filename_hot = "hot_dataset_processed.csv"

In [2]:
# Load the "hot" dataset and preview the first rows that have both a text and
# a tag string.
df = pd.read_csv(filename_hot)
df.loc[:, ['Text', 'Tags']].dropna().head()

Unnamed: 0,Text,Tags
0,делать брат капитальный ремонт квартира матерь...,мой сосед электричество текст воровство
1,пустой екатеринбургасползать радужность лоск п...,стишкипирожка свердловск екатеринбург текст
2,школа хорошист трояк бывать отличник вообще кр...,мой дифтерия учёба текст видео
3,пикаба,собака лестница ветеринарный воротник видео
7,недавно пост карантин отец свой дочь ввести же...,мой жетон деньга карантин ребёнок текст соврем...


In [3]:
# Working frame: only the text and tag columns, without rows where either
# value is missing.
df_t = df.loc[:, ['Text', 'Tags']].dropna()

In [4]:
import collections

# Count how often each tag occurs across all posts.
# Only the `Tags` column is scanned: flattening the whole frame would also
# count every word of the post text, and those words would then leak into the
# tag vocabulary that is derived from the keys of `counts`.
counts = collections.Counter(
    tag for tag_string in df_t['Tags'] for tag in tag_string.split()
)

In [5]:
# Distinct keys of `counts`; can be used later to filter out unwanted tags.
tags = [*counts]

# Accumulator for the [tag, text] records of the single-label dataset.
new_df = list()

In [6]:
# Explode every post into one [tag, text] record per tag, so that the
# multi-label posts become single-label rows.
# `split()` without an argument is used on purpose: `split(' ')` would emit
# empty-string tags for repeated, leading or trailing spaces. Iterating the
# two columns directly also avoids the per-row Series that `iterrows()` builds.
new_df.extend(
    [tag, text]
    for tag_string, text in zip(df_t['Tags'], df_t['Text'])
    for tag in tag_string.split()
)

In [7]:
# Turn the accumulated [tag, text] records into a two-column frame and
# preview its first rows.
labels = ['Category', 'Text']
df_category = pd.DataFrame(new_df, columns=labels)
df_category.head()

Unnamed: 0,Category,Text
0,мой,делать брат капитальный ремонт квартира матерь...
1,сосед,делать брат капитальный ремонт квартира матерь...
2,электричество,делать брат капитальный ремонт квартира матерь...
3,текст,делать брат капитальный ремонт квартира матерь...
4,воровство,делать брат капитальный ремонт квартира матерь...


In [1]:
# Names of the exploded (one tag per row) CSV files. Only `hot_categorised`
# is written and read by the cells visible in this notebook.
good_categorised = "good_categorised.csv"
categorised_2000 = "categorised_2000.csv"
hot_categorised = "hot_categorised.csv"

In [10]:
# Persist the exploded dataset so that it can be loaded from Spark.
# `index=False` keeps the pandas row index out of the file; otherwise it is
# written as an unnamed first column that every reader has to discard.
# The encoding is spelled out because the data is Cyrillic text.
df_category.to_csv(hot_categorised, index=False, encoding="utf-8")

## Подключаем Spark

In [2]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
# Reuse the running SparkContext when this cell is executed again: a second
# plain `SparkContext()` call in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
import warnings

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

In [4]:
# Load the exploded CSV through Spark's CSV data source; the first line is
# the header and column types are inferred from the data.
csv_reader = sqlContext.read.format('com.databricks.spark.csv')
csv_reader = csv_reader.option('header', 'true').option('inferschema', 'true')
data = csv_reader.load(hot_categorised)

In [5]:
# Drop rows containing nulls, then keep only the columns used for modelling
# (in the order they appear in the file) and preview ten rows.
need_columns = ['Text', 'Category']
data = data.dropna()
kept_columns = [name for name in data.columns if name in need_columns]
data = data.select(kept_columns)
data.show(n=10)

+-------------+--------------------+
|     Category|                Text|
+-------------+--------------------+
|          мой|делать брат капит...|
|        сосед|делать брат капит...|
|электричество|делать брат капит...|
|        текст|делать брат капит...|
|    воровство|делать брат капит...|
|стишкипирожка|пустой екатеринбу...|
|   свердловск|пустой екатеринбу...|
| екатеринбург|пустой екатеринбу...|
|        текст|пустой екатеринбу...|
|          мой|школа хорошист тр...|
+-------------+--------------------+
only showing top 10 rows



In [15]:
# Print the column names and types of the loaded Spark DataFrame.
data.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Text: string (nullable = true)



In [16]:
from pyspark.sql.functions import col

# Ten most frequent values of `Category`, highest count first.
(
    data
    .groupBy("Category")
    .count()
    .orderBy(col("count").desc())
    .show(n=10)
)

+------------+-----+
|    Category|count|
+------------+-----+
|         мой|  530|
|  длиннопост|  341|
|       текст|  283|
|       видео|  243|
| коронавирус|  220|
|    карантин|  111|
|        юмор|   95|
|     история|   77|
|самоизоляция|   65|
|     негатив|   60|
+------------+-----+
only showing top 10 rows



In [17]:
# Ten most repeated values of `Text`, highest count first. Each text is
# repeated once per tag it carries, and identical short texts add up.
rows_per_text = data.groupBy("Text").count()
rows_per_text.orderBy(col("count").desc()).show(n=10)

+--------------------+-----+
|                Text|count|
+--------------------+-----+
|              пикаба|  422|
|               автор|   59|
|             кина вк|   31|
|            источник|   30|
|анапа два день си...|   28|
|          бп молчать|   26|
|              группа|   24|
|пост подать идея ...|   24|
|один серия плакат...|   22|
|4 апрель прекрасн...|   22|
+--------------------+-----+
only showing top 10 rows



## Предобработка текстов

In [6]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# Tokenisation: the split pattern is a single space.
regexTokenizer = RegexTokenizer(pattern=" ", inputCol="Text", outputCol="words")

# Stop words: the remover is configured with exactly this list of tokens.
add_stopwords = ["пикаба","один","два","а","и","р"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=add_stopwords)

# Bag of words: term counts over the filtered tokens, with minDF=10 and a
# vocabulary cap of 1000000 terms.
countVectors = CountVectorizer(minDF=10, vocabSize=1000000, inputCol="filtered", outputCol="features")

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Map the string category to a numeric `label` column.
label_stringIdx = StringIndexer(outputCol="label", inputCol="Category")

# Tokenise -> remove stop words -> count-vectorise -> index labels.
# The pipeline is fitted on the complete dataset (before any train/test
# split) and then applied to that same dataset.
pipeline = Pipeline().setStages(
    [regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]
)
pipelineFit = pipeline.fit(data)

dataset = pipelineFit.transform(data)
dataset.show(n=10)

+-------------+--------------------+--------------------+--------------------+--------------------+------+
|     Category|                Text|               words|            filtered|            features| label|
+-------------+--------------------+--------------------+--------------------+--------------------+------+
|          мой|делать брат капит...|[делать, брат, ка...|[делать, брат, ка...|(12941,[6,24,29,5...|   0.0|
|        сосед|делать брат капит...|[делать, брат, ка...|[делать, брат, ка...|(12941,[6,24,29,5...|  97.0|
|электричество|делать брат капит...|[делать, брат, ка...|[делать, брат, ка...|(12941,[6,24,29,5...|  99.0|
|        текст|делать брат капит...|[делать, брат, ка...|[делать, брат, ка...|(12941,[6,24,29,5...|   2.0|
|    воровство|делать брат капит...|[делать, брат, ка...|[делать, брат, ка...|(12941,[6,24,29,5...| 104.0|
|стишкипирожка|пустой екатеринбу...|[пустой, екатерин...|[пустой, екатерин...|(12941,[1099,1425...|1492.0|
|   свердловск|пустой екатеринбу...|[

In [8]:
# 70/30 train/test split with a fixed seed, followed by the size of each part.
trainingData, testData = dataset.randomSplit(weights=[0.7, 0.3], seed=100)

for title, subset in (("Training Dataset Count: ", trainingData),
                      ("Test Dataset Count: ", testData)):
    print(title + str(subset.count()))

Training Dataset Count: 4692
Test Dataset Count: 2014


In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Логистическая регрессия из коробки Spark (не работает)

In [None]:
# Logistic regression on the count-vector features: 20 iterations at most,
# regularisation strength 0.3, elastic-net mixing parameter 0.
lr = LogisticRegression(regParam=0.3, elasticNetParam=0, maxIter=20)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

In [None]:
# Inspect the test rows that were assigned label 0: up to ten of them,
# ordered by the `probability` column in descending order, with cell
# contents truncated to 30 characters.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("Text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

### Через TF-IDF (тоже не работает из-за логистической регрессии)

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# TF-IDF features: hash the filtered tokens into 10000 buckets, then rescale
# the raw term frequencies with IDF.
hashingTF = HashingTF(numFeatures=10000, inputCol="filtered", outputCol="rawFeatures")
# minDocFreq: remove sparse terms
idf = IDF(minDocFreq=5, inputCol="rawFeatures", outputCol="features")
pipeline = Pipeline().setStages(
    [regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx]
)

In [None]:
# Fit the current `pipeline` on the full data and apply it to the same data,
# replacing the previously computed `dataset`.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

In [None]:
# Re-split the freshly transformed dataset (70/30, fixed seed) and train the
# same logistic regression configuration on the new features.
trainingData, testData = dataset.randomSplit(weights=[0.7, 0.3], seed=100)
lr = LogisticRegression(regParam=0.3, elasticNetParam=0, maxIter=20)
lrModel = lr.fit(trainingData)

In [None]:
# Score the test split with the fitted logistic regression model.
predictions = lrModel.transform(testData)

# Inspect the test rows that were assigned label 0: up to ten of them,
# ordered by the `probability` column in descending order, with cell
# contents truncated to 30 characters.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("Text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

In [None]:
# Print the schema of the transformed dataset (the input columns plus the
# columns added by the pipeline stages).
dataset.printSchema()

## Наивный Байес

In [None]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier with a smoothing parameter of 1, trained on the
# current training split.
nb = NaiveBayes(smoothing=1.0)
model = nb.fit(trainingData)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61172)
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-8c23ec555506>", line 4, in <module>
    model = nb.fit(trainingData)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\base.py", line 132, in fit
    return self._fit(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\pyspark\ml\wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\jav

Exception ignored in: <function JavaObject.__init__.<locals>.<lambda> at 0x000002223EABDE58>
Traceback (most recent call last):
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py", line 1293, in <lambda>
    _garbage_collect_object and _garbage_collect_object(cc, id))
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py", line 625, in _garbage_collect_object
    gateway_client.garbage_collect_object(target_id)
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py", line 920, in garbage_collect_object
    "\ne\n")
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py", line 983, in send_command
    connection = self._get_connection()
  File "c:\users\138904\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py", line 931, in _get_connection
    connection = self._create_

In [None]:
# Apply the fitted Naive Bayes model to the held-out test split.
predictions = model.transform(testData)

In [None]:
# Inspect the test rows that were assigned label 0: up to ten of them,
# ordered by the `probability` column in descending order, with cell
# contents truncated to 30 characters.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("Text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

In [None]:
# Score the predictions with the multiclass evaluator. No metric is named
# here, so the evaluator's default is used (f1 per the Spark documentation --
# confirm for the installed version).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

## Случайный лес (не работает, out of memory)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest configuration: 100 trees, depth limited to 4, 32 bins.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

In [None]:
# Train the random forest on the training split and score the test split.
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)

In [None]:
# Inspect the test rows that were assigned label 0: up to ten of them,
# ordered by the `probability` column in descending order, with cell
# contents truncated to 30 characters.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("Text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

In [None]:
# Score the random forest predictions with the multiclass evaluator. No
# metric is named here, so the evaluator's default is used (f1 per the Spark
# documentation -- confirm for the installed version).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# Выводы: на моём компьютере PySpark и Java не особо дружат