In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                  random_state=156)

x_train = train_news.data
y_train = train_news.target

test_news = fetch_20newsgroups(subset='test',  remove=('headers', 'footers', 'quotes'),
                             random_state=156)

x_test = test_news.data
y_test = test_news.target


In [4]:
# Count Vectorization으로 피처 벡터화 변환 수행.
cnt_vect = CountVectorizer()
cnt_vect.fit(x_train)

x_train_cnt_vect = cnt_vect.transform(x_train)
x_test_cnt_vect = cnt_vect.transform(x_test)

In [5]:
rfc = RandomForestClassifier(random_state=156)

In [6]:
rfc.fit(x_train_cnt_vect, y_train)
rfc_pred = rfc.predict(x_test_cnt_vect)

In [7]:
print(accuracy_score(y_test, rfc_pred))

0.5920074349442379


In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtree = DecisionTreeClassifier(random_state=156)

dtree.fit(x_train_cnt_vect, y_train)
dt_pred = dtree.predict(x_test_cnt_vect)

In [9]:
print(accuracy_score(y_test, dt_pred))

0.41728624535315983


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state=156)
gb_clf.fit(x_train_cnt_vect, y_train)
gb_pred = gb_clf.predict(x_test_cnt_vect)

In [13]:
print(accuracy_score(y_test, gb_pred))

0.6026287838555496


In [10]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', 
                                   ngram_range=(1,2), max_df=300)),
    ('lr_clf', LogisticRegression(C=10))
])

In [11]:
pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)

print(accuracy_score(y_test, pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7010090281465746


In [14]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(x_train)
x_train_tfidf_vect = tfidf_vect.transform(x_train)
x_test_tfidf_vect = tfidf_vect.transform(x_test)

In [15]:
dtree = DecisionTreeClassifier(random_state=156)

dtree.fit(x_train_tfidf_vect, y_train)
dt_pred = dtree.predict(x_test_tfidf_vect)

In [16]:
print(accuracy_score(y_test, dt_pred))

0.4024163568773234


In [17]:
rfc = RandomForestClassifier(random_state=156)

rfc.fit(x_train_tfidf_vect, y_train)
rfc_pred = rfc.predict(x_test_tfidf_vect)

In [18]:
print(accuracy_score(y_test, rfc_pred))

0.5864312267657993


In [19]:
gb_clf = GradientBoostingClassifier(random_state=156)
gb_clf.fit(x_train_tfidf_vect, y_train)
gb_pred = gb_clf.predict(x_test_tfidf_vect)

In [20]:
print(accuracy_score(y_test, gb_pred))

0.5951938396176314


In [23]:
char = {'CountVectorizer':['0.592', '0.417', '0.603'],
       'TfidfVectorizer':['0.586', '0.402', '0.595']}

In [24]:
df_char = pd.DataFrame(char, columns=['CountVectorizer', 'TfidfVectorizer'],
                      index=['RandomForestClassifier', 'DecisionTreeClassifier', 
                             'GradientBoostingClassifier'])
df_char

Unnamed: 0,CountVectorizer,TfidfVectorizer
RandomForestClassifier,0.592,0.586
DecisionTreeClassifier,0.417,0.402
GradientBoostingClassifier,0.603,0.595
