In [None]:
%matplotlib inline


# Classifier comparison


A comparison of several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.



In [1]:
# NOTE(review): in this notebook __doc__ is IPython's placeholder module
# docstring, so this prints "Automatically created module for IPython
# interactive environment" rather than the description in the markdown above.
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

# Display labels, in the same order as `classifiers` below; the two lists are
# zipped together when the panels are drawn.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# random_state is pinned on the estimators whose fit involves randomness so
# that the decision boundaries and test scores are identical on every run.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# Third dataset: a two-feature classification problem with uniform noise
# added to every coordinate.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

# Each entry is an (X, y) pair; one row of panels is drawn per dataset.
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))

# The colormaps are the same for every panel, so build them once.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# Grid layout: one row per dataset; one column for the raw data plus one per
# classifier.
n_rows, n_cols = len(datasets), len(classifiers) + 1


def _draw_samples(ax, X_train, y_train, X_test, y_test):
    """Scatter the training points (opaque) and test points (alpha=0.6)."""
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
               edgecolors='k', alpha=0.6)


def _frame_axes(ax, xx, yy):
    """Clamp the axes to the mesh extent and remove the tick marks."""
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())


i = 1  # 1-based subplot index, advanced after every panel
for ds_cnt, ds in enumerate(datasets):
    # Standardise the features, then hold out 40% of the samples for testing.
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.4, random_state=42)

    # Mesh over the data range, padded by 0.5 on every side, with spacing h.
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    # First column of the row: the dataset on its own.
    ax = plt.subplot(n_rows, n_cols, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    _draw_samples(ax, X_train, y_train, X_test, y_test)
    _frame_axes(ax, xx, yy)
    i += 1

    # Remaining columns: one panel per classifier.
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(n_rows, n_cols, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Colour every mesh point by the classifier's output: the decision
        # function when the estimator has one, otherwise the probability of
        # the second class.
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(mesh_points)
        else:
            Z = clf.predict_proba(mesh_points)[:, 1]
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        _draw_samples(ax, X_train, y_train, X_test, y_test)
        _frame_axes(ax, xx, yy)

        if ds_cnt == 0:
            ax.set_title(name)
        # Test-set accuracy in the lower-right corner, leading zero stripped.
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()

Automatically created module for IPython interactive environment


<Figure size 2700x900 with 33 Axes>

In [6]:
%%time
import pandas as pd
import numpy as np
# Read the two labelled corpora from the working directory; later cells use
# the `Berita`/`Sentimen` columns of `line` and `tweet`/`sentimen` of `twitter`.
line, twitter = map(pd.read_csv,
                    ('line_label.csv', 'twitter_sampling_label.csv'))

Wall time: 7.46 s


In [7]:
%%time
seed = 17  # shared random_state for every estimator that accepts one

import numpy as np
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=seed)

# Support Vector Machine (linear kernel)
svm = SVC(gamma='auto', kernel='linear', random_state=seed)

# Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=seed)

# Extra Trees Classifier
etc = ExtraTreesClassifier(n_estimators=100, random_state=seed)

# K-Nearest Neighbour (library defaults)
knn = KNeighborsClassifier()

# Multinomial Naive Bayes
mnb = MultinomialNB()

# Gaussian Naive Bayes
gnb = GaussianNB()

# Logistic Regression
lr = LogisticRegression(solver='saga', random_state=seed)

# Neural Network (multi-layer perceptron)
nn = MLPClassifier(random_state=seed)

# AdaBoost
abc = AdaBoostClassifier(n_estimators=100, random_state=seed)

# Gradient Boosting
gbc = GradientBoostingClassifier(n_estimators=100, random_state=seed)

Wall time: 0 ns


In [8]:
%%time
# Drop rows whose text column is missing. Rebinding the result instead of
# using inplace=True keeps the step explicit and chainable; the row index is
# deliberately left as-is (not reset).
line = line.dropna(subset=['Berita'])
twitter = twitter.dropna(subset=['tweet'])

Wall time: 30.9 ms


In [9]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# `lowercase` must be a real boolean. None only behaved like False because it
# is falsy; newer scikit-learn releases validate constructor parameters and
# reject it. False keeps the original behaviour (no lower-casing).
countvect = CountVectorizer(analyzer="word", tokenizer=None, lowercase=False)
tfidf = TfidfVectorizer(analyzer="word", tokenizer=None, lowercase=False)

# LINE corpus: text column and sentiment labels.
x1 = line.Berita
y1 = line.Sentimen

# Dense count and TF-IDF matrices for the LINE corpus.
x1cv = countvect.fit_transform(x1).toarray()
x1tf = tfidf.fit_transform(x1).toarray()

# Twitter corpus: text column and sentiment labels.
x2 = twitter.tweet
y2 = twitter.sentimen

# NOTE: fit_transform re-fits the same vectorizer objects, so from here on
# `countvect` and `tfidf` hold the Twitter vocabulary, not the LINE one.
x2cv = countvect.fit_transform(x2).toarray()
x2tf = tfidf.fit_transform(x2).toarray()

Wall time: 530 ms


In [16]:
# Pair every vectorised feature matrix with the label vector of its corpus.
line_cv, line_tf = (x1cv, y1), (x1tf, y1)
twitter_cv, twitter_tf = (x2cv, y2), (x2tf, y2)

In [19]:
# Display labels for the estimators compared on the text corpora
# ("ADAboos" was a typo for "AdaBoost").
names = ["Random Forest", "Support Vector Machine", "Decision Tree", "Extra Tree",
         "K-Nearest Neighbour", "Multinomial Naive Bayes", "Gaussian Naive Bayes", "Logistic Regression",
         "Neural Network", "AdaBoost", "Gradient Boosting"]

In [20]:
# Estimator instances, listed in the same order as the labels in `names`.
classifiers = [rf, svm, dt, etc, knn, mnb, gnb, lr, nn, abc, gbc]

In [22]:
# Each entry is a (feature matrix, labels) pair: count and TF-IDF features
# for the LINE corpus, followed by the same two for the Twitter corpus.
datasets = [
    line_cv,
    line_tf,
    twitter_cv,
    twitter_tf,
]

In [24]:
# `sklearn.utils.testing` is a test-support module that imports pytest, which
# is why this cell raised ModuleNotFoundError. Prefer the public location of
# all_estimators and only fall back to the old path on releases that lack it.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

estimators = all_estimators()

# Print the name of every estimator class that exposes predict_proba.
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)

ModuleNotFoundError: No module named 'pytest'

In [25]:
!pip install pytest

Collecting pytest
  Downloading https://files.pythonhosted.org/packages/69/1d/2430053122a3c6106f7fd1ff0bc68eb73e27db8f951db70fcd942da52c7b/pytest-5.0.1-py3-none-any.whl (221kB)
Collecting atomicwrites>=1.0 (from pytest)
  Using cached https://files.pythonhosted.org/packages/52/90/6155aa926f43f2b2a22b01be7241be3bfd1ceaf7d0b3267213e8127d41f4/atomicwrites-1.3.0-py2.py3-none-any.whl
Collecting py>=1.5.0 (from pytest)
  Using cached https://files.pythonhosted.org/packages/76/bc/394ad449851729244a97857ee14d7cba61ddb268dce3db538ba2f2ba1f0f/py-1.8.0-py2.py3-none-any.whl
Collecting pluggy<1.0,>=0.12 (from pytest)
  Using cached https://files.pythonhosted.org/packages/06/ee/de89e0582276e3551df3110088bf20844de2b0e7df2748406876cc78e021/pluggy-0.12.0-py2.py3-none-any.whl
Collecting packaging (from pytest)
  Downloading https://files.pythonhosted.org/packages/ec/22/630ac83e8f8a9566c4f88038447ed9e16e6f10582767a01f31c769d9a71e/packaging-19.1-py2.py3-none-any.whl
Collecting more-itertools>=4.0.0 (from 

You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [27]:
import numpy as np
# Import the loader directly: `from sklearn import datasets` would rebind the
# name `datasets`, which an earlier cell uses for the list of text corpora.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

iris = load_iris()  # the data
clf = KNeighborsClassifier()  # the model

y = iris.target  # the target vector
n_features = iris.data.shape[1]

# Score each feature on its own with 5-fold cross-validated accuracy.
print('Feature Index , Accuracy obtained')
for i in range(n_features):
    X = iris.data[:, i].reshape(-1, 1)  # single feature as a column matrix
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print('{}   {}'.format(i, scores.mean()))

Feature Index , Accuracy obtained
0   0.6466666666666667
1   0.5533333333333333
2   0.9466666666666667
3   0.96


In [31]:
import numpy as np
# Import the loader directly: `from sklearn import datasets` would rebind the
# name `datasets`, which an earlier cell uses for the list of text corpora.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

iris = load_iris()  # the data
clf = KNeighborsClassifier()  # the model

y = iris.target  # the target vector
n_features = iris.data.shape[1]

# Score each feature on its own with 5-fold cross-validated accuracy.
print('Feature Index , Accuracy obtained')
for i in range(n_features):
    X = iris.data[:, i].reshape(-1, 1)  # single feature as a column matrix
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print('{}   {}'.format(i, scores.mean()))

(150, 4)

In [41]:
# Number of rows in the LINE count matrix; should equal the length of y1.
x1cv.shape[0]

5462

In [38]:
# Shape of the LINE label vector, for comparison with the row count of x1cv.
y1.shape

(5462,)

In [45]:
# Fit Multinomial NB on the LINE count features and keep the class
# probabilities it assigns to those same (training) rows; fit() returns the
# estimator itself, so the two calls can be chained.
tes = mnb.fit(x1cv, y1).predict_proba(x1cv)

In [77]:
# Text next to the two predict_proba columns, rounded to two decimals.
# NOTE(review): labelling column 1 "true" and column 0 "false" assumes that
# ordering of mnb.classes_ -- confirm against the label values.
count_proba_columns = {"fitur": x1, "true": tes[:, 1], "false": tes[:, 0]}
coba = pd.DataFrame(count_proba_columns).round(2)

In [78]:
# Preview the first rows of the count-feature probability table.
coba.head()

Unnamed: 0,fitur,true,false
0,mantul cui coblos 1 sah coblos 2 iya sah ayo m...,1.0,0.0
1,enak duain,0.96,0.04
2,01 nnpasti menang golput,0.54,0.46
3,1 saaaaaaahjangan nyoblos lbh satuitu saaaah,0.08,0.92
4,pakai sandi sandi segalanmantap pilih jokowi,0.88,0.12


In [76]:
# Re-fit the same `mnb` object on the LINE TF-IDF features (this replaces its
# earlier count-feature fit) and keep the probabilities for the training rows.
tes2 = mnb.fit(x1tf, y1).predict_proba(x1tf)

In [79]:
# Same layout as the count-feature table, built from the TF-IDF probabilities.
# NOTE(review): labelling column 1 "true" and column 0 "false" assumes that
# ordering of mnb.classes_ -- confirm against the label values.
tfidf_proba_columns = {"fitur": x1, "true": tes2[:, 1], "false": tes2[:, 0]}
coba2 = pd.DataFrame(tfidf_proba_columns).round(2)

In [80]:
# Preview the first rows of the TF-IDF probability table.
coba2.head()

Unnamed: 0,fitur,true,false
0,mantul cui coblos 1 sah coblos 2 iya sah ayo m...,0.55,0.45
1,enak duain,0.78,0.22
2,01 nnpasti menang golput,0.46,0.54
3,1 saaaaaaahjangan nyoblos lbh satuitu saaaah,0.27,0.73
4,pakai sandi sandi segalanmantap pilih jokowi,0.57,0.43
