![title](https://i.gifer.com/Wvq4.gif)

In [276]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Classification

In [277]:
# Load the wine data from a CSV file; the path is relative to the notebook's working directory.
dataset = pd.read_csv('data/wine.csv')

In [278]:
# Show each column's dtype and non-null count to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


In [279]:
# Count the missing values in every column of the raw data.
dataset.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [280]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [281]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [282]:
from sklearn.model_selection import train_test_split

In [283]:
# Split the frame into features and target: `type` (e.g. 'white') is the label to predict,
# every other column is used as a feature.
# `columns=` replaces the positional axis argument (`drop('type', 1)`), which pandas
# deprecated in 1.3 and no longer accepts from 2.0 onwards.
x_data = dataset.drop(columns='type')
y_data = dataset['type']

In [284]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=0,
)

In [285]:
# Pairwise correlations between the feature columns, computed on the training split only.
x_train.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,0.221638,0.338935,-0.118168,0.295803,-0.291146,-0.343212,0.463726,-0.262313,0.291049,-0.097647,-0.067357
volatile acidity,0.221638,1.0,-0.382928,-0.195362,0.382497,-0.350482,-0.41122,0.279783,0.261677,0.226029,-0.049435,-0.265881
citric acid,0.338935,-0.382928,1.0,0.139735,0.035528,0.121818,0.180068,0.101461,-0.338008,0.062074,-0.005495,0.089651
residual sugar,-0.118168,-0.195362,0.139735,1.0,-0.120124,0.390734,0.491743,0.551769,-0.269667,-0.181824,-0.360911,-0.03462
chlorides,0.295803,0.382497,0.035528,-0.120124,1.0,-0.196291,-0.283328,0.36875,0.043795,0.402499,-0.254852,-0.195208
free sulfur dioxide,-0.291146,-0.350482,0.121818,0.390734,-0.196291,1.0,0.723476,0.008093,-0.140602,-0.18121,-0.173684,0.064886
total sulfur dioxide,-0.343212,-0.41122,0.180068,0.491743,-0.283328,0.723476,1.0,0.019114,-0.231499,-0.277166,-0.263781,-0.048292
density,0.463726,0.279783,0.101461,0.551769,0.36875,0.008093,0.019114,1.0,0.001176,0.254416,-0.677155,-0.298867
pH,-0.262313,0.261677,-0.338008,-0.269667,0.043795,-0.140602,-0.231499,0.001176,1.0,0.185127,0.132705,0.014665
sulphates,0.291049,0.226029,0.062074,-0.181824,0.402499,-0.18121,-0.277166,0.254416,0.185127,1.0,0.007379,0.045614


In [286]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [287]:
# Preprocessing pipeline: fill missing values with the column median, then standardise.
trans = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

In [288]:
# Fit the impute-and-scale pipeline on the training features and transform them in one step.
data = trans.fit_transform(x_train)

In [289]:
# Wrap the transformed array in a DataFrame that reuses the training column names,
# so the result can be inspected with the usual pandas tools.
dummy_data = pd.DataFrame(
    data=data,
    columns=x_train.columns.values,
)

In [290]:
# Confirm that the imputation step left no missing values behind.
dummy_data.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [291]:
# Let's try feeding all the data to the models before doing any feature engineering.

In [292]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

In [293]:
# Encoder that maps the wine-type labels to integer class ids.
label_encoder = LabelEncoder()

In [294]:
# Transform the training features with the pipeline that was fitted earlier via fit_transform.
data_x = trans.transform(x_train)
# Fit the label encoder on the training labels and encode them as integers.
data_y = label_encoder.fit_transform(y_train)

In [295]:
# Fit one default-configured instance of each classifier on the training split.
# `fit` returns the estimator itself, so building and fitting in two steps
# leaves every name bound to the same fitted object as a chained call would.
# NOTE(review): none of these fitted objects is referenced by the later cells visible here.
log = LogisticRegression()
log.fit(data_x, data_y)

gradient = SGDClassifier()
gradient.fit(data_x, data_y)

vector = SVC()
vector.fit(data_x, data_y)

KN = KNeighborsClassifier()
KN.fit(data_x, data_y)

ensemble = RandomForestClassifier()
ensemble.fit(data_x, data_y)

tree = DecisionTreeClassifier()
tree.fit(data_x, data_y)

naive = GaussianNB()
naive.fit(data_x, data_y)

In [296]:
class habab_classification_branch:
    """Train a fixed set of default scikit-learn classifiers and report their test scores.

    Every public model method fits one default-configured estimator on the
    training split and scores it on the test split. The score is either printed
    or collected in ``self.dict_ins``, depending on ``dict_``.

    Note the argument order: the *test* split comes first, then the training split.

    Parameters
    ----------
    x : test features.
    y : test labels.
    x_ : training features.
    y_ : training labels.
    dict_ : bool, default False
        When falsy, each score is printed. When truthy, each score is stored
        (as a string) in ``self.dict_ins`` under a short model key.
    """

    def __init__(self, x, y, x_, y_, dict_=False):
        self.x_test = x
        self.y_test = y
        self.x_train = x_
        self.y_train = y_
        self.dict_ins = {}   # model key -> score string, filled only in dict mode
        self.dict = dict_    # reporting mode flag (see class docstring)

    def _evaluate(self, estimator, label, key):
        """Fit ``estimator`` on the training split, score it on the test split and report it.

        ``label`` is the text printed in print mode; ``key`` is the dictionary
        key used in dict mode.
        """
        score = estimator.fit(self.x_train, self.y_train).score(self.x_test, self.y_test)
        if self.dict:
            # Scores are stored as strings, exactly as the original implementation did.
            self.dict_ins[key] = str(score)
        else:
            print(f'{label}: {score}')

    def LOG(self):
        """Evaluate a default ``LogisticRegression``."""
        self._evaluate(LogisticRegression(), 'Logistic Score', 'LOG')

    def GRAD(self):
        """Evaluate a default ``SGDClassifier``."""
        self._evaluate(SGDClassifier(), 'SGDClassifier', 'GRAD')

    def VECTOR(self):
        """Evaluate a default ``SVC``."""
        self._evaluate(SVC(), 'SVC', 'SVC')

    def KN(self):
        """Evaluate a default ``KNeighborsClassifier``."""
        self._evaluate(KNeighborsClassifier(), 'KNeighborsClassifier', 'KN')

    def ense(self):
        """Evaluate a default ``RandomForestClassifier``."""
        self._evaluate(RandomForestClassifier(), 'RandomForestClassifier', 'FOREST')

    def tree(self):
        """Evaluate a default ``DecisionTreeClassifier``."""
        self._evaluate(DecisionTreeClassifier(), 'DecisionTreeClassifier', 'TREE')

    def bayes(self):
        """Evaluate a default ``GaussianNB``."""
        self._evaluate(GaussianNB(), 'GaussianNB', 'NB')

    def check_return(self):
        """Return the collected scores in dict mode, otherwise ``None``.

        The previous check compared the score dictionary itself to ``False``,
        which is never true, so an empty dict was returned even in print mode.
        """
        return self.dict_ins if self.dict else None

    def start(self):
        """Run every classifier in turn.

        Returns the score dictionary in dict mode and ``None`` in print mode,
        so a notebook cell ending in ``.start()`` no longer echoes an empty ``{}``.
        """
        print("""Staring Classification Training & Testing""")
        print()
        self.LOG()
        self.GRAD()
        self.VECTOR()
        self.KN()
        self.ense()
        self.tree()
        self.bayes()
        return self.check_return()

In [297]:
# Give the already-transformed training split explicit train/test style names.
data_train_x = data_x
data_train_y = data_y
# Apply the pipeline fitted on the training data to the held-out features.
data_test_x = trans.transform(x_test)
# Encode the test labels with the encoder fitted on the training labels.
# Re-fitting it on the test labels (fit_transform) could silently yield a different
# label -> integer mapping if the set of labels in the test split differed.
data_test_y = label_encoder.transform(y_test)

In [298]:
# Train every classifier on the training split and report its score on the test split.
# Keyword arguments make the test-first parameter order explicit.
habab_classification_branch(
    x=data_test_x,
    y=data_test_y,
    x_=data_train_x,
    y_=data_train_y,
).start()

Staring Classification Training & Testing

Logistic Score: 0.9953846153846154
SGDClassifier: 0.9958974358974358
SVC: 0.9969230769230769
KNeighborsClassifier: 0.9933333333333333
RandomForestClassifier: 0.9943589743589744
DecisionTreeClassifier: 0.9851282051282051
GaussianNB: 0.9723076923076923


{}

In [299]:
# Let's see whether tuning the support vector classifier can improve the score further.

In [300]:
from sklearn.model_selection import GridSearchCV

In [301]:
# Display a default SVC to see its hyper-parameters (presumably as a reference for the grid below).
SVC()

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [302]:
# Search space for an RBF-kernel SVC: five values of C crossed with five values of gamma.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)
# refit=True retrains the best parameter combination on the full training data;
# verbose=3 logs every individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

In [303]:
# Run the cross-validated grid search on the training split.
grid.fit(data_train_x, data_train_y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.770, total=   0.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.778, total=   0.7s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.771, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.771, total=   0.7s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.776, total=   0.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.990, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.996, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.987, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.992, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.998, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.998, total=   0.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.993, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.994, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.996, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.995, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.993, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.992, total=   0.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.991, total=   0.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.994, total=   0.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.994, total=   0.0s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:   33.3s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [304]:
# Test-set score of the refitted best grid-search model, as a percentage.
# In the recorded run the default SVC scored slightly higher, so tuning did not help here.
100 * grid.score(data_test_x, data_test_y)

99.64102564102564

# Really easy and not that complex, so there is no need for visualization

In [445]:
# Shutting Down Machine 11:30 PM 7/12/2020
# Author: Habab