In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

## SVM

In [2]:
df = pd.read_csv("glass.csv")
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns):
RI      214 non-null float64
Na      214 non-null float64
Mg      214 non-null float64
Al      214 non-null float64
Si      214 non-null float64
K       214 non-null float64
Ca      214 non-null float64
Ba      214 non-null float64
Fe      214 non-null float64
Type    214 non-null int64
dtypes: float64(9), int64(1)
memory usage: 16.8 KB


In [4]:
pd.unique(df['Type'])

array([1, 2, 3, 5, 6, 7], dtype=int64)

In [5]:
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

Series([], dtype: float64)

In [6]:
x = df.drop(columns=['Type'], axis=1).values
y = df['Type'].values

In [7]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x_scaler, y, test_size=0.1, random_state=2)

In [9]:
c_range = np.arange(0.1, 10.1, 0.1)
gamma_range = np.arange(0.1, 10.1, 0.1)
random_search = RandomizedSearchCV(SVC(random_state=2),
                           param_distributions={'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                                'C': c_range,'gamma': gamma_range},
                           cv=5, n_jobs=-1, verbose=1)

random_search.fit(x_train, y_train)
best = random_search.best_params_
print('RandomizedSearchCV best score:', random_search.best_score_)
print('SVC best params:', best)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


RandomizedSearchCV best score: 0.609375
SVC best params: {'kernel': 'linear', 'gamma': 5.3, 'C': 3.8000000000000003}


[Parallel(n_jobs=-1)]: Done  35 out of  50 | elapsed:    3.4s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.4s finished


In [10]:
classifier = SVC(kernel=best['kernel'], gamma=best['gamma'], C=best['C'])
classifier.fit(x_train, y_train)

SVC(C=3.8000000000000003, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=5.3, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
scores = cross_val_score(classifier, x_train, y_train, cv=5)
print("CV accuracy on train data: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

CV accuracy on train data: 0.61 (+/- 0.15)


In [12]:
preds = classifier.predict(x_test)
accuracy = accuracy_score(y_test, preds)
print('Prediction accuracy:', accuracy)

Prediction accuracy: 0.7272727272727273


In [13]:
confusion_matrix(y_test, preds)

array([[6, 1, 1, 0, 0, 0],
       [2, 6, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 3]], dtype=int64)

## Bayes

Pogoda = pochmurnie<br>
Tak: 4/9<br>
Nie: 0/5<br>
Suma: 4/14<br>

Temperatura = chłodno<br>
Tak: 3/9<br>
Nie: 1/5<br>
Suma: 4/14<br>

Wilgotność = wysoka<br>
Tak: 3/9<br>
Nie: 4/5<br>
Suma: 7/14<br>

Wiatr = brak<br>
Tak: 3/9<br>
Nie: 3/5<br>
Suma: 6/14<br>

Czy grać:<br>
Tak: 9/14<br>
Nie: 5/14<br>

#### 2.1. Policzyć P(B|TAK)

In [14]:
p_b_tak = (4/9)*(3/9)*(3/9)*(3/9)*(9/14)
print(p_b_tak)

0.010582010582010581


#### 2.2. Policzyć P(B|NIE)

In [15]:
p_b_nie = (1/14)*(1/5)*(4/5)*(3/5)*(5/14)  # korekta Laplace'a
print(p_b_nie)

0.0024489795918367346


#### 2.3. Policzyć P(B)

In [16]:
p_b = (4/14)*(4/14)*(7/14)*(6/14)
print(p_b)

0.017492711370262388


#### 2.4. Policzyć P(TAK|B)

In [17]:
p_tak_b = p_b_tak / p_b
print(p_tak_b)

0.6049382716049383


#### 2.5. Policzyć P(NIE|B)

In [18]:
p_nie_b = p_b_nie / p_b
print(p_nie_b)

0.14
