In [35]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# from feature_extractor import *
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import os
import pickle

# from cuml.svm import SVC

In [2]:
train_df = pd.read_csv("../emnist/emnist-balanced-train.csv", header=None)
test_df = pd.read_csv("../emnist/emnist-balanced-test.csv", header=None)

# Add test dataset into train dataset
df = pd.concat([train_df, test_df], ignore_index=True)
df.head()

del train_df
del test_df

In [3]:

#Reading the file
label_map = pd.read_csv("../emnist/emnist-balanced-mapping.txt",
                        delimiter = ' ',
                        index_col=0,
                        header=None)
label_map = label_map.iloc[:, 0]


In [4]:
#Initialising an empty dictionary
label_dictionary = {}

#Running a loop for ASCII equivalent to character conversion
for index, label in enumerate(label_map):
    label_dictionary[index] = chr(label)

print(label_dictionary)


{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E', 15: 'F', 16: 'G', 17: 'H', 18: 'I', 19: 'J', 20: 'K', 21: 'L', 22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R', 28: 'S', 29: 'T', 30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z', 36: 'a', 37: 'b', 38: 'd', 39: 'e', 40: 'f', 41: 'g', 42: 'h', 43: 'n', 44: 'q', 45: 'r', 46: 't'}


In [5]:
print(df.shape)
print(df.describe())

(131600, 785)


                 0         1         2         3         4              5    \
count  131600.000000  131600.0  131600.0  131600.0  131600.0  131600.000000   
mean       23.000000       0.0       0.0       0.0       0.0       0.002036   
std        13.564712       0.0       0.0       0.0       0.0       0.295477   
min         0.000000       0.0       0.0       0.0       0.0       0.000000   
25%        11.000000       0.0       0.0       0.0       0.0       0.000000   
50%        23.000000       0.0       0.0       0.0       0.0       0.000000   
75%        35.000000       0.0       0.0       0.0       0.0       0.000000   
max        46.000000       0.0       0.0       0.0       0.0      63.000000   

                 6              7              8              9    ...  \
count  131600.000000  131600.000000  131600.000000  131600.000000  ...   
mean        0.012728       0.019354       0.016284       0.017447  ...   
std         1.312534       1.586618       1.521459       1.692130 

In [6]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,36,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,43,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131595,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131596,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131597,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131598,26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_nums = df[df[0] <= 9]
df_lletres = df[df[0] >= 10]

In [8]:
X_nums = df_nums.iloc[:,1:]
y_nums = df_nums.iloc[:,0]
train_x_nums, test_x_nums, train_y_nums, test_y_nums = train_test_split(X_nums, y_nums, test_size=0.2, random_state=42, shuffle=True)
X_lletres = df_lletres.iloc[:,1:]
y_lletres = df_lletres.iloc[:,0]
train_x_lletres, test_x_lletres, train_y_lletres, test_y_lletres = train_test_split(X_lletres, y_lletres, test_size=0.2, random_state=42, shuffle=True)

In [9]:
resh = lambda x: np.array(x).reshape(28, 28)
train_x_nums_2d = train_x_nums.apply(resh, axis=1)
test_x_nums_2d = test_x_nums.apply(resh, axis=1)
train_x_lletres_2d = train_x_lletres.apply(resh, axis=1)
test_x_lletres_2d = test_x_lletres.apply(resh, axis=1)

In [10]:
print(train_x_nums.shape)
print(train_x_lletres.shape)
print(pd.concat([train_x_lletres,train_x_nums]).shape)

(22400, 784)
(82880, 784)
(105280, 784)


In [42]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
output_lletres = pd.DataFrame(columns=['#fold', 'accuracy_score','f1_score','precision_score','recall_score'])
for i, (train_index, test_index) in enumerate(kfold.split(df_nums)):
    print("Fold #", i)
    X_train, X_test = df_nums.iloc[train_index,1:], df_nums.iloc[test_index,1:]
    y_train, y_test = df_nums.iloc[train_index,0], df_nums.iloc[test_index,0]
    model = SVC(kernel='linear', C=1, random_state=42)
    model.fit(X_train, y_train)
    print("fitted!!")
    y_pred = model.predict(X_test)
    print("predicted!!")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1:", f1_score(y_test, y_pred, average='macro'))
    print("Precision:", precision_score(y_test, y_pred, average='macro'))
    print("Recall:", recall_score(y_test, y_pred, average='macro'))
    output_lletres = output_lletres.append({'#fold': i,
                                            'accuracy_score': accuracy_score(y_test, y_pred),
                                            'f1_score': f1_score(y_test, y_pred, average='macro'),
                                            'precision_score': precision_score(y_test, y_pred, average='macro'),
                                            'recall_score': recall_score(y_test, y_pred, average='macro')}, ignore_index=True)

Fold # 0


In [39]:
SVClassifier_nums_1d = SVC(kernel='linear', verbose=True, random_state=42, C=1.0)
# SVClassifier_nums_1d.fit(train_x_nums,train_y_nums)
cv_nums = cross_val_score(SVClassifier_nums_1d, train_x_nums, train_y_nums, cv=5,scoring=['accuracy','f1_score','precision','recall'])


InvalidParameterError: The 'scoring' parameter of check_scoring must be a str among {'recall', 'precision_weighted', 'jaccard_samples', 'neg_mean_absolute_percentage_error', 'average_precision', 'f1_samples', 'neg_mean_gamma_deviance', 'precision_micro', 'max_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'r2', 'recall_micro', 'neg_brier_score', 'balanced_accuracy', 'precision', 'recall_macro', 'neg_root_mean_squared_error', 'f1_micro', 'roc_auc_ovo_weighted', 'neg_log_loss', 'roc_auc', 'v_measure_score', 'jaccard_macro', 'adjusted_mutual_info_score', 'positive_likelihood_ratio', 'recall_samples', 'adjusted_rand_score', 'f1_weighted', 'matthews_corrcoef', 'rand_score', 'mutual_info_score', 'neg_mean_squared_log_error', 'explained_variance', 'f1', 'top_k_accuracy', 'neg_mean_poisson_deviance', 'accuracy', 'recall_weighted', 'roc_auc_ovo', 'neg_mean_squared_error', 'roc_auc_ovr_weighted', 'neg_mean_absolute_error', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'completeness_score', 'jaccard_micro', 'neg_negative_likelihood_ratio', 'roc_auc_ovr', 'precision_samples', 'jaccard_weighted', 'f1_macro', 'precision_macro'}, a callable or None. Got ['accuracy', 'f1_score', 'precision', 'recall'] instead.

In [37]:
cv_nums

array([0.92142857, 0.92834821, 0.92745536, 0.925     , 0.92723214])

In [38]:
SVC_pred_nums_1d = SVClassifier_nums_1d.predict(test_x_nums)
print(classification_report(test_y_nums, SVC_pred_nums_1d))

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [13]:
train_x_nums.shape

(22400, 784)

In [14]:
# resampling amb 10 classes
samples = list()
for i in range(df_lletres[0].max()):
    if i < 9: continue
    samples.append(df_lletres[df_lletres[0]==i].sample(frac=0.11, random_state=42))

sample_lletres = pd.concat(samples)
train_y_lletres = sample_lletres.iloc[:,0]
train_x_lletres = sample_lletres.iloc[:,1:]
print(train_x_nums.shape)
print(train_x_lletres.shape)
# print(train_y_lletres.iloc[0])
print(train_y_lletres.value_counts())

(22400, 784)
(11088, 784)
10    308
11    308
30    308
31    308
32    308
33    308
34    308
35    308
36    308
37    308
38    308
39    308
40    308
41    308
42    308
43    308
44    308
29    308
28    308
27    308
18    308
12    308
13    308
14    308
15    308
16    308
17    308
19    308
26    308
20    308
21    308
22    308
23    308
24    308
25    308
45    308
Name: 0, dtype: int64


In [15]:
sample_lletres.columns

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            775, 776, 777, 778, 779, 780, 781, 782, 783, 784],
           dtype='int64', length=785)

In [16]:

SVClassifier_lletres_1d = SVC(kernel='linear', verbose=True, random_state=42, C=1.0)
SVClassifier_lletres_1d.fit(train_x_lletres, train_y_lletres)

[LibSVM]

In [17]:
SVC_pred_lletres_1d = SVClassifier_lletres_1d.predict(test_x_lletres)
print(classification_report(test_y_lletres, SVC_pred_lletres_1d))

              precision    recall  f1-score   support

          10       0.68      0.77      0.72       559
          11       0.77      0.82      0.80       575
          12       0.67      0.82      0.74       531
          13       0.74      0.84      0.79       517
          14       0.77      0.78      0.77       557
          15       0.44      0.56      0.50       578
          16       0.78      0.81      0.80       545
          17       0.71      0.75      0.73       586
          18       0.51      0.63      0.56       555
          19       0.73      0.78      0.76       564
          20       0.68      0.72      0.70       578
          21       0.56      0.59      0.58       572
          22       0.84      0.87      0.86       526
          23       0.74      0.75      0.75       581
          24       0.83      0.83      0.83       573
          25       0.77      0.78      0.78       541
          26       0.77      0.72      0.74       530
          27       0.76    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
with open('../svm_img_model_numbers.pkl', 'wb') as model_file:
    pickle.dump(SVClassifier_nums_1d, model_file)

with open('../svm_img_model_letters.pkl', 'wb') as model_file:
    pickle.dump(SVClassifier_lletres_1d, model_file)