In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [2]:
# using original train dataset
train_df = pd.read_csv("wordEmbeddingTrain.csv")
test_df = pd.read_csv("wordEmbeddingTest.csv")

In [3]:
print('Shape of the train dataset: ' + str(train_df.shape))
del train_df['index']
train_df.head()

Shape of the train dataset: (29635, 102)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,labels
0,-0.140486,-0.708271,-0.382102,-0.410084,-0.203693,0.10579,0.197683,0.158858,-0.004087,0.501946,...,-0.3221,-1.064298,-0.205313,-0.604041,0.209205,-0.325033,0.153971,0.497229,0.458049,cs
1,0.411015,-0.790592,-0.897963,-0.338375,-0.284312,0.302484,-0.070416,-0.069028,5.6e-05,0.582238,...,-0.146003,-0.772914,0.038109,-0.847527,0.188956,-1.158004,0.093777,0.500449,0.18248,math.DS
2,-0.274633,-1.035805,-0.625783,-0.478151,0.26216,0.35161,0.210195,0.236449,0.049997,0.170583,...,0.507972,-1.007552,-0.483059,-0.487506,0.429888,-0.885947,-0.069881,0.263815,0.319821,cs
3,-0.005348,-0.107721,0.211413,0.022911,-0.310092,0.690397,-0.191305,-0.050948,0.295349,0.350828,...,0.594372,-0.1793,-0.217202,-0.343062,0.777707,0.067506,0.594999,0.238857,0.295559,cs
4,0.33193,-0.507421,-0.363805,-0.526135,0.168097,0.475303,-0.427657,0.082572,0.265829,0.375759,...,0.391183,-0.473992,-0.101979,-0.329689,-0.091322,-0.407838,-0.014392,0.398557,0.600941,cs


In [4]:
#Creating the dependent variable class
factor = pd.factorize(train_df['labels'])
train_df.labels = factor[0]
definitions = factor[1]

print(len(definitions), '\n')
print(train_df.labels)

100 

0         0
1         1
2         0
3         0
4         0
         ..
29630     0
29631    14
29632     0
29633     0
29634    14
Name: labels, Length: 29635, dtype: int64


In [5]:
#Splitting the data into independent and dependent variables

X = train_df.iloc[:,0:100].values
y = train_df.iloc[:,100].values

In [6]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 21)

In [7]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
#linear SVC model
# https://stackoverflow.com/questions/18165213/how-much-time-does-take-train-svm-classifier
# to save computation time, we will use a linear svc model
linear_model = LinearSVC()

In [9]:
# hyperparameter tuning
# https://stackoverflow.com/questions/24121018/sklearn-gridsearch-how-to-print-out-progress-during-the-execution
# https://stackoverflow.com/questions/52670012/convergencewarning-liblinear-failed-to-converge-increase-the-number-of-iterati
    
# C and loss
parameters_svm = {'C':[0.1, 0.2, 0.3 ,0.4, 0.5]}
grid_svm = GridSearchCV(LinearSVC(dual=False), parameters_svm, scoring= 'accuracy', cv=3, refit = True, verbose = 10)

#train a model
grid_svm.fit(X_train, y_train)

print(grid_svm.best_params_)
print(grid_svm.best_estimator_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=0.1, score=0.518, total=  24.9s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.8s remaining:    0.0s


[CV] ............................... C=0.1, score=0.514, total=  25.1s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   49.9s remaining:    0.0s


[CV] ............................... C=0.1, score=0.526, total=  24.1s
[CV] C=0.2 ...........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s


[CV] ............................... C=0.2, score=0.518, total=  24.3s
[CV] C=0.2 ...........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.6min remaining:    0.0s


[CV] ............................... C=0.2, score=0.513, total=  25.2s
[CV] C=0.2 ...........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min remaining:    0.0s


[CV] ............................... C=0.2, score=0.525, total=  24.1s
[CV] C=0.3 ...........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.5min remaining:    0.0s


[CV] ............................... C=0.3, score=0.517, total=  24.4s
[CV] C=0.3 ...........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.9min remaining:    0.0s


[CV] ............................... C=0.3, score=0.511, total=  25.7s
[CV] C=0.3 ...........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.3min remaining:    0.0s


[CV] ............................... C=0.3, score=0.524, total=  31.9s
[CV] C=0.4 ...........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.8min remaining:    0.0s


[CV] ............................... C=0.4, score=0.516, total=  24.7s
[CV] C=0.4 ...........................................................
[CV] ............................... C=0.4, score=0.511, total=  25.4s
[CV] C=0.4 ...........................................................
[CV] ............................... C=0.4, score=0.523, total=  24.2s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.515, total=  24.5s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.509, total=  25.1s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.522, total=  24.2s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  6.3min finished


{'C': 0.1}
LinearSVC(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)


In [10]:
# Predicting the Test set results
y_predictition = grid_svm.predict(X_test)

In [11]:
# Performance metrics
recall = recall_score(y_test, y_predictition, average='macro')
precision = precision_score(y_test, y_predictition, average='macro')
f1score = f1_score(y_test, y_predictition, average='macro')
accuracy = accuracy_score(y_test, y_predictition)

print('Confusion Matrix:\n',confusion_matrix(y_test, y_predictition),'\n')
print('Accuracy:', str(accuracy))
print('Precision:', str(precision))
print('Recall:', str(recall))
print('F1 score:', str(f1score))

Confusion Matrix:
 [[1825    1    1 ...    0    0    0]
 [  21   28    0 ...    0    0    0]
 [ 138    0    0 ...    0    0    0]
 ...
 [   0    0    0 ...    0    0    0]
 [   3    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    0]] 

Accuracy: 0.5255609920701872
Precision: 0.247388974339324
Recall: 0.18189308628185377
F1 score: 0.1889687027527809


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [12]:
# test
test_data = test_df.iloc[:,0:100].values
test_data_scaled = scaler.fit_transform(test_data)
test_predictition = grid_svm.predict(test_data_scaled)

In [13]:
test_df=test_df.rename(columns = {'index':'test_id'})
test_df['label'] = definitions[test_predictition] # convert to original labels
test_df.drop(train_df.iloc[:,0:100], inplace=True, axis=1)

In [14]:
test_df.head()

Unnamed: 0,test_id,label
0,1,q-fin.EC
1,2,cs
2,3,cs
3,4,math.AG
4,5,physics.optics


In [15]:
test_df.to_csv('./Predictions_SVM.csv', index = False)