In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
import sklearn.svm as svm
from sklearn.svm import LinearSVC
import sklearn.metrics as mt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

### tf-idf vectorizer generation part 

In [2]:
# I did not use the target directory so need to add it after unifying the path of the files in text pre-processing.ipynb file.

# training data
clean_train = []

for line in open("train.txt", "r"):
    clean_train.append(line.strip().split(','))

for n in range(0,len(clean_train)):
    clean_train[n] = clean_train[n][1:]
    
# testing data
clean_test = []

for line in open("test.txt", "r"):
    clean_test.append(line.strip().split(','))

for n in range(0,len(clean_test)):
    clean_test[n] = clean_test[n][1:]

In [3]:
# training data
train_df = pd.DataFrame(clean_train)
train_df['Words'] = train_df[train_df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)
train_df.drop(train_df.iloc[:,1:281], inplace=True, axis=1)
train_df=train_df.rename(columns = {0:'Label'})

In [4]:
train_df

Unnamed: 0,Label,Words
0,cs,"save,special,case,current,training,method,gene..."
1,math.DS,"consider,dynamical,system,finitely,many,equili..."
2,cs,"consider,discrete,dynamical,system,ant,like,ag..."
3,cs,"retrofit,technique,inject,external,resource,wo..."
4,cs,"approach,decision,make,uncertainty,belief,func..."
...,...,...
29633,cs,"powerful,deep,network,architecture,generative,..."
29634,math.AG,"develop,mixed,characteristic,version,mori,muka..."
29635,cs,"complex,analysis,wind,number,measure,number,ti..."
29636,cs,"discus,secure,computation,modular,sum,multiple..."


In [5]:
train_words = train_df.Words.values.tolist()
train_label = train_df.Label.values.tolist()

In [24]:
# testing data
test_df = pd.DataFrame(clean_test)
test_df['Words'] = test_df[test_df.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)
test_df.drop(test_df.iloc[:,0:280], inplace=True, axis=1)
test_df["Label"] = np.nan

In [25]:
test_df

Unnamed: 0,Words,Label
0,"method,model,average,become,important,tool,dea...",
1,"unmanned,aerial,vehicle,uav,system,increasingl...",
2,"paper,propose,new,loss,function,call,generaliz...",
3,"show,integrate,weak,morphism,lie,algebra,cross...",
4,"caustic,occur,widely,dynamic,take,shape,classi...",
...,...,...
7405,"statistical,inference,evolutionary,parameter,m...",
7406,"present,deep,learn,framework,base,generative,a...",
7407,"cell,receptor,tcr,repertoire,data,contain,info...",
7408,"paper,provide,modern,synthesis,classic,inverse...",


In [26]:
test_words = test_df.Words.values.tolist()

- Training & Valid sets

In [9]:
seed = 123
test_size = 0.2
x_train, x_valid, y_train, y_valid = train_test_split(train_words, train_label, test_size=test_size, random_state=seed)

* TF-IDF with n-gram

In [10]:
# https://stackoverflow.com/questions/45883679/train-model-fails-because-list-object-has-no-attribute-lower

def dummy(doc):
    return doc

tfidfv = TfidfVectorizer(tokenizer=dummy,preprocessor=dummy,ngram_range=(1,3))

# training
train_tfidf = tfidfv.fit_transform(x_train)
train_target = np.asarray(y_train)

#valid
valid_tfidf = tfidfv.transform(x_valid)
valid_target = np.asarray(y_valid)

# testing
test_tfidf = tfidfv.transform(test_words)

### Model

### 1. Linear SVC model

In [11]:
# https://stackoverflow.com/questions/18165213/how-much-time-does-take-train-svm-classifier
# to save computation time, we will use a linear svc model

In [12]:
#linear SVC model
linear_model = LinearSVC()

- train a model with a tuned Hyperparameter

In [19]:
# https://stackoverflow.com/questions/24121018/sklearn-gridsearch-how-to-print-out-progress-during-the-execution
# C and loss
parameters_svm = {'C':[0.8, 0.825, 0.85]}
grid_svm = GridSearchCV(LinearSVC(), parameters_svm, scoring= 'accuracy', cv=3, refit = True, verbose = 10)

#train a model
grid_svm.fit(train_tfidf, train_target)

print(grid_svm.best_params_)
print(grid_svm.best_estimator_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] C=0.8 ...........................................................
[CV] ............................... C=0.8, score=0.524, total=  55.3s
[CV] C=0.8 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   55.2s remaining:    0.0s


[CV] ............................... C=0.8, score=0.523, total= 1.4min
[CV] C=0.8 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s


[CV] ............................... C=0.8, score=0.529, total=  56.6s
[CV] C=0.825 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.3min remaining:    0.0s


[CV] ............................. C=0.825, score=0.524, total= 1.2min
[CV] C=0.825 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.5min remaining:    0.0s


[CV] ............................. C=0.825, score=0.523, total= 1.4min
[CV] C=0.825 .........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.9min remaining:    0.0s


[CV] ............................. C=0.825, score=0.530, total=  56.1s
[CV] C=0.85 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.9min remaining:    0.0s


[CV] .............................. C=0.85, score=0.523, total= 1.4min
[CV] C=0.85 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  8.2min remaining:    0.0s


[CV] .............................. C=0.85, score=0.522, total= 1.1min
[CV] C=0.85 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  9.4min remaining:    0.0s


[CV] .............................. C=0.85, score=0.530, total= 2.4min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 11.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 11.7min finished


{'C': 0.825}
LinearSVC(C=0.825, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)


- prediction on a valid dataset

In [20]:
# Make predictions
y_predictition = grid_svm.predict(valid_tfidf)

- performance metrics of the  model

In [21]:
recall = recall_score(valid_target, y_predictition, average='macro')
precision = precision_score(valid_target, y_predictition, average='macro')
f1score = f1_score(valid_target, y_predictition, average='macro')
accuracy = accuracy_score(valid_target, y_predictition)

print('Confusion Matrix:\n',confusion_matrix(valid_target, y_predictition),'\n')
print('Accuracy:', str(accuracy))
print('Precision:', str(precision))
print('Recall:', str(recall))
print('F1 score:', str(f1score))

Confusion Matrix:
 [[22  0  6 ...  0  0  0]
 [ 0  4  0 ...  0  0  0]
 [ 3  0  2 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  1]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  2  0  4]] 

Accuracy: 0.5334008097165992
Precision: 0.2570110398454902
Recall: 0.19713569655509228
F1 score: 0.21079322763378083


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


- prediction on a test dataset

In [22]:
test_target = grid_svm.predict(test_tfidf)

In [27]:
test_df["Label"] = test_target
test_df['test_id'] = list(range(1,len(test_df)+1))

del test_df['Words']
final = test_df[['test_id', 'Label']]

final

Unnamed: 0,test_id,Label
0,1,q-fin.EC
1,2,cs
2,3,cs
3,4,math.CT
4,5,cond-mat.quant-gas
...,...,...
7405,7406,q-bio.PE
7406,7407,cs
7407,7408,q-bio.QM
7408,7409,cs


In [None]:
final.to_csv('./Predictions_SVM.csv', index = False)