In [3]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import re
import pickle
import nltk
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the resume dataset (path is relative to the notebook's working directory)
# and preview the first rows. The preview shows a 'Category' label column, a
# 'Resume' text column whose text already looks lower-cased and stemmed, and
# two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0').
resumeDataSet = pd.read_csv('job_resumes.csv')
resumeDataSet.head()

Unnamed: 0.1,Unnamed: 0,Category,Resume,tokenized
0,0,6,skill program languag python panda numpi scipi...,"(0, 1495)\t0.024570713575492127\n (0, 1481)..."
1,1,6,educ detail may may uit rgpv data scientist da...,"(0, 1495)\t0.17273040581960264\n (0, 1481)\..."
2,2,6,area interest deep learn control system design...,"(0, 1495)\t0.15032447342486555\n (0, 1481)\..."
3,3,6,skill python sap hana tableau sap hana sql sap...,"(0, 1495)\t0.060220101707952216\n (0, 1487)..."
4,4,6,educ detail mca ymcaust faridabad haryana data...,"(0, 1495)\t0.37410537366072966\n (0, 1301)\..."


In [5]:
ps = PorterStemmer()
stopwords_list = nltk.corpus.stopwords.words('english')
# Frozen copy for O(1) membership tests; `stopwords_list` itself stays a list
# in case other cells index or extend it.
_stopword_set = frozenset(stopwords_list)


def process_text(x):
    """Normalise raw resume text into a space-separated string of word stems.

    Steps, in order:
      1. Remove URLs, the stand-alone markers ``RT`` / ``cc``, hashtags and
         @-mentions while the original punctuation and casing are still there.
      2. Replace every character outside ``a-zA-Z`` with a space (this also
         covers punctuation, digits and non-ASCII characters).
      3. Lower-case, split on whitespace, drop English stop words and apply
         the Porter stemmer to what remains.
      4. Drop single-character tokens.

    The noise patterns must run before the letters-only filter: once ':',
    '/', '#' and '@' have been blanked out they can no longer match, and an
    unanchored ``cc`` pattern on lower-cased text removes letters from the
    middle of ordinary words (e.g. "account" -> "a ount").

    NOTE(review): 'job_resumes.csv' appears to be already preprocessed; if it
    was produced by an earlier version of this function, regenerate it so that
    training text and newly processed text are tokenised the same way.

    Parameters
    ----------
    x : str
        Raw text of one resume.

    Returns
    -------
    str
        Stems joined by single spaces, with no leading or trailing whitespace
        (an empty string when nothing survives the filters).
    """
    x = re.sub(r'http\S+\s*', ' ', x)       # URLs
    x = re.sub(r'\b(?:RT|cc)\b', ' ', x)    # whole-word markers only
    x = re.sub(r'#\S+', ' ', x)             # hashtags
    x = re.sub(r'@\S+', ' ', x)             # mentions
    x = re.sub(r'[^a-zA-Z]', ' ', x)        # keep letters only

    stems = (
        ps.stem(word)
        for word in x.lower().split()
        if word not in _stopword_set
    )
    return ' '.join(stem for stem in stems if len(stem) > 1)

In [6]:
# Turn the resume text into a sparse TF-IDF matrix.
#   sublinear_tf  - use 1 + log(tf) instead of the raw term count
#   max_features  - cap the vocabulary at 1500 terms
#   ngram_range   - unigrams only
#   min_df/max_df - ignore terms in fewer than 1% or more than 80% of resumes
# NOTE(review): the vectorizer is fitted on every row, so its document
# frequencies also reflect rows that are later held out for validation.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    max_features=1500,
    ngram_range=(1, 1),
    min_df=0.01,
    max_df=0.8,
)
y = resumeDataSet['Category']
X = tfidf.fit_transform(resumeDataSet['Resume'])

In [7]:
# Vocabulary terms in column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    # The new API returns an ndarray; convert so `features` stays a list.
    features = tfidf.get_feature_names_out().tolist()
else:
    features = tfidf.get_feature_names()

In [8]:
# Hold out 20% of the resumes for validation. A fixed random_state makes the
# split, and therefore every score reported below, reproducible across
# Restart & Run All.
# TODO(review): consider stratify=y so each category keeps its share in both
# splits; it requires at least two rows per category.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [9]:
# Sanity-check the split: report the shape of each array, one per line.
split_shapes = {
    'X_train shape :': X_train.shape,
    'y_train shape :': y_train.shape,
    'X_val shape :': X_val.shape,
    'y_val shape :': y_val.shape,
}
for caption, shape in split_shapes.items():
    print(caption, shape)

X_train shape : (769, 1500)
y_train shape : (769,)
X_val shape : (193, 1500)
y_val shape : (193,)


In [10]:
# Baseline model: an SVC with scikit-learn's default hyper-parameters.
# fit() returns the estimator itself, so the name is bound to the fitted model.
classifier = SVC().fit(X_train, y_train)
classifier

SVC()

In [11]:
# Predict a category for every held-out resume with the default-parameter SVC.
y_pred_svc = classifier.predict(X_val)

In [12]:
# Fraction of validation resumes the baseline SVC labelled correctly.
svc_accuracy = accuracy_score(y_val, y_pred_svc)
print('Accuracy of SVC Classifier: {:.2f}'.format(svc_accuracy))

Accuracy of SVC Classifier: 0.98


In [14]:
# Search space for tuning the SVC: five values of C crossed with five values
# of gamma (25 combinations), each spaced by a factor of ten.
param_grid_svc = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

In [15]:
# Tune C and gamma with a cross-validated grid search on the training split.
grid_svc_acc = GridSearchCV(estimator=classifier, param_grid=param_grid_svc)
grid_svc_acc.fit(X_train, y_train)
print(grid_svc_acc.best_params_)

# Score the tuned model on the held-out split; predict() on the search object
# delegates to the estimator refitted with the best parameters.
y_pred_svc_acc = grid_svc_acc.predict(X_val)
tuned_accuracy = accuracy_score(y_val, y_pred_svc_acc)
print('Accuracy Score : ' , tuned_accuracy)

{'C': 10, 'gamma': 0.1}
Accuracy Score :  0.9792746113989638


In [16]:
# saving the model
# The tuned grid-search object is written exactly as before (same file name,
# same pickled object), so existing loaders keep working.
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(grid_svc_acc, f)

# The model can only score vectors from the TF-IDF space built by `tfidf`
# (same vocabulary and IDF weights). Persist the fitted vectorizer as well,
# otherwise new resume text cannot be transformed into compatible features.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)