In [8]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
import gensim
from gensim.models import Word2Vec
import numpy as np
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings(action = 'ignore')

from sklearn.svm import SVC

In [2]:
df=pd.read_csv('input10.csv')

In [3]:
df.head()

Unnamed: 0,comment,class
0,FIXME formatters are not thread safe,DEFECT
1,"XXX Jon Skeet The comment ""if it hasn't bee...",DEFECT
2,"I hate to admit it, but we don't know what h...",DEFECT
3,Just a note StarTeam has a status for NEW wh...,DEFECT
4,the generated classes must not be added in t...,DEFECT


In [4]:
df['class'].value_counts()

DESIGN            2703
IMPLEMENTATION     757
DEFECT             472
TEST                85
DOCUMENTATION       54
Name: class, dtype: int64

In [5]:
X = df['comment']
Y = df['class']
le = LabelEncoder()
Y = le.fit_transform(Y)

multiclass_labels = []
print("\nMulticlass Label Encodings (in order of digits 0 -> n): ")
for i in range(0, len(list(set(list(Y))))):
    multiclass_labels.append(le.inverse_transform([i])[0])

print(multiclass_labels)
print("\n Class weights:")
#print(np.unique(Y, return_counts = True))
weight_dict = {}
counts = np.unique(Y, return_counts = True)[1]
total = np.sum(counts)
for i in range (0, 5):
    weight_dict[i] = (total - counts[i]) / counts[i]

print(weight_dict)


Multiclass Label Encodings (in order of digits 0 -> n): 
['DEFECT', 'DESIGN', 'DOCUMENTATION', 'IMPLEMENTATION', 'TEST']

 Class weights:
{0: 7.625, 1: 0.5061043285238623, 2: 74.38888888888889, 3: 4.3778071334214, 4: 46.89411764705882}


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

In [7]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(X_train).toarray()
final_features.shape

(3256, 2946)

In [9]:
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', SVC(random_state=42, class_weight='balanced'))])

model = pipeline.fit(X_train, y_train)
with open('SVC_mult', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, model.predict(X_test), digits=6, target_names=multiclass_labels))
print(confusion_matrix(ytest, model.predict(X_test)))

                precision    recall  f1-score   support

        DEFECT   0.513274  0.651685  0.574257        89
        DESIGN   0.842004  0.816822  0.829222       535
 DOCUMENTATION   0.666667  0.400000  0.500000        10
IMPLEMENTATION   0.613095  0.624242  0.618619       165
          TEST   0.777778  0.437500  0.560000        16

      accuracy                       0.747239       815
     macro avg   0.682564  0.586050  0.616420       815
  weighted avg   0.756350  0.747239  0.749417       815

[[ 58  23   0   7   1]
 [ 42 437   2  53   1]
 [  1   2   4   3   0]
 [  9  53   0 103   0]
 [  3   4   0   2   7]]
