In [6]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
import gensim
from gensim.models import Word2Vec
import numpy as np
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings(action = 'ignore')

In [7]:
df=pd.read_csv('input10.csv')

In [8]:
df.head()

Unnamed: 0,comment,class
0,FIXME formatters are not thread safe,DEFECT
1,"XXX Jon Skeet The comment ""if it hasn't bee...",DEFECT
2,"I hate to admit it, but we don't know what h...",DEFECT
3,Just a note StarTeam has a status for NEW wh...,DEFECT
4,the generated classes must not be added in t...,DEFECT


In [9]:
df['class'].value_counts()

DESIGN            2703
IMPLEMENTATION     757
DEFECT             472
TEST                85
DOCUMENTATION       54
Name: class, dtype: int64

In [10]:
X = df['comment']
Y = df['class']
le = LabelEncoder()
Y = le.fit_transform(Y)

multiclass_labels = []
print("\nMulticlass Label Encodings (in order of digits 0 -> n): ")
for i in range(0, len(list(set(list(Y))))):
    multiclass_labels.append(le.inverse_transform([i])[0])

print(multiclass_labels)
print("\n Class weights:")
#print(np.unique(Y, return_counts = True))
weight_dict = {}
counts = np.unique(Y, return_counts = True)[1]
total = np.sum(counts)
for i in range (0, 5):
    weight_dict[i] = (total - counts[i]) / counts[i]

print(weight_dict)


Multiclass Label Encodings (in order of digits 0 -> n): 
['DEFECT', 'DESIGN', 'DOCUMENTATION', 'IMPLEMENTATION', 'TEST']

 Class weights:
{0: 7.625, 1: 0.5061043285238623, 2: 74.38888888888889, 3: 4.3778071334214, 4: 46.89411764705882}


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

In [13]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(X_train).toarray()
final_features.shape

(3256, 2901)

In [15]:
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', LogisticRegression(random_state=42, class_weight='balanced'))])

model = pipeline.fit(X_train, y_train)
with open('LogisticRegression_Imbalanced.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, model.predict(X_test), digits=6, target_names=multiclass_labels))
print(confusion_matrix(ytest, model.predict(X_test)))

                precision    recall  f1-score   support

        DEFECT   0.472727  0.571429  0.517413        91
        DESIGN   0.846154  0.760748  0.801181       535
 DOCUMENTATION   0.444444  0.857143  0.585366        14
IMPLEMENTATION   0.589595  0.662338  0.623853       154
          TEST   0.750000  0.857143  0.800000        21

      accuracy                       0.725153       815
     macro avg   0.620584  0.741760  0.665563       815
  weighted avg   0.746602  0.725153  0.732252       815

[[ 52  30   0   9   0]
 [ 49 407  14  61   4]
 [  0   1  12   1   0]
 [  8  41   1 102   2]
 [  1   2   0   0  18]]
