In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
import gensim
from gensim.models import Word2Vec
import numpy as np
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import pickle

warnings.filterwarnings(action = 'ignore')

In [2]:
df=pd.read_csv('input9.csv')

In [3]:
df.head()

Unnamed: 0,comment,class
0,FIXME formatters are not thread safe,DEFECT
1,"XXX Jon Skeet The comment ""if it hasn't bee...",DEFECT
2,"I hate to admit it, but we don't know what h...",DEFECT
3,Just a note StarTeam has a status for NEW wh...,DEFECT
4,the generated classes must not be added in t...,DEFECT


In [4]:
df['class'].value_counts()

WITHOUT_CLASSIFICATION    58204
DESIGN                     2703
IMPLEMENTATION              757
DEFECT                      472
TEST                         85
DOCUMENTATION                54
Name: class, dtype: int64

In [5]:
df['class']=df['class'].apply(lambda x: 0 if x=='WITHOUT_CLASSIFICATION' else 1)

In [6]:
df['class'].value_counts()

0    58204
1     4071
Name: class, dtype: int64

In [7]:
X = df['comment']
Y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

In [8]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(X_train).toarray()
final_features.shape

(49820, 16540)

In [9]:
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', LogisticRegression(random_state=42, class_weight='balanced'))])

model = pipeline.fit(X_train, y_train)
with open('LogisticRegression_Imbalanced.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, model.predict(X_test), digits=6))
print(confusion_matrix(ytest, model.predict(X_test)))

              precision    recall  f1-score   support

           0   0.992487  0.962290  0.977155     11668
           1   0.614711  0.891995  0.727838       787

    accuracy                       0.957848     12455
   macro avg   0.803599  0.927142  0.852497     12455
weighted avg   0.968616  0.957848  0.961401     12455

[[11228   440]
 [   85   702]]
