In [1]:
import numpy as np
import pandas as pd
import xlrd as xl
from pandas import ExcelWriter
from pandas import ExcelFile
import pickle, re, json, os, datetime, time

import pprint
pp = pprint.PrettyPrinter(indent=4)

In [2]:
## Read the data from the pickle file
all_data = pd.read_pickle('/content/all_data.pkl')
print("Size of corpus: "+str(len(all_data)))

Size of corpus: 4330


In [3]:
## Get the set of all unique knowledge types in the corpus

knowledge_types = list(set(all_data['Code']))
print("Number of unique knowledge types: "+str(len(knowledge_types)))
knowledge_types.sort()

pp.pprint(knowledge_types)

Number of unique knowledge types: 13
[   'Action on Issue',
    'Bug Reproduction',
    'Contribution and Commitment',
    'Expected Behaviour',
    'Investigation and Exploration',
    'Motivation',
    'Observed Bug Behaviour',
    'Potential New Issues and Requests',
    'Social Conversation',
    'Solution Discussion',
    'Task Progress',
    'Usage',
    'Workarounds']


In [4]:
projects = ['tensorflow','scikit-learn','spaCy']
for proj in projects:
    print("Number of sentences from "+proj+"issues: "+str(len(all_data[all_data.Document.str.contains(proj)])))

Number of sentences from tensorflowissues: 2100
Number of sentences from scikit-learnissues: 1401
Number of sentences from spaCyissues: 829


In [5]:
all_data.iloc[0:3]

Unnamed: 0,Document,Text Content,Code,Full Length,len,tloc,cloc,tpos1,tpos2,clen,tlen,ppau,npau,aa,begauth,has_code,first_turn,last_turn
0,1 37_tensorflow.doc,Node.js (JavaScript) Wrapper API,Expected Behaviour,32,32,0.5,0.002294,0.0,1.0,1.0,0.055556,0.0,0.000465,NONE,True,False,True,False
1,1 37_tensorflow.doc,Because JavaScript is Awesome,Motivation,29,29,1.0,0.004587,0.0,1.0,1.0,0.055556,0.0,0.000465,NONE,True,False,True,False
2,1 37_tensorflow.doc,+1!,Social Conversation,3,3,1.0,0.006881,2.6e-05,0.999974,1.0,0.013889,0.000465,0.000916,NONE,False,False,False,False


Transformation on the data:
Drop Full Length
Convert begauth which contains values True and False to One Hot Encoding
Convert the time-based feature tpos2 to a numeric field.

In [6]:
## Drop "Full Length"
transformed_data = all_data[['Document','Text Content','Code','len','tloc','cloc','tpos1','tpos2','clen','tlen','ppau','npau','aa','begauth','has_code','first_turn','last_turn']]

# Convert "begauth" which contains values `True` and `False` to One Hot Encoding
transformed_data = pd.get_dummies(transformed_data,columns = ['begauth'])

# Convert the time-based feature "tpos2" to a numeric field.
transformed_data.tpos2.astype(int)

print('Done')

Done


In [8]:
transformed_data.iloc[0:1]

Unnamed: 0,Document,Text Content,Code,len,tloc,cloc,tpos1,tpos2,clen,tlen,ppau,npau,aa,has_code,first_turn,last_turn,begauth_False,begauth_True
0,1 37_tensorflow.doc,Node.js (JavaScript) Wrapper API,Expected Behaviour,32,0.5,0.002294,0.0,1.0,1.0,0.055556,0.0,0.000465,NONE,False,True,False,0,1


Notice that the field Full Length no longer exists and the field begauth has now been changed to begauth_False and begauth_True.



In [10]:
from sklearn.model_selection import train_test_split

X = transformed_data['Text Content']
y = transformed_data['Code']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [27]:
X = transformed_data['Text Content']
y = transformed_data['Code']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10, shuffle=True)

vectorizer = CountVectorizer()
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

clf = LogisticRegression()
model = clf.fit(X_train_tf, y_train)
y_pred = clf.predict(X_test_tf)

print(classification_report(y_test, y_pred))

                                   precision    recall  f1-score   support

                  Action on Issue       0.67      0.29      0.40         7
                 Bug Reproduction       0.49      0.36      0.42        55
      Contribution and Commitment       0.50      0.24      0.32        17
               Expected Behaviour       0.20      0.06      0.09        18
    Investigation and Exploration       0.36      0.25      0.29        65
                       Motivation       0.45      0.21      0.29        61
           Observed Bug Behaviour       0.50      0.19      0.28        31
Potential New Issues and Requests       0.37      0.20      0.26        49
              Social Conversation       0.55      0.82      0.66       172
              Solution Discussion       0.57      0.72      0.64       291
                    Task Progress       0.31      0.24      0.27        21
                            Usage       0.48      0.45      0.46        64
                      Wo

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [31]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

from sklearn.metrics import accuracy_score

# Encode labels as integers
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Convert labels to one-hot encoding
y_train = to_categorical(y_train, num_classes=13)
y_test = to_categorical(y_test, num_classes=13)

# Create a sequence of tokens from the text data
max_sequence_length = 1000
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a fixed length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=50, input_length=max_sequence_length))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(units=13, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=10, batch_size=32)

# Evaluate the model on test data
y_pred = model.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)
y_test_decoded = np.argmax(y_test, axis=1)
accuracy = accuracy_score(y_test_decoded, y_pred)
print('Test accuracy:', accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.4838337182448037


In [34]:
from sklearn.metrics import classification_report

# Convert one-hot encoded test labels back to integer labels
y_test_int = np.argmax(y_test, axis=1)

# Generate predictions for test data
y_pred = model.predict(X_test_pad)
y_pred_int = np.argmax(y_pred, axis=1)

# Print classification report
print(classification_report(y_test_int, y_pred_int))


              precision    recall  f1-score   support

           0       0.60      0.43      0.50         7
           1       0.43      0.44      0.43        55
           2       0.00      0.00      0.00        17
           3       0.07      0.11      0.09        18
           4       0.35      0.37      0.36        65
           5       0.26      0.16      0.20        61
           6       0.50      0.16      0.24        31
           7       0.22      0.18      0.20        49
           8       0.66      0.72      0.69       172
           9       0.58      0.64      0.61       291
          10       0.12      0.14      0.13        21
          11       0.41      0.45      0.43        64
          12       0.00      0.00      0.00        15

    accuracy                           0.48       866
   macro avg       0.32      0.29      0.30       866
weighted avg       0.47      0.48      0.47       866

