<a href="https://colab.research.google.com/github/SaguPandya96/Research-Project-Fall-2020/blob/master/ML_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Research Project**








**1. Import the libraries, define the helper functions, and load the data.**





*   **Naive Bayes with Chromium bugs data**





In [None]:
'''
This file contains miscellaneous utilities for text processing.
'''
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

def strip_punctuations(data, column_name='text'):
  '''
  Remove every ASCII punctuation character from one text column.

  All characters listed in `string.punctuation` are deleted wherever they
  occur in a value (not only at the ends of tokens). Each value is coerced
  with `str()` first, so a missing value (NaN) becomes the literal text 'nan'.
  This uses suggestion from https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate
  to accomplish this really fast.

  Args:
    data: pandas DataFrame; the chosen column is overwritten in place.
    column_name: name of the column to clean (defaults to 'text').

  Returns:
    The same DataFrame object, with `column_name` holding the cleaned strings.
  '''
  translator = str.maketrans('', '', string.punctuation)
  # Honour `column_name` instead of always touching the 'text' column.
  data[column_name] = data[column_name].map(lambda s: str(s).translate(translator))
  return data

def stemm_text(data, stemmer_choice='Lancaster'):
  '''
  Replace every word of the 'text' column with its stem, so that different
  forms of the same word collapse into one token.

  `stemmer_choice` selects the NLTK stemmer: 'Lancaster' (default),
  'Snowball' (English) or 'Porter'. Any other value raises an Exception.
  The DataFrame is modified in place and also returned.
  '''
  # Reject unknown choices before constructing anything.
  if stemmer_choice not in ('Lancaster', 'Snowball', 'Porter'):
    raise Exception('Illegal stemmer_choice argument')

  if stemmer_choice == 'Porter':
    stemmer = PorterStemmer()
  elif stemmer_choice == 'Snowball':
    stemmer = SnowballStemmer('english')
  else:
    stemmer = LancasterStemmer()

  def stem_sentence(sentence):
    return ' '.join(stemmer.stem(token) for token in sentence.split())

  data['text'] = data['text'].map(stem_sentence)
  return data

def remove_long_words(data, maxlen=16):
  '''
  Blank out every word of the 'text' column that has `maxlen` or more
  characters.

  A removed word is replaced by a single-space placeholder before the words
  are re-joined with spaces, so extra whitespace is left where it stood.
  The DataFrame is modified in place and also returned.
  '''
  def blank_long(sentence):
    kept = [token if len(token) < maxlen else ' ' for token in sentence.split()]
    return ' '.join(kept)

  data['text'] = data['text'].map(blank_long)
  return data

def remove_short_words(data, minlen=4):
  '''
  Blank out every word of the 'text' column that has fewer than `minlen`
  characters.

  A removed word is replaced by a single-space placeholder before the words
  are re-joined with spaces, so extra whitespace is left where it stood.
  The DataFrame is modified in place and also returned.
  '''
  def blank_short(sentence):
    kept = []
    for token in sentence.split():
      kept.append(' ' if len(token) < minlen else token)
    return ' '.join(kept)

  data['text'] = data['text'].map(blank_short)
  return data

def remove_linux_garbage(data):
  '''
  Blank out "garbage" tokens in the 'text' column.

  Linux data contains lots of garbage, e.g. memory addresses - 0000f800.
  A token counts as garbage when it is at least 7 characters long and
  contains at least 2 digits; it is replaced by a single-space placeholder.
  The DataFrame is modified in place and also returned.
  '''
  def clean(sentence):
    pieces = []
    for token in sentence.split():
      digit_count = sum(1 for ch in token if ch.isdigit())
      looks_like_garbage = len(token) >= 7 and digit_count >= 2
      pieces.append(' ' if looks_like_garbage else token)
    return ' '.join(pieces)

  data['text'] = data['text'].map(clean)
  return data

def cast_to_lowercase(data):
  '''
  Lower-case every value of the 'text' column.

  The DataFrame is modified in place and also returned.
  '''
  def to_lower(value):
    return value.lower()

  lowered = data['text'].map(to_lower)
  data['text'] = lowered
  return data

def remove_stopwords(data):
  '''
  Blank out English stop words in the 'text' column.

  The NLTK English stop-word list is used after deleting punctuation from
  each entry. Every matching token is replaced by a single-space placeholder.
  The DataFrame is modified in place and also returned.
  '''
  translator = str.maketrans('', '', string.punctuation)
  # Apostrophes were removed from the text already, so strip them from the
  # stop words too before comparing.
  stop_words = {word.translate(translator) for word in stopwords.words('english')}

  def blank_stop_words(sentence):
    return ' '.join(' ' if token in stop_words else token for token in sentence.split())

  data['text'] = data['text'].map(blank_stop_words)
  return data

def remove_rare_words(data, min_count=3):
  '''
  Blank out words that occur in fewer than `min_count` rows of the 'text'
  column.

  The count is a document frequency: a word is counted once per row, however
  many times it appears in that row. Each rare word is replaced by a
  single-space placeholder before the row is re-joined with spaces.

  Args:
    data: pandas DataFrame with a string 'text' column; modified in place.
    min_count: minimum number of rows a word must appear in to be kept.

  Returns:
    The same DataFrame object with the filtered 'text' column.
  '''
  from collections import Counter

  # Tokenise each row once and reuse the tokens for counting and rewriting.
  tokenized = [sentence.split() for sentence in data['text']]

  row_counts = Counter()
  for tokens in tokenized:
    row_counts.update(set(tokens))  # set(): count each word once per row

  data['text'] = [
    ' '.join(w if row_counts[w] >= min_count else ' ' for w in tokens)
    for tokens in tokenized
  ]
  return data

import pandas as pd

# Locations of the bug datasets; the loaders below read them as tab-separated files.
# NOTE(review): the '/content/drive' prefix presumably requires Google Drive to be mounted in Colab - confirm.
LINUX_BUGS_DATA_PATH = '/content/drive/My Drive/Research Project/linux_bugs_usage_ready.csv'
CHROMIUM_BUGS_DATA_PATH = '/content/drive/My Drive/Research Project/chromium.csv'

def merge_title_and_message(data, message_col_name='message'):
  '''
  Merge the two text feature columns of a bug dataset into a single `text`
  column.

  The `title` column and the message column are joined with one space. The
  message column is called `message` in the linux dataset; pass another name
  (e.g. 'description' for chromium) through `message_col_name`.

  Missing values are treated as empty strings. Otherwise one missing field
  would turn the whole merged text into NaN and discard the other field.

  Args:
    data: pandas DataFrame containing `title` and `message_col_name`.
    message_col_name: name of the column holding the bug message.

  Returns:
    A new DataFrame without the two source columns and with `text` appended
    as the last column. The input DataFrame is left unchanged.
  '''
  title = data['title'].fillna('').astype(str)
  message = data[message_col_name].fillna('').astype(str)
  # drop() returns a new frame, so adding `text` does not touch the caller's data.
  merged = data.drop(['title', message_col_name], axis=1)
  merged['text'] = title + ' ' + message
  return merged

def load_linux_bug_data():
  '''
  Load linux bugs dataset and apply the preprocessing pipeline.

  Reads the tab-separated file at LINUX_BUGS_DATA_PATH, merges `title` and
  `message` into `text`, strips punctuation and blanks out garbage tokens
  such as memory addresses.

  Returns:
    The preprocessed pandas DataFrame.
  '''
  data = pd.read_csv(LINUX_BUGS_DATA_PATH, sep='\t')
  data = merge_title_and_message(data)
  data = strip_punctuations(data)
  # data = stemm_text(data) - this has shown poor results
  data = remove_linux_garbage(data)
  return data

def load_chromium_bug_data():
  '''
  Read the chromium bugs dataset and run its preprocessing pipeline.

  The tab-separated file at CHROMIUM_BUGS_DATA_PATH is loaded, `title` and
  `description` are merged into `text`, and punctuation is stripped.
  Returns the resulting DataFrame.
  '''
  raw = pd.read_csv(CHROMIUM_BUGS_DATA_PATH, sep='\t')
  merged = merge_title_and_message(raw, message_col_name='description')
  return strip_punctuations(merged)

from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob.classifiers import DecisionTreeClassifier
from sklearn.metrics import f1_score

def naive_bayes_classify(data, min_word_count=5, train_size=3000, test_size=600):
  '''
  Train a textblob Naive Bayes classifier on the 'text' column and print its
  accuracy and weighted F1 score for predicting the 'type' column.

  Words that occur in fewer than `min_word_count` rows are dropped from every
  text before training. The first `train_size` rows form the training set and
  the following `test_size` rows form the test set, so the two never overlap.
  (Previously rows 3000-3399 were in both sets, which leaked test examples
  into training and inflated the reported scores.)

  Args:
    data: pandas DataFrame with string 'text' and 'type' columns.
    min_word_count: minimum number of rows a word must appear in to be kept.
    train_size: number of leading rows used for training.
    test_size: number of rows after the training rows used for evaluation.

  Returns:
    None; the results are printed.
  '''
  from collections import Counter

  class_to_predict = 'type' # product importance

  # Tokenise each text once and reuse the tokens for counting and filtering.
  tokenized = [text.split() for text in data['text']]
  labels = list(data[class_to_predict])

  # Row (document) frequency: each word is counted once per text.
  text_counts = Counter()
  for tokens in tokenized:
    text_counts.update(set(tokens))

  all_data = [
    (' '.join(w for w in tokens if text_counts[w] >= min_word_count), label)
    for tokens, label in zip(tokenized, labels)
  ]

  print('Finished preprocessing!')

  # Disjoint split: the test rows start exactly where the training rows end.
  training_corpus = all_data[:train_size]
  test_corpus = all_data[train_size:train_size + test_size]

  model = NBC(training_corpus, verbose=True)
  print('Done training!')
  print('Accuracy: ' + str(model.accuracy(test_corpus)))

  y_true = [label for _, label in test_corpus]
  # The most probable class of the predicted distribution is the prediction.
  y_pred = [model.prob_classify(text).max() for text, _ in test_corpus]

  print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))

# Cell entry point: load the preprocessed Chromium bug data and run the
# Naive Bayes experiment, printing progress along the way.
if __name__ == '__main__':
  print('Loading data!')
  data = load_chromium_bug_data()
  print('Classifying with NaiveBayes!')
  naive_bayes_classify(data)
  print('Done!')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Loading data!
Classifying with NaiveBayes!
Finished preprocessing!
Done training!
Accuracy: 0.74
F1 score: 0.736676676547138
Done!




*   **TF-IDF with SVM on Chromium bugs data**



In [None]:
import itertools
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
import numpy as np


def tfidf_classify(data, model_type='SVM', extra_params=None):
  '''
  Classify the 'type' column from TF-IDF features of the 'text' column.

  The rows are shuffled with a fixed seed; the first 85% are used for
  training and the remaining 15% for testing. Training accuracy, test
  accuracy and the weighted F1 score on the test set are printed.

  Args:
    data: a pandas dataframe with 'text' and 'type' columns.
    model_type: 'SVM' for a linear-kernel SVC, or 'NN' for an MLPClassifier
      with two hidden layers of 200 units.
    extra_params: keyword arguments forwarded to TfidfVectorizer. Defaults to
      {'min_df': 0.001} when omitted.

  Returns:
    The accuracy on the test split.

  Raises:
    ValueError: if `model_type` is neither 'SVM' nor 'NN'.
  '''
  # Fail fast on a bad model type instead of hitting an unbound `model`
  # after the (expensive) vectorisation.
  if model_type not in ('SVM', 'NN'):
    raise ValueError("Illegal model_type argument: expected 'SVM' or 'NN', got " + repr(model_type))
  # Build the default here so that no dict object is shared between calls.
  if extra_params is None:
    extra_params = {'min_df': 0.001}

  class_to_predict = 'type' # product importance
  data = shuffle(data, random_state=77)

  split_at = int(0.85 * len(data))
  data_train = data[:split_at]
  data_test = data[split_at:]

  train_data = data_train['text'].tolist()
  train_labels = data_train[class_to_predict].tolist()

  test_data = data_test['text'].tolist()
  test_labels = data_test[class_to_predict].tolist()

  # Fit the vocabulary and IDF weights on the training texts only, then
  # reuse them to transform the test texts.
  vectorizer = TfidfVectorizer(**extra_params)
  train_vectors = vectorizer.fit_transform(train_data)
  test_vectors = vectorizer.transform(test_data)

  if model_type == 'SVM':
    model = svm.SVC(kernel='linear')
  else:
    model = MLPClassifier(hidden_layer_sizes=(200, 200), max_iter=4000)
  print('Training the model!')
  model.fit(train_vectors, train_labels)
  train_prediction = model.predict(train_vectors)
  test_prediction = model.predict(test_vectors)

  train_accuracy = np.sum((np.array(train_labels) == np.array(train_prediction))) * 1.0 / len(train_labels)
  print('Training accuracy: ' + str(train_accuracy))

  test_accuracy = np.sum((np.array(test_labels) == np.array(test_prediction))) * 1.0 / len(test_labels)
  print('Test accuracy: ' + str(test_accuracy))

  print('F1 score: ' + str(f1_score(test_labels, test_prediction, average='weighted')))

  return test_accuracy

# Cell entry point: load the Chromium bug data, apply the extra text cleaning
# and run the TF-IDF + SVM experiment.
if __name__ == '__main__':
  print('Loading data!')
  data = load_chromium_bug_data()

  # Additional preprocessing steps used for the TF-IDF models. Per the
  # original note they are not applied for FastText: it can either do them
  # by itself, or they do not help it.
  data = cast_to_lowercase(data)
  data = remove_stopwords(data)
  data = remove_rare_words(data, min_count=3)

  print('Classifying with TFIDF-based approach!')

  tfidf_classify(data, model_type='SVM')
  print('Done!')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Loading data!
Classifying with TFIDF-based approach!
Training the model!
Training accuracy: 0.8385691446842526
Test accuracy: 0.7997961725738875
F1 score: 0.7992444649303668
Done!




*   **FastText with Chromium bugs data**



In [None]:
# Download the fastText 0.2.0 source archive, unpack it, enter the source
# directory and build it with make.
!wget https://github.com/facebookresearch/fastText/archive/0.2.0.zip
!unzip 0.2.0.zip
%cd fastText-0.2.0
!make

--2020-09-03 01:19:19--  https://github.com/facebookresearch/fastText/archive/0.2.0.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/facebookresearch/fastText/zip/0.2.0 [following]
--2020-09-03 01:19:19--  https://codeload.github.com/facebookresearch/fastText/zip/0.2.0
Resolving codeload.github.com (codeload.github.com)... 140.82.114.9
Connecting to codeload.github.com (codeload.github.com)|140.82.114.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘0.2.0.zip’

0.2.0.zip               [  <=>               ]   4.10M  15.7MB/s    in 0.3s    

2020-09-03 01:19:20 (15.7 MB/s) - ‘0.2.0.zip’ saved [4304799]

Archive:  0.2.0.zip
7842495a4d64c7a3bb4339d45d6e64321d002ed8
   creating: fastText-0.2.0/
   creating: fastText-0.2.0/.circleci/
  inflating: fastText-0.2.0/.

In [None]:
# Clone the fastText repository into the current working directory.
!git clone https://github.com/facebookresearch/fastText.git

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.22 MiB | 29.55 MiB/s, done.
Resolving deltas: 100% (2419/2419), done.


In [None]:
# Enter the cloned repository and install the fasttext Python package from source.
%cd fastText
!pip install .

/content/fastText-0.2.0/fastText
Processing /content/fastText-0.2.0/fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3013270 sha256=c94d698b23d811f12da79b409aac2e3eb33ef39ce51065571c32a0dcefa2cb2a
  Stored in directory: /tmp/pip-ephem-wheel-cache-jtppq0r8/wheels/bd/b4/45/ba7d23789b3bf6bb213208004c2aa23a6405031a0c99a19e1e
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [None]:
'''
This file contains miscellaneous utilities for text processing.
'''
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')

def strip_punctuations(data, column_name='text'):
  '''
  Remove every ASCII punctuation character from one text column.

  All characters listed in `string.punctuation` are deleted wherever they
  occur in a value (not only at the ends of tokens). Each value is coerced
  with `str()` first, so a missing value (NaN) becomes the literal text 'nan'.
  This uses suggestion from https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate
  to accomplish this really fast.

  Args:
    data: pandas DataFrame; the chosen column is overwritten in place.
    column_name: name of the column to clean (defaults to 'text').

  Returns:
    The same DataFrame object, with `column_name` holding the cleaned strings.
  '''
  translator = str.maketrans('', '', string.punctuation)
  # Honour `column_name` instead of always touching the 'text' column.
  data[column_name] = data[column_name].map(lambda s: str(s).translate(translator))
  return data

def stemm_text(data, stemmer_choice='Lancaster'):
  '''
  Replace every word of the 'text' column with its stem, so that different
  forms of the same word collapse into one token.

  `stemmer_choice` selects the NLTK stemmer: 'Lancaster' (default),
  'Snowball' (English) or 'Porter'. Any other value raises an Exception.
  The DataFrame is modified in place and also returned.
  '''
  # Reject unknown choices before constructing anything.
  if stemmer_choice not in ('Lancaster', 'Snowball', 'Porter'):
    raise Exception('Illegal stemmer_choice argument')

  if stemmer_choice == 'Porter':
    stemmer = PorterStemmer()
  elif stemmer_choice == 'Snowball':
    stemmer = SnowballStemmer('english')
  else:
    stemmer = LancasterStemmer()

  def stem_sentence(sentence):
    stems = [stemmer.stem(token) for token in sentence.split()]
    return ' '.join(stems)

  data['text'] = data['text'].map(stem_sentence)
  return data

def remove_long_words(data, maxlen=16):
  '''
  Blank out every word of the 'text' column that has `maxlen` or more
  characters.

  A removed word is replaced by a single-space placeholder before the words
  are re-joined with spaces, so extra whitespace is left where it stood.
  The DataFrame is modified in place and also returned.
  '''
  def blank_long(sentence):
    kept = []
    for token in sentence.split():
      kept.append(token if len(token) < maxlen else ' ')
    return ' '.join(kept)

  data['text'] = data['text'].map(blank_long)
  return data

def remove_short_words(data, minlen=4):
  '''
  Blank out every word of the 'text' column that has fewer than `minlen`
  characters.

  A removed word is replaced by a single-space placeholder before the words
  are re-joined with spaces, so extra whitespace is left where it stood.
  The DataFrame is modified in place and also returned.
  '''
  def blank_short(sentence):
    return ' '.join(' ' if len(token) < minlen else token for token in sentence.split())

  data['text'] = data['text'].map(blank_short)
  return data

def remove_linux_garbage(data):
  '''
  Blank out "garbage" tokens in the 'text' column.

  Linux data contains lots of garbage, e.g. memory addresses - 0000f800.
  A token counts as garbage when it is at least 7 characters long and
  contains at least 2 digits; it is replaced by a single-space placeholder.
  The DataFrame is modified in place and also returned.
  '''
  def keep_or_blank(token):
    if len(token) < 7:
      return token
    digits = sum(ch.isdigit() for ch in token)
    return ' ' if digits >= 2 else token

  def clean(sentence):
    return ' '.join(keep_or_blank(token) for token in sentence.split())

  data['text'] = data['text'].map(clean)
  return data

def cast_to_lowercase(data):
  '''
  Lower-case every value of the 'text' column.

  The DataFrame is modified in place and also returned.
  '''
  def to_lower(value):
    return value.lower()

  lowered = data['text'].map(to_lower)
  data['text'] = lowered
  return data

def remove_stopwords(data):
  '''
  Blank out English stop words in the 'text' column.

  The NLTK English stop-word list is used after deleting punctuation from
  each entry. Every matching token is replaced by a single-space placeholder.
  The DataFrame is modified in place and also returned.
  '''
  translator = str.maketrans('', '', string.punctuation)
  # Apostrophes were removed from the text already, so strip them from the
  # stop words too before comparing.
  stop_words = {word.translate(translator) for word in stopwords.words('english')}

  def blank_stop_words(sentence):
    kept = []
    for token in sentence.split():
      kept.append(' ' if token in stop_words else token)
    return ' '.join(kept)

  data['text'] = data['text'].map(blank_stop_words)
  return data

def remove_rare_words(data, min_count=3):
  '''
  Blank out words that occur in fewer than `min_count` rows of the 'text'
  column.

  The count is a document frequency: a word is counted once per row, however
  many times it appears in that row. Each rare word is replaced by a
  single-space placeholder before the row is re-joined with spaces.

  Args:
    data: pandas DataFrame with a string 'text' column; modified in place.
    min_count: minimum number of rows a word must appear in to be kept.

  Returns:
    The same DataFrame object with the filtered 'text' column.
  '''
  from collections import Counter

  # Tokenise each row once and reuse the tokens for counting and rewriting.
  tokenized = [sentence.split() for sentence in data['text']]

  row_counts = Counter()
  for tokens in tokenized:
    row_counts.update(set(tokens))  # set(): count each word once per row

  data['text'] = [
    ' '.join(w if row_counts[w] >= min_count else ' ' for w in tokens)
    for tokens in tokenized
  ]
  return data

import pandas as pd

# Locations of the bug datasets; the loaders below read them as tab-separated files.
# NOTE(review): the '/content/drive' prefix presumably requires Google Drive to be mounted in Colab - confirm.
LINUX_BUGS_DATA_PATH = '/content/drive/My Drive/Research Project/linux_bugs_usage_ready.csv'
CHROMIUM_BUGS_DATA_PATH = '/content/drive/My Drive/Research Project/chromium.csv'

def merge_title_and_message(data, message_col_name='message'):
  '''
  Merge the two text feature columns of a bug dataset into a single `text`
  column.

  The `title` column and the message column are joined with one space. The
  message column is called `message` in the linux dataset; pass another name
  (e.g. 'description' for chromium) through `message_col_name`.

  Missing values are treated as empty strings. Otherwise one missing field
  would turn the whole merged text into NaN and discard the other field.

  Args:
    data: pandas DataFrame containing `title` and `message_col_name`.
    message_col_name: name of the column holding the bug message.

  Returns:
    A new DataFrame without the two source columns and with `text` appended
    as the last column. The input DataFrame is left unchanged.
  '''
  title = data['title'].fillna('').astype(str)
  message = data[message_col_name].fillna('').astype(str)
  # drop() returns a new frame, so adding `text` does not touch the caller's data.
  merged = data.drop(['title', message_col_name], axis=1)
  merged['text'] = title + ' ' + message
  return merged

def load_linux_bug_data():
  '''
  Read the linux bugs dataset and run its preprocessing pipeline.

  The tab-separated file at LINUX_BUGS_DATA_PATH is loaded, then the title
  and message are merged, punctuation is stripped and garbage tokens are
  blanked out, in that order. Returns the resulting DataFrame.
  '''
  frame = pd.read_csv(LINUX_BUGS_DATA_PATH, sep='\t')
  # stemm_text is intentionally left out of the pipeline - it has shown poor results.
  for step in (merge_title_and_message, strip_punctuations, remove_linux_garbage):
    frame = step(frame)
  return frame

def load_chromium_bug_data():
  '''
  Read the chromium bugs dataset and run its preprocessing pipeline.

  The tab-separated file at CHROMIUM_BUGS_DATA_PATH is loaded, `title` and
  `description` are merged into `text`, and punctuation is stripped.
  Returns the resulting DataFrame.
  '''
  raw = pd.read_csv(CHROMIUM_BUGS_DATA_PATH, sep='\t')
  merged = merge_title_and_message(raw, message_col_name='description')
  return strip_punctuations(merged)
  
import itertools
import nltk
nltk.download('stopwords')


'''
FastText requires specific training file format - see
https://github.com/facebookresearch/fastText for details.
'''

import fasttext
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

# Files that fasttext_classify writes the train/test splits to before training.
TRAIN_PATH = './fasttext_train.txt'
TEST_PATH = './fasttext_test.txt'
# Only referenced by the commented-out model.save_model call in fasttext_classify.
MODEL_PATH = './ft.model'

def fasttext_classify(data, extra_params=None):
  '''
  Train a supervised FastText model that predicts the 'type' column from the
  'text' column, and print training/test scores and the weighted F1 score.

  Each row is written as '<text> __label__<type>' (spaces removed from the
  label). The rows are shuffled with a fixed seed; the first 85% go to
  TRAIN_PATH and the remaining 15% to TEST_PATH.

  Args:
    data: pandas DataFrame with string 'text' and 'type' columns. It is not
      modified.
    extra_params: keyword arguments forwarded to fasttext.train_supervised.
      Defaults to no extra arguments.

  Returns:
    The last element of the tuple returned by model.test on the test file,
    which this code reports as the test accuracy.
  '''
  # Avoid a shared mutable default argument.
  if extra_params is None:
    extra_params = {}

  class_to_predict = 'type' # product importance
  # Labels must not contain spaces. Work on a derived Series so that the
  # caller's DataFrame is left untouched.
  labels = data[class_to_predict].map(lambda s : s.replace(" ", ""))
  data_for_fasttext = data['text'] + ' __label__' + labels
  data_for_fasttext = shuffle(data_for_fasttext, random_state=77)

  split_at = int(0.85 * len(data_for_fasttext))
  data_train = data_for_fasttext[:split_at]
  data_test = data_for_fasttext[split_at:]

  data_train.to_csv(TRAIN_PATH, sep='\t', header=0, index=False)
  data_test.to_csv(TEST_PATH, sep='\t', header=0, index=False)

  model = fasttext.train_supervised(TRAIN_PATH, **extra_params)
  #model.save_model(MODEL_PATH)
  # NOTE(review): model.test returns a tuple; per the fastText docs its last
  # element is recall at k=1, which is printed here as accuracy - confirm.
  print('Training accuracy:')
  train_accuracy = model.test(TRAIN_PATH)
  print(train_accuracy[-1])

  print('Test accuracy:')
  test_accuracy = model.test(TEST_PATH)
  print(test_accuracy[-1])

  y_pred = []
  y_true = []
  for test_item in data_test:
    # Split on the final marker only: the label was appended last.
    test_text, test_label = test_item.rsplit('__label__', 1)
    # predict() returns (labels, probabilities); keep the top label string
    # so that y_pred holds plain strings like y_true.
    y_pred.append(model.predict(test_text)[0][0])
    y_true.append('__label__' + test_label)

  print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))

  return test_accuracy[-1] # accuracy is a tuple

# Cell entry point: load the Chromium bug data and run the FastText experiment
# with hand-picked training hyperparameters.
if __name__ == '__main__':
  print('Loading data!')
  data = load_chromium_bug_data()
  print('Classifying with FastText!')
  # These are the optimal parameters for the 'importance' prediction
  # For other columns they are different.
  # NOTE(review): fasttext_classify currently predicts the 'type' column, not
  # 'importance' - confirm these parameters are the intended ones.
  fasttext_classify(data, extra_params={
    'epoch': 15,
    'minCount': 1,
    'dim': 150,
    'ws': 5,
    'neg': 5,
    'wordNgrams': 2,
    'verbose': 2
  })

  print('Done!')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Loading data!
Classifying with FastText!
Training accuracy:
0.996123101518785
Test accuracy:
0.8172347412524062
F1 score: 0.8169721005635936
Done!
