## Phraseology Features

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Word Count, Sentence Count, Word count in a Sentence

In [None]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm

nltk.download('punkt')

def word_count(document):
  """Count the word tokens of ``document`` that contain a letter or digit.

  The text is split with NLTK's ``word_tokenize``; tokens with no ASCII
  letter or digit (for example bare punctuation) are not counted.

  Args:
    document: Text to analyse.

  Returns:
    The number of tokens containing at least one of ``[A-Za-z0-9]``.
  """
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
  return sum(1 for token in word_tokenize(document) if has_alnum.match(token))

def sentence_count(document):
  """Count the sentences of ``document`` that contain a letter or digit.

  Sentences come from NLTK's ``sent_tokenize``. A sentence is kept when the
  pattern below matches it. Because ``re.match`` anchors at the start and
  ``.`` does not match a newline, the letter/digit has to occur before the
  first line break inside the sentence.

  Args:
    document: Text to analyse.

  Returns:
    The number of sentences that pass the filter.
  """
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
  kept = 0
  for sentence in sent_tokenize(document):
    if has_alnum.match(sentence):
      kept += 1
  return kept

def paragraph_count(document):
  """Count the non-empty paragraphs of ``document``.

  Every line produced by ``str.splitlines`` is treated as one paragraph;
  lines without any ASCII letter or digit (blank or punctuation-only lines)
  are ignored.

  Args:
    document: Text to analyse.

  Returns:
    The number of lines containing at least one of ``[A-Za-z0-9]``.
  """
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
  paragraphs = [line for line in document.splitlines() if has_alnum.match(line)]
  return len(paragraphs)

def word_count_sent(document):
  """Mean and standard deviation of the number of words per sentence.

  Sentences are obtained with ``sent_tokenize`` and kept only when the
  letter/digit pattern matches; each kept sentence is measured with
  ``word_count``.

  Args:
    document: Text to analyse.

  Returns:
    ``(mean, std)`` where ``std`` is the population standard deviation
    (divides by the number of sentences), or ``(0, 0)`` when no sentence
    passes the filter.
  """
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit

  per_sentence = []
  for sentence in sent_tokenize(document):
    if has_alnum.match(sentence):
      per_sentence.append(word_count(sentence))

  if not per_sentence:
    return 0, 0

  n_sentences = len(per_sentence)
  mean = sum(per_sentence) / n_sentences
  variance = sum((count - mean) ** 2 for count in per_sentence) / n_sentences
  return mean, variance ** 0.5

def word_count_para(document):
  """Mean and standard deviation of the number of words per paragraph.

  Each line from ``str.splitlines`` that contains an ASCII letter or digit
  counts as a paragraph and is measured with ``word_count``.

  Args:
    document: Text to analyse.

  Returns:
    ``(mean, std)`` where ``std`` is the population standard deviation
    (divides by the number of paragraphs), or ``(0, 0)`` when there is no
    such paragraph.
  """
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit

  per_paragraph = []
  for line in document.splitlines():
    if has_alnum.match(line):
      per_paragraph.append(word_count(line))

  if not per_paragraph:
    return 0, 0

  n_paragraphs = len(per_paragraph)
  mean = sum(per_paragraph) / n_paragraphs
  variance = sum((count - mean) ** 2 for count in per_paragraph) / n_paragraphs
  return mean, variance ** 0.5

def sent_count_para(document):
  """Mean and standard deviation of the number of sentences per paragraph.

  Each line from ``str.splitlines`` that contains an ASCII letter or digit
  counts as a paragraph and is measured with ``sentence_count``.

  Args:
    document: Text to analyse.

  Returns:
    ``(mean, std)`` where ``std`` is the population standard deviation
    (divides by the number of paragraphs), or ``(0, 0)`` when there is no
    such paragraph.
  """
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit

  per_paragraph = []
  for line in document.splitlines():
    if has_alnum.match(line):
      per_paragraph.append(sentence_count(line))

  if not per_paragraph:
    return 0, 0

  n_paragraphs = len(per_paragraph)
  mean = sum(per_paragraph) / n_paragraphs
  variance = sum((count - mean) ** 2 for count in per_paragraph) / n_paragraphs
  return mean, variance ** 0.5

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Lexical Diversity

### Readability

In [None]:
import nltk
# Shell commands: install py-readability-metrics (imported below as `readability`)
# and fetch the NLTK punkt tokenizer data.
!pip install py-readability-metrics
!python -m nltk.downloader punkt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py-readability-metrics
  Downloading py_readability_metrics-1.4.5-py3-none-any.whl (26 kB)
Installing collected packages: py-readability-metrics
Successfully installed py-readability-metrics-1.4.5
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from readability import Readability

def readability_score(document):
  """Compute three readability scores for ``document``.

  Args:
    document: Text to analyse.

  Returns:
    A tuple ``(flesch_kincaid, flesch, ari)`` holding the ``.score`` attribute
    of the corresponding ``Readability`` results.

  NOTE(review): py-readability-metrics is understood to reject texts shorter
  than 100 words — confirm against the library before using this on short
  inputs.
  """
  metrics = Readability(document)
  # Evaluated left to right: Flesch-Kincaid, Flesch, then ARI.
  return metrics.flesch_kincaid().score, metrics.flesch().score, metrics.ari().score

### Richness

In [None]:
# Shell command: install the lexicalrichness package used by lexical_richness().
!pip install lexicalrichness

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lexicalrichness
  Downloading lexicalrichness-0.1.4.tar.gz (18 kB)
Building wheels for collected packages: lexicalrichness
  Building wheel for lexicalrichness (setup.py) ... [?25l[?25hdone
  Created wheel for lexicalrichness: filename=lexicalrichness-0.1.4-py2.py3-none-any.whl size=10109 sha256=8a1bfe22246c55a407601faf494b0304ce1ad2332456a2abff4246bdc6f54f96
  Stored in directory: /root/.cache/pip/wheels/40/53/09/ce0a119b59493ae5be4e9773457df832bbce66d926fce1d043
Successfully built lexicalrichness
Installing collected packages: lexicalrichness
Successfully installed lexicalrichness-0.1.4


In [None]:
from lexicalrichness import LexicalRichness
# NOTE(review): neither constant is referenced anywhere in the visible notebook —
# confirm they are unused before removing them.
sample_size = 10
iterations = 50 

def lexical_richness(document):
  """Compute MATTR and MTLD lexical-richness scores for ``document``.

  The MATTR window is 25 words for texts with more than 45 whitespace-separated
  words and one third of the word count otherwise. The window is then clamped
  to the range ``[1, lex.words]`` so that very short texts (fewer than three
  words) no longer produce a zero-width window, and the window never exceeds
  the number of words LexicalRichness found after its own tokenisation.

  Args:
    document: Text to analyse.

  Returns:
    ``[mattr, mtld]``. When LexicalRichness finds no words at all the result
    is ``[0.0, 0.0]``.
  """
  lex = LexicalRichness(document)

  # Nothing to measure; both metrics would fail on an empty word list.
  if lex.words == 0:
    return [0.0, 0.0]

  n_split_words = len(document.split())
  window = 25 if n_split_words > 45 else n_split_words // 3
  # Keep the window usable: at least one word, at most the words available.
  window = max(1, min(window, lex.words))

  return [lex.mattr(window_size=window), lex.mtld(threshold=0.72)]

## Punctuation Analysis

In [None]:
import string 

def total_punc_count(document):
  """Count the punctuation characters in ``document``.

  A character counts as punctuation when it is a member of
  ``string.punctuation`` (ASCII punctuation only).

  Args:
    document: Text to analyse.

  Returns:
    The number of punctuation characters.
  """
  return sum(1 for char in document if char in string.punctuation)

In [None]:
import string 

def special_punc_count(document, special_puncts):
  """Share of all punctuation taken up by each mark in ``special_puncts``.

  Args:
    document: Text to analyse.
    special_puncts: Strings to count with ``str.count``.

  Returns:
    One value per entry of ``special_puncts``: its occurrence count divided
    by ``total_punc_count(document)``. When the document has no punctuation
    at all, a list of integer zeros of the same length is returned.
  """
  occurrences = [document.count(mark) for mark in special_puncts]
  total = total_punc_count(document)

  if total == 0:
    return [0] * len(occurrences)

  return [float(hits) / total for hits in occurrences]

In [None]:
import string 

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
import re

def special_punc_count_sent(document, special_puncts):
  """Average number of occurrences of each special mark per sentence.

  Sentences come from ``sent_tokenize`` and are kept only when the
  letter/digit pattern matches.

  Counts are accumulated by position rather than via
  ``special_puncts.index(mark)``, so a mark listed more than once gets its own
  correct count at every position instead of all hits landing on the first
  occurrence.

  Args:
    document: Text to analyse.
    special_puncts: Sequence of strings to count with ``str.count``.

  Returns:
    One float per entry of ``special_puncts`` (total occurrences divided by
    the number of kept sentences), or a list of integer zeros when no
    sentence passes the filter.
  """
  nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit

  sentences = [sent for sent in sent_tokenize(document) if nonPunct.match(sent)]

  if not sentences:
    return [0 for _ in special_puncts]

  totals = [sum(sent.count(punct) for sent in sentences) for punct in special_puncts]

  return [float(total) / len(sentences) for total in totals]


def special_punc_count_para(document, special_puncts):
  """Average number of occurrences of each special mark per paragraph.

  Each line from ``str.splitlines`` that contains an ASCII letter or digit
  counts as a paragraph.

  Counts are accumulated by position rather than via
  ``special_puncts.index(mark)``, so a mark listed more than once gets its own
  correct count at every position instead of all hits landing on the first
  occurrence.

  Args:
    document: Text to analyse.
    special_puncts: Sequence of strings to count with ``str.count``.

  Returns:
    One float per entry of ``special_puncts`` (total occurrences divided by
    the number of kept paragraphs), or a list of integer zeros when there is
    no such paragraph.
  """
  nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit

  paragraphs = [line for line in document.splitlines() if nonPunct.match(line)]

  if not paragraphs:
    return [0 for _ in special_puncts]

  totals = [sum(para.count(punct) for para in paragraphs) for punct in special_puncts]

  return [float(total) / len(paragraphs) for total in totals]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Feature Extraction

### Combined function to get phraseology, lexical and punctuation features

In [None]:
def get_features(data, include_readability=False):
  """Build the phraseology, lexical-diversity and punctuation feature table.

  One feature row is produced per row of ``data``; the text is read from the
  ``text`` attribute of each ``itertuples`` record and converted with ``str``.
  An empty string is replaced by the placeholder ``"empty"``.

  The readability scores are optional and off by default, so that the values
  in each row line up with the declared column names (``mattr``/``mtld`` only
  for the diversity group).
  NOTE(review): py-readability-metrics is understood to reject texts shorter
  than 100 words — confirm before enabling ``include_readability``.

  Args:
    data: DataFrame with a ``text`` column.
    include_readability: When True, prepend ``fk_score``, ``f_score`` and
      ``ari_score`` (from ``readability_score``) to the diversity features.

  Returns:
    A DataFrame with one row per input row and the columns, in order:
    phraseology features, diversity features, punctuation features.
  """
  special_puncts = ["!","'", ",", "-", ":", ";", "?", "@", "\"", "=", "#"]
  special_punct_names = ["excla","apos", "comma", "hypn", "col", "semicol", "ques", "at", "qot", "dhypn", "hash"]

  phraseology_features = ["word_count", "sent_count", "para_count", "mean_word_count_sent", "std_word_count_sent", "mean_word_count_para", "std_word_count_para", "mean_sent_count_para", "std_sent_count_para"]

  diversity_features = ["mattr", "mtld"]
  if include_readability:
    diversity_features = ["fk_score", "f_score", "ari_score"] + diversity_features

  # One name per special mark for each of the three punctuation measures,
  # in the same order the values are appended below.
  punct_analysis_features = ["total_punct_count"]
  for suffix in ("_mean_count", "_mean_count_sent", "_mean_count_para"):
    punct_analysis_features.extend(name + suffix for name in special_punct_names)

  frame_cols = phraseology_features + diversity_features + punct_analysis_features

  data_features = []

  for value in data.itertuples():

    document = str(value.text)

    if not document:
      document = "empty"

    ## phraseology features
    feature_row = [
        word_count(document),
        sentence_count(document),
        paragraph_count(document),
    ]
    feature_row.extend(word_count_sent(document))   # mean, std of words per sentence
    feature_row.extend(word_count_para(document))   # mean, std of words per paragraph
    feature_row.extend(sent_count_para(document))   # mean, std of sentences per paragraph

    ## diversity features
    if include_readability:
      feature_row.extend(readability_score(document))  # fk, flesch, ari
    feature_row.extend(lexical_richness(document))     # mattr, mtld

    ## punctuation features
    feature_row.append(total_punc_count(document))
    feature_row.extend(special_punc_count(document, special_puncts))
    feature_row.extend(special_punc_count_sent(document, special_puncts))
    feature_row.extend(special_punc_count_para(document, special_puncts))

    data_features.append(feature_row)

  return pd.DataFrame(data_features, columns=frame_cols)


## Model

In [None]:
# Shell command: install ruptures, used below for change-point detection (rpt.Pelt).
!pip install ruptures
import ruptures as rpt

In [None]:
import ast
import matplotlib.pyplot as plt

def predict_timeline_mix(data, agreement_threshold=0.2, count_threshold=1):
  """Label each timeline by how many features show a change point.

  For every row, ``value.text`` is parsed with ``ast.literal_eval`` and loaded
  into a one-column (``text``) DataFrame, features are extracted with
  ``get_features`` and a PELT (``rbf``) change-point search is run on each
  feature column. A feature votes for a change when ``predict`` returns more
  than ``count_threshold`` breakpoints. If the fraction of voting features
  exceeds ``agreement_threshold`` the row gets ``label = 0`` / ``llr = -1``,
  otherwise ``label = 1`` / ``llr = 1``.

  Precision, recall and F1 are computed once after the loop with zero-division
  guards (previously they were recomputed on every iteration and raised
  ``ZeroDivisionError`` whenever no positive had been seen yet).

  Args:
    data: DataFrame with a ``text`` column (a Python-literal string) and a
      ``label`` column compared against 0 and 1.
    agreement_threshold: Minimum fraction of features (exclusive) that must
      report a change for the row to be labelled 0.
    count_threshold: A feature reports a change when the number of returned
      breakpoints is strictly greater than this value.

  Returns:
    ``(results_frame, accuracy)`` where ``results_frame`` has the columns
    ``index`` and ``llr`` and ``accuracy`` is a percentage (0.0 when no row
    was scored).
  """
  tp = tn = fn = fp = 0

  results_csv = []
  results_cols = ["index", "llr"]

  for index, value in enumerate(tqdm(data.itertuples(), total=len(data))):

    output = ast.literal_eval(value.text)
    frame = pd.DataFrame(output, columns = ["text"])
    timeline_features = get_features(frame)
    feature_cols = timeline_features.columns

    change_point_feature_count = 0

    for col in feature_cols:

      ts1 = timeline_features[[col]].values

      cpd_algo = rpt.Pelt(model="rbf").fit(ts1)
      change_locations = cpd_algo.predict(pen=1)

      if len(change_locations) > count_threshold:
        change_point_feature_count += 1

    if float(change_point_feature_count) / len(feature_cols) > agreement_threshold:
      llr = -1
      label = 0
    else:
      llr = 1
      label = 1

    results_csv.append([index, llr])

    # Confusion-matrix update; rows whose label is neither 0 nor 1 are not counted.
    truth = value.label
    if label == truth:
      if truth == 1:
        tp += 1
      elif truth == 0:
        tn += 1
    else:
      if truth == 1:
        fn += 1
      elif truth == 0:
        fp += 1

  recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
  precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
  if precision + recall:
    f1_score = 2 * precision * recall / (precision + recall)
  else:
    f1_score = 0.0

  print('TP: %d' % (
      tp))
  print('TN: %d' % (
      tn))
  print('FP: %d' % (
      fp))
  print('FN: %d' % (
      fn))
  print('Precision: %.4f' % precision)
  print('Recall: %.4f' % recall)
  print('F1: %.4f' % f1_score)

  scored = tp + tn + fp + fn
  accuracy = 100 * (tp + tn) / scored if scored else 0.0

  print(accuracy)

  results_frame = pd.DataFrame(results_csv, columns=results_cols)

  return results_frame, accuracy

In [None]:
def most_frequent(List):
    """Return the most common element of ``List``.

    Counting is done in a single pass with ``Counter`` instead of calling
    ``List.count`` once per distinct element. When several elements share the
    highest count, the one encountered first in ``List`` is returned (the
    previous implementation depended on set iteration order).

    Args:
        List: Non-empty iterable of hashable items.

    Returns:
        The element with the highest number of occurrences.

    Raises:
        ValueError: If ``List`` is empty.
    """
    counts = Counter(List)
    if not counts:
        raise ValueError("most_frequent() arg is an empty sequence")
    return counts.most_common(1)[0][0]

def predict_changepoint(data, w=0, agreement_threshold=0.2, count_threshold=1):
  """Predict the change-point position of each timeline and score it.

  For every row, ``value.text`` is parsed with ``ast.literal_eval`` and loaded
  into a one-column (``text``) DataFrame, features are extracted with
  ``get_features`` and a PELT (``rbf``) change-point search is run on each
  feature column. Features returning more than ``count_threshold`` breakpoints
  vote with their breakpoints; when the fraction of voting features exceeds
  ``agreement_threshold`` the most frequent breakpoint is the prediction,
  otherwise the prediction is ``-1``.

  Fixes relative to the previous version:
    * ``agreement_threshold`` and ``count_threshold`` are now parameters (they
      were read as undefined names, while the caller already passes them).
    * The hit test used ``||`` (not valid Python) and, read as ``or``, was
      always true; it is now a symmetric window of ``w`` around the true
      position.
    * The last value returned by ``predict`` is the end of the signal rather
      than a change point, so it is excluded from the vote.

  NOTE(review): assumes ``data`` has an ``index`` column holding the true
  change-point position, read here as ``value.index`` — confirm. A ``-1``
  prediction is compared numerically like any other position.

  Args:
    data: DataFrame with a ``text`` column (a Python-literal string) and the
      true position available as ``value.index``.
    w: Half-width of the tolerance window around the true position.
    agreement_threshold: Minimum fraction of features (exclusive) that must
      report a change for a change point to be predicted.
    count_threshold: A feature reports a change when the number of returned
      breakpoints is strictly greater than this value.

  Returns:
    Accuracy as a percentage of rows whose prediction falls inside the window
    (0.0 when ``data`` has no rows).
  """
  hits = 0
  n_rows = 0

  for value in tqdm(data.itertuples(), total=len(data)):

    output = ast.literal_eval(value.text)
    frame = pd.DataFrame(output, columns = ["text"])
    timeline_features = get_features(frame)
    feature_cols = timeline_features.columns

    change_point_feature_count = 0
    change_point_index = []

    for col in feature_cols:

      ts1 = timeline_features[[col]].values

      cpd_algo = rpt.Pelt(model="rbf").fit(ts1)
      change_locations = cpd_algo.predict(pen=1)

      if len(change_locations) > count_threshold:
        change_point_feature_count += 1
        # Drop the trailing end-of-signal marker so it cannot win the vote.
        change_point_index.extend(change_locations[:-1])

    agreement = float(change_point_feature_count) / len(feature_cols)

    if change_point_index and agreement > agreement_threshold:  # There exists a change point
      pred_idx = most_frequent(change_point_index)
    else:
      pred_idx = -1

    true_idx = value.index
    if true_idx - w <= pred_idx <= true_idx + w:
      hits += 1

    n_rows += 1

  accuracy = 100 * hits / n_rows if n_rows else 0.0

  print(accuracy)

  return accuracy

In [None]:
# Load the evaluation set from the working directory.
test_data = pd.read_csv("test.csv")

# Run both evaluations with the same thresholds. The returned values are not
# stored or displayed; each function prints its own metrics.
predict_timeline_mix(test_data, agreement_threshold=0.15, count_threshold=2)
# predict_changepoint must accept agreement_threshold and count_threshold as
# keyword arguments for this call to succeed.
predict_changepoint(test_data, agreement_threshold=0.15, count_threshold=2)