# Init

Add the following variables:

- sentiment
- objectivity
- complexity
- grammar errors

# Sentiment

Use the built-in `pipeline` from the `transformers` library

In [2]:
import os  # required by os.chdir below; this cell previously used `os` without importing it

import datatable as dt

from datatable import f, update
from transformers import pipeline
from utilpy import sv, ld

# Run from the project root.
# NOTE(review): the bare names passed to `ld`/`sv` later on are presumably
# resolved relative to this directory -- confirm against utilpy.
WORK_DIR = '/home/yu/OneDrive/Construal'
os.chdir(WORK_DIR)

In [3]:
# create pipeline
#   - The checkpoint is pinned explicitly so results do not silently change if
#     the library's default for "sentiment-analysis" changes. This is the same
#     checkpoint transformers reported falling back to when no model was supplied.
#   - device=0 is passed through to `pipeline` unchanged
#     (NOTE(review): presumably selects the first GPU -- confirm in the transformers docs).
SENTIMENT_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier = pipeline("sentiment-analysis", model=SENTIMENT_MODEL, device=0)

# load TEXT data (the `summary`, `title` and `pid` columns of pjson)
#   - `.to_list()[0]` takes the first element of the list returned for each
#     single-column selection (assumes `ld` returns a datatable Frame -- TODO confirm).
pjson = ld('pjson')
summary = pjson['summary'].to_list()[0]
title = pjson['title'].to_list()[0]
pids = pjson['pid'].to_list()[0]

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


"pjson.feather" (167.5 MB) loaded (<1s) (2022-01-15 7:23 PM)


In [None]:
# Classify every summary and every title; each result list is wrapped in a Frame.
summary_sentiment = dt.Frame(classifier(summary))
title_sentiment = dt.Frame(classifier(title))

# Add the project ids to both result frames as a `pid` column.
for sentiment_frame in (summary_sentiment, title_sentiment):
    sentiment_frame[:, update(pid=dt.Frame(pids))]

# Persist the two frames under the names the later merge step loads.
sv(summary_sentiment, 'summary_sentiment')
sv(title_sentiment, 'title_sentiment')

# Readability

In [3]:
# `import concurrent` alone does not load the `futures` submodule; importing it
# explicitly guarantees `concurrent.futures.ProcessPoolExecutor` resolves on a
# fresh kernel instead of depending on another package having imported it.
import concurrent.futures
import datatable as dt
import os

from datatable import f, update
from tqdm.auto import tqdm
from utilpy import sv, ld

# Run from the project root.
# NOTE(review): the bare names passed to `ld`/`sv` later on are presumably
# resolved relative to this directory -- confirm against utilpy.
WORK_DIR = '/home/yu/OneDrive/Construal'
os.chdir(WORK_DIR)

## spelling/grammar errors

In [10]:
# Load the texts to check (the `project_desc` and `title` columns of pjson)
# together with the matching project ids.
pjson = ld('pjson')
project_desc, title, pids = (
    pjson[column].to_list()[0] for column in ('project_desc', 'title', 'pid')
)

# One shared US-English LanguageTool instance, used by get_n_errors.
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')

# only select spelling and grammar errors
def get_n_errors(text, pid):
    """Count the spelling and grammar matches that `tool` reports for `text`.

    Args:
        text: the text passed to `tool.check`.
        pid: identifier returned unchanged as the first tuple element.

    Returns:
        A tuple `(pid, n_spelling_errors, n_grammar_errors)`, where matches with
        category 'TYPOS' count as spelling errors and matches with category
        'GRAMMAR' count as grammar errors. All other categories are ignored.
    """
    n_spelling_errors = 0
    n_grammar_errors = 0
    for match in tool.check(text):
        if match.category == 'TYPOS':
            n_spelling_errors += 1
        elif match.category == 'GRAMMAR':
            n_grammar_errors += 1
    return (pid, n_spelling_errors, n_grammar_errors)

ERROR_COLUMNS = ['pid', 'n_spelling_errors', 'n_grammar_errors']

# Fan the checks out over worker processes. `map` pairs each text with the pid
# at the same position and yields the result tuples in input order.
with concurrent.futures.ProcessPoolExecutor() as pool:
    desc_rows = list(pool.map(get_n_errors, project_desc, pids))
    title_rows = list(pool.map(get_n_errors, title, pids))

# One frame per text field, with identical column layout.
grammar_error_desc = dt.Frame(desc_rows, names=ERROR_COLUMNS)
grammar_error_title = dt.Frame(title_rows, names=ERROR_COLUMNS)

sv(grammar_error_desc, 'grammar_error_desc')
sv(grammar_error_title, 'grammar_error_title')

"pjson.feather" (167.5 MB) loaded (<1s) (2022-01-15 6:17 PM)
Saved as "grammar_error_desc.feather" (618.3 KB) (<1s) (2022-01-15 6:54 PM)
Saved as "grammar_error_title.feather" (544.1 KB) (<1s) (2022-01-15 6:54 PM)


## Fog index

> There is *NO* Fog index for titles, because the Fog calculation requires at least 100 words of text.

In [4]:
# pip install py-readability-metrics

from readability import Readability

# Load the texts to score (the `project_desc` and `title` columns of pjson)
# together with the matching project ids.
pjson = ld('pjson')
project_desc, title, pids = (
    pjson[column].to_list()[0] for column in ('project_desc', 'title', 'pid')
)

def get_fog_score(text, pid):
    """Compute the Gunning Fog score of `text`, best-effort.

    Args:
        text: the text handed to `Readability`.
        pid: identifier returned unchanged as the first tuple element.

    Returns:
        A tuple `(pid, fog)`. `fog` is the `.score` of
        `Readability(text).gunning_fog()`, or None when scoring raises an
        ordinary exception (e.g. the text is too short to be scored).
    """
    try:
        fog = Readability(text).gunning_fog().score
    except Exception:
        # Keep the best-effort behaviour for scoring failures, but no longer
        # swallow KeyboardInterrupt/SystemExit as the bare `except:` did.
        fog = None
    return (pid, fog)

# Only descriptions are scored. The previous version also mapped get_fog_score
# over every title, but that result was never turned into a frame or saved, so
# the pass was pure wasted work and has been removed.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # `map` pairs each description with the pid at the same position.
    fog_desc = list(executor.map(get_fog_score, project_desc, pids))

# One row per project; `fog_score` is missing where scoring failed.
fog_desc = dt.Frame(fog_desc, names=['pid', 'fog_score'])

sv(fog_desc, 'fog_desc')

"pjson.feather" (167.5 MB) loaded (<1s) (2022-01-15 7:01 PM)
Saved as "fog_desc.feather" (815.5 KB) (<1s) (2022-01-15 7:02 PM)
Saved as "fog_title.feather" (502.1 KB) (<1s) (2022-01-15 7:02 PM)


# Objectivity

> See "train-objectivity.py" and "pred-objectivity.py" for model training and prediction

In [33]:
# NOTE: this whole cell is a single triple-quoted string literal, so none of
# the code inside it is executed. It is kept only as a record of the earlier
# approach to collecting objectivity predictions.
# NOTE(review): the disabled code calls pickle.load; if it is ever re-enabled,
# only use it on trusted files.
'''Dreprecated
import pickle
import datatable as dt

from datatable import f
from pyarrow.feather import write_feather, read_feather

with open('/home/yu/OneDrive/Construal/data/objectivity/objectivity-predictions-avg-chunk.pkl', 'rb') as file:
    objectivity = pickle.load(file)

pids = []
ys = []
for pid, y in objectivity:
    pids.extend(pid)
    ys.extend(y)

out_objectivity = dt.Frame(pid=pids, objectivity=ys)
sv('out_objectivity', path='/home/yu/OneDrive/Construal/data')
'''

"out_objectivity" saved as "out_objectivity.feather" (502.0 KB) (<1s) (2021-10-26 10:44 PM)


# Merge all data (R)

In [1]:
# Attach the utilr helpers without package start-up messages, then switch to
# the project root so the relative paths used below resolve.
suppressMessages(library(utilr))
WORK_DIR <- '/home/yu/OneDrive/Construal'
setwd(WORK_DIR)

Yu's data science toolbox loaded! 


In [3]:
# Load the three title-level inputs used by the merge below.
# NOTE(review): `ld` appears to create an object named after the file, or after
# `ldname` when given (the merge refers to `title_sentiment`,
# `grammar_error_title` and `obj_title`) -- confirm against utilr.
ld('title_sentiment')
ld('grammar_error_title')
ld('objectivity-predictions_title', ldname='obj_title', path='./data/objectivity')

title_sentiment (954.2 KB) already loaded, will NOT load again! (0 secs) (2022-01-15 7:26 PM)
"grammar_error_title.feather" (544.1 KB) loaded (0.01 secs) (2022-01-15 7:26 PM)


In [25]:
# Keep pid plus the sentiment columns, renamed so they stay unambiguous once
# the other title-level measures are joined on.
sentiment_cols <- title_sentiment[, .(pid, sentiment_label = label, sentiment_score = score)]

# Join on pid; `nomatch = NULL` drops rows that have no match, so only pids
# present in every table survive.
sentiment_errors <- sentiment_cols[grammar_error_title, on = .(pid), nomatch = NULL]
text_additional <- sentiment_errors[obj_title, on = .(pid), nomatch = NULL]

fwrite(text_additional, './data/sharing/text_additional.csv')

In [26]:
# Display the first row as a quick check of the merged columns.
text_additional[1]

pid,sentiment_label,sentiment_score,n_spelling_errors,n_grammar_errors,objectivity
<chr>,<chr>,<dbl>,<int>,<int>,<int>
1000064918,NEGATIVE,0.6155048,0,0,0
