In [None]:
import sys

In [None]:
# True when the `google.colab` module is present in sys.modules; the cells below
# use this flag to decide whether to run the Colab-only setup steps.
IS_GOOGLE_COLAB = 'google.colab' in sys.modules

In [None]:
# Colab only: mount Google Drive and change into the project folder stored there.
if IS_GOOGLE_COLAB:
  from google.colab import drive

  drive.mount('/content/drive')

  # The path is relative, so it resolves against the kernel's current working directory.
  # NOTE(review): re-running this cell after the directory change would resolve the
  # path from the new location — confirm it is meant to run once per session.
  %cd ./drive/MyDrive/behind_the_words

In [None]:
# Colab only: install the packages listed in requirements.txt into the kernel's
# environment, then run TextBlob's corpora download module.
if IS_GOOGLE_COLAB:
  %pip install -q -r requirements.txt
  !python -m textblob.download_corpora

In [None]:
import os
from importlib import reload
from gensim import models
import numpy as np
import xgboost
from sklearn import metrics
import pandas as pd
import shap
from tqdm.auto import tqdm
import spacy
import language_tool_python
from spellchecker import SpellChecker
import json
import time
import tensorflow as tf
import math
tqdm.pandas()

In [None]:
# Project root; the data, metaphor-model and word2vec paths are built from it.
BEHIND_THE_WORDS_DIR = "./"
DATA_DIR = os.path.join(BEHIND_THE_WORDS_DIR, "data")
# Forwarded to MetaphorUsage when it is constructed further down.
USING_GPU = False
# When True (or when running on Colab) load_word2vec is called with the file path only,
# without the extra localhost URL argument.
USE_LOCAL_W2V = False

In [None]:
# Shared NLP tooling; all three objects are handed to rf.RelevantFeatures later on.
_nlp = spacy.load("en_core_web_sm")
_spellchecker = SpellChecker()
_language_tool = language_tool_python.LanguageTool('en-US')

In [None]:
# Add the project's `metaphor/` folder to the module search path.
# `sys` was already imported in the first cell, so this import is redundant but harmless.
# Each run of this cell appends the same entry to sys.path again.
import sys
sys.path.append(os.path.join(BEHIND_THE_WORDS_DIR, 'metaphor/'))

In [None]:
from utils.load_word2vec import load_word2vec

# Location of the word2vec archive inside the project tree.
w2v_model_path = os.path.join(BEHIND_THE_WORDS_DIR, "data/gensim/word2vec-google-news-300.gz")

# On Colab, or when USE_LOCAL_W2V is set, the loader gets the file path only;
# otherwise it additionally receives a localhost URL.
# NOTE(review): presumably the URL points at a locally running service that serves
# the vectors — confirm against utils.load_word2vec.
if IS_GOOGLE_COLAB or USE_LOCAL_W2V:
  word2vec = load_word2vec(w2v_model_path)
else:
  word2vec = load_word2vec(w2v_model_path, "http://127.0.0.1:7070")

In [None]:
import metaphor.metaphor_usage as metaphor_usage
import relevant_features as rf

# Reload the module before instantiating so that edits to it are picked up
# without restarting the kernel.
reload(rf)
relevant_features = rf.RelevantFeatures(_nlp, _spellchecker, _language_tool)

reload(metaphor_usage)
MetaphorUsage = metaphor_usage.MetaphorUsage
# From this line on, the name `metaphor_usage` refers to the MetaphorUsage instance
# and no longer to the imported module.
# NOTE(review): `relevant_features.get_nlp` is passed as an attribute reference and is
# not called here — confirm that MetaphorUsage expects it in that form.
metaphor_usage = MetaphorUsage(relevant_features.get_nlp, USING_GPU)
metaphor_usage.load_model(os.path.join(BEHIND_THE_WORDS_DIR, "metaphor/models/metaphor_usage_model.pt"))
metaphor_usage.load_word2vec(word2vec)

# Give the feature extractor access to the configured metaphor model.
relevant_features.set_metaphor_usage_instance(metaphor_usage)

In [None]:
import loader.JSONLDataloader as JSONLDataloader

# Reload so that edits to the loader module are picked up without restarting the kernel.
reload(JSONLDataloader)

# Dataloader constructed with DATA_DIR; used below to fetch the `essayforum` and `own` datasets.
dataset_jsonl_dataloader = JSONLDataloader.JSONLDataloader(DATA_DIR)

In [None]:
def make_rf_dataset(rf_dataset_raw):
    """Convert a raw feature table into a NumPy array of feature rows.

    ``rf_dataset_raw`` is turned into a list of records with
    ``to_dict(orient='records')``; every record is passed through
    ``relevant_features.from_json`` and the resulting rows are stacked into a
    single array. Progress is shown with a tqdm bar.
    """
    records = rf_dataset_raw.to_dict(orient='records')
    rows = []
    for record in tqdm(records):
        rows.append(relevant_features.from_json(record).tolist())
    return np.array(rows)

def get_feature_names():
  """Return the feature names reported by the shared ``relevant_features`` instance."""
  return relevant_features.get_feature_names()


In [None]:
# Fetch both corpora from the dataloader; each result is unpacked as (train, test, valid).
# The "real" splits come from the `essayforum` dataset and the "fake" splits from `own`.
# NOTE(review): `skip` / `take` are presumably a record offset and limit — confirm
# against JSONLDataloader.get.
rf_real_train, rf_real_test, rf_real_valid = dataset_jsonl_dataloader.get("essayforum", folders=["processed-v1", "rf", "features"], skip=0, take=36000)
rf_fake_train, rf_fake_test, rf_fake_valid = dataset_jsonl_dataloader.get("own", folders=["processed-v1", "rf","features"], skip=0, take=36000)

In [None]:
def process_datasets(real_datasets, fake_datasets):
  """Build the feature matrix and label vector from real and fake datasets.

  Args:
    real_datasets: iterable of raw feature tables labelled 0; ``None`` entries are skipped.
    fake_datasets: iterable of raw feature tables labelled 1; ``None`` entries are skipped.

  Returns:
    A tuple ``(x, y)``: ``x`` holds the featurized rows of all real datasets
    followed by those of all fake datasets, and ``y`` holds the matching labels
    (0.0 for real rows, 1.0 for fake rows).

  Raises:
    ValueError: if either argument contains no dataset other than ``None``.
  """
  real_datasets = [dataset for dataset in real_datasets if dataset is not None]
  fake_datasets = [dataset for dataset in fake_datasets if dataset is not None]

  if not real_datasets or not fake_datasets:
    raise ValueError("process_datasets needs at least one real and one fake dataset that is not None")

  # Featurize every dataset that was passed in, not only the first one, so that
  # callers can combine several sources per class.
  real = np.concatenate([make_rf_dataset(dataset) for dataset in real_datasets], axis=0)
  fake = np.concatenate([make_rf_dataset(dataset) for dataset in fake_datasets], axis=0)

  x = np.concatenate([real, fake], axis=0)
  y = np.concatenate([np.zeros(len(real)), np.ones(len(fake))], axis=0)

  return x, y

# Default for make_dataset: False means the data is wrapped in an xgboost.DMatrix.
IS_SCIKIT_LEARN_API = False

def make_dataset(X, y=None, is_scikit_learn_api = IS_SCIKIT_LEARN_API):
  """Wrap features (and optional labels) in the container the model API expects.

  Args:
    X: feature data.
    y: optional labels; only used when a DMatrix is built.
    is_scikit_learn_api: when truthy, ``X`` is returned unchanged; otherwise an
      ``xgboost.DMatrix`` built from ``X`` with ``label=y`` is returned.
  """
  if not is_scikit_learn_api:
    return xgboost.DMatrix(X, label=y)
  return X

In [None]:
# Featurize each split; process_datasets labels real rows 0 and fake rows 1.
train_x, y_train = process_datasets([rf_real_train], [rf_fake_train])
test_x, y_test = process_datasets([rf_real_test], [rf_fake_test])
valid_x, y_valid = process_datasets([rf_real_valid], [rf_fake_valid])

# Sanity check: print the array shape of each split.
print(train_x.shape)
print(test_x.shape)
print(valid_x.shape)

# Wrap the arrays with make_dataset (an xgboost.DMatrix while IS_SCIKIT_LEARN_API is False).
dataset_train = make_dataset(X=train_x, y=y_train)
dataset_test = make_dataset(X=test_x, y=y_test)
dataset_valid = make_dataset(X=valid_x, y=y_valid)

In [None]:
# Evaluation sets scored after every boosting round. With several entries,
# XGBoost uses the last one (here: validation) for early stopping.
watchlist = [
  (dataset_train, 'train'),
  (dataset_valid, 'validation'),
]
params = {
  'max_depth': 14,
  'eta': 0.3,
  'num_class': 1,
  'objective': 'binary:logistic',
}
# Train for at most 100 rounds, stopping once the validation metric has not
# improved for 15 consecutive rounds.
model = xgboost.train(
  params,
  dataset_train,
  num_boost_round=100,
  evals=watchlist,
  early_stopping_rounds=15,
)

In [None]:
from utils.dir import make_dir

# The file name embeds the current Unix time in whole seconds, so each run gets its own file name.
# NOTE(review): make_dir presumably creates any missing parent folders and returns
# the path it was given — confirm against utils.dir.
MODEL_PATH = make_dir(f'./models/rf/model-{int(time.time())}.json')
model.save_model(MODEL_PATH)
print(f"[XGBoost] Saved at {MODEL_PATH}")

In [None]:
# Score the test split with the trained model.
result = model.predict(dataset_test)

# Turn the scores into hard labels: 1 when the score is at least 0.5, otherwise 0.
y_preds = [1 if score >= 0.5 else 0 for score in result.tolist()]

print("Model: ", "XGBoost")
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_preds))
print("Classification Report:\n", metrics.classification_report(y_test, y_preds))
print("Accuracy: %.2f%%" % (100.0 * metrics.accuracy_score(y_test, y_preds)))

In [None]:
def predict(txt, show_explanation=False):
  """Score a single text with the trained model.

  Args:
    txt: the text to score; it is featurized with ``relevant_features.get``.
    show_explanation: when True, additionally print the score as a percentage,
      the shape of the feature vector and the raw feature values, and draw a
      SHAP decision plot for this prediction.

  Returns:
    The first element of ``model.predict`` for the text (the value reported as
    "Probability of being AI" when the explanation is shown).
  """
  _inputs = relevant_features.get(txt)

  result = model.predict(make_dataset([_inputs]))

  if show_explanation:
    print(f"Probability of being AI: {round(result[0] * 100, 2)}%")
    print(_inputs.shape)

    _json = relevant_features.get_features_raw(txt)
    for k, v in _json.items():
      print(k, v)

    feature_names = get_feature_names()

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(np.array([_inputs]))

    # The slice stop is exclusive, so it has to reach one position past the
    # first feature (-(n + 1)) for all n features to be drawn; a stop of -n
    # would leave one feature out of the plot.
    show_all_features = slice(-1, -(len(feature_names) + 1), -1)

    shap.decision_plot(explainer.expected_value, shap_values, _inputs, feature_names, link="logit", feature_display_range=show_all_features)

  return result[0]

# Human Essays
_essayforum = """The dreaded "White Ring of Death" emerged in my shrimp colony, and an eerie sense of unease washed over me. As a wide-eyed kid, I was captivated by underwater worlds, spending days creating intricate aquascapes and setting up ecosystems. My childhood fascination with aquatic life led me to own a cherry shrimp business, sharing these vibrant creatures with the world. Even though I have been tinkering with aquariums for about a decade, I have only developed a business recently. Using my knowledge in this field, I set my water parameters so that my shrimp lived their lives with faster metabolisms, this sped up their breeding cycle and allowed my colony to grow promptly. Following my first six sales, I noticed a pale flimsy shrimp in my sand. This was worrisome as I had bred high-grade sakura shrimp with a bright red color, but to my demise, my shrimp had died! After further inspection, I could see a white ring around its torso, I had never seen this before and knew something was wrong. With some research, I identified a lack of general and carbonate hardness. My solution was to introduce crushed coral, I chose this aid as it releases minerals into the water, resulting in stronger general and carbonate hardness. In the following weeks, a few more shrimps had passed. But the White ring of death was eradicated. This experience highlighted the importance of adaptability, effective management, and how to develop a systematic method of problem-solving."""
_human1 = """I have been in the situation where I almost question God why life is so unfair. I feel so down and I do not know what's been going on anymore, I don't know what to do in every single day. I'm just thinking how to survive in a day and one day I woke up and realized that this is not me anymore. Yes, I have been through a lot but that is not a reason to give up. Luckily, I was surrounded by good people who always gave me an advice and told me that I'm worth it and that was the time I learned to love myself and know my worth as a person. My self-worth is like a peso coin, it gives me something that can make me feel happy even in the smallest thing. I am known to be a jolly person and I want to get back at it, so I start to stand up and freshen up and decide to remove all the negativities and insecurities from myself and with other people. My mind is starting to occupy itself with positivity that can empower my inner self to know her worth and oppose my other self that told me that I wasn't enough. As of the moment, I start doing what would make me really happy. I'm a flawed and imperfect person, but not makes me an unworthy person. I am worth it because I'm contented, will always be proud of myself, and someone's believes in me to continue striving, growing and glowing."""
_human2 = """Many among us think that we are in the era of globalization when in fact the world was moving towards globalization since then. It's not new for us, thus we can't call it a new phenomenon. Numerous people, especially us, the students maybe did not recognize the effects of globalization in terms of the importance of education. We all know that through education, people will achieve their every goal they set for their lives. The gift of learning opens new doors to a great world that will lead us soon into a success. But, as the technology continously innovates and grow, students who are not fortunate enough to have an education in a much challenging way will be facing difficulties in their life journey as a student. As everyone knows that Philippines is included to the what we called the "developing countries" which means it is not yet fully develop because of the low budget allocated for the education. So, while others who was included in developed countries have been acquiring the use of high-technology for education purposes, here I am being left behind. After I graduate here in the Philippines, I'm planning to work abroad (specifically in the developed countries) as Forensic Specialist and it analyzes the evidences in laboratory which uses a high-technology electronic devices. Being left behind in the usage of technology in education purposes because of globalization will affect me today and in the future.  In summarizes, one nation should give attention and allocate a much higher budget in education because we can't deny the fact that we are in the process of technological innovation, so for us to have a brighter future, we must create a kind of globalization that works for everyone."""
_human3 = """All of us are unique. All of us have their own strengths and weaknesses. You may be good at doing these things but not this one or vice versa. One of my strengths is people skills, I'm easy to be friends with almost anyone and I also talk with them in a very polite way. My honesty is also one of my greatest strengths, being honest with everyone is also a plus point for me because I know I didn't cheat and lied on and I only have myself until the end. And lastly, I am a hardworking person. As long as I can, I will do it as soon as possible. I do it all for my family because they are my real strengths and they are the one who motivates and inspires me to do good for my future. One of my weaknesses is speaking in public, although I tried already speaking in front of many people still I stand with lack of confidence in myself. Another weakness that I have is I am mentally unstable like I easily get breakdowns and got trust issues. But above all, all those strengths and weaknesses are both my weapons in reaching my dream self. I'm currently working with my weaknesses to lessen and make it my strengths. I know that I can because with God everything is possible."""
_human4 = """One characteristic of a coin is its luster. Just like a coin, I shine. I shine in my own way, I shine in every single thing I do. It may be good or bad outcomes as long as I did my best, it is okay. I do love Nadine Lustre, the ambassador of lustrous makeup brand, but still with or without makeup, she shines and I shine and that is what makes me become a lustrous, like a shining shimmering gem that you could ever see. Another coin is a circle in shape, whenever I do something, I give it my best, expect me that my whole attention and love for what I'm doing is in. It's not totally have a perfect result like a coin but it almost."""

# AI Essays
_short = """In conclusion, blockchain technology represents a paradigm shift in how we manage data, conduct transactions, and establish trust. Its decentralized and transparent nature has the potential to disrupt various industries, unlocking new possibilities and efficiencies. As the technology continues to evolve, addressing challenges and adapting to regulatory changes will be crucial in realizing the full potential of blockchain and ushering in a new era of trust and transparency in the digital age. The journey of blockchain is not only a technological evolution but a transformative force shaping the future of our interconnected world."""
_long = """Smart contracts, another innovation facilitated by blockchain, automate and enforce contractual agreements. These self-executing contracts have their terms encoded in code, eliminating the need for intermediaries and streamlining processes. The potential applications of smart contracts span various fields, from legal agreements to automated business processes, contributing to increased efficiency and reduced risk of errors. The impact of blockchain extends across diverse industries. In finance, blockchain has the potential to revolutionize cross-border payments, reduce fraud, and increase the efficiency of transactions. Supply chain management benefits from the transparency and traceability offered by blockchain, enhancing accountability and reducing the risk of counterfeit products. Healthcare, voting systems, and real estate are just a few examples of industries exploring blockchain applications to address challenges related to security, transparency, and trust. However, the adoption of blockchain is not without challenges. Scalability, energy consumption, and regulatory uncertainties are areas that require ongoing attention and development. Efforts are underway to address these issues, with the development of scaling solutions and more energy-efficient consensus algorithms. In conclusion, blockchain technology represents a paradigm shift in how we manage data, conduct transactions, and establish trust. Its decentralized and transparent nature has the potential to disrupt various industries, unlocking new possibilities and efficiencies. As the technology continues to evolve, addressing challenges and adapting to regulatory changes will be crucial in realizing the full potential of blockchain and ushering in a new era of trust and transparency in the digital age. The journey of blockchain is not only a technological evolution but a transformative force shaping the future of our interconnected world."""
_dogs = """In the tapestry of human existence, the bond between humans and dogs stands out as one of unparalleled depth and significance. Two dogs, Sky and Cloud, each brought a unique charm to their respective families, embodying the diverse roles that dogs play in our lives. Sky, the majestic German Shepherd, became an unwavering guardian for the Johnson family. His imposing presence and acute senses made him a reliable protector, and his loyalty endeared him to every member of the family. From joyful romps in the backyard to quiet nights standing watch, Sky became an integral part of the Johnsons' lives, providing not only security but also companionship and joy. On the other side of the spectrum was Cloud, a playful Border Collie with a spirit as bright as his snowy white coat. In the Anderson household, Cloud's boundless enthusiasm and love for play brought laughter and smiles to everyone. His favorite pastime, chasing butterflies in the backyard, showcased his joy for life. Beyond the playfulness, Cloud also served as a source of comfort and emotional support. His ability to sense and respond to the emotions of the Anderson family created a unique bond, demonstrating the profound impact dogs can have on our emotional well-being. Despite belonging to different families, the paths of Sky and Cloud intersected at the local park, highlighting the universal qualities that dogs bring into our lives. Sky's watchful gaze and Cloud's exuberant playfulness created a harmonious balance, representing the diverse roles that dogs play as both guardians and sources of joy. These chance encounters at the park showcased the interconnectedness of human-canine relationships and the shared experiences that bind dog owners together. In the tales of Sky and Cloud, we find a reflection of the multifaceted nature of the canine-human connection. Dogs, with their loyalty, love, and unique personalities, enrich our lives in ways that extend beyond mere companionship. 
Sky and Cloud, each with their distinctive qualities, exemplify the diverse roles that dogs play as guardians, playmates, and sources of emotional support. Through their heartwarming stories, we are reminded of the enduring bond that exists between humans and their four-legged friends, a bond that adds depth, joy, and meaning to the human experience."""
_gpt = """Title: The Art and Science of Programming Introduction: Programming, often described as the art and science of instructing computers to perform specific tasks, has become an indispensable tool in our technologically-driven world. It is a creative and logical process that empowers individuals to design, build, and optimize software, which in turn shapes the digital landscape. This essay explores the multifaceted nature of programming, delving into its history, significance, and the skills required to master this intricate craft. Historical Evolution: The roots of programming can be traced back to the mid-19th century when mathematician Ada Lovelace conceptualized the idea of a machine that could perform any intellectual task. However, the true genesis of modern programming can be attributed to the invention of electronic computers in the mid-20th century. Pioneers like Alan Turing and Grace Hopper laid the foundation for the development of programming languages and compilers, setting the stage for the digital revolution. The Significance of Programming: Programming is the cornerstone of software development, enabling the creation of applications that power our daily lives. From mobile apps and web development to complex systems like operating systems and artificial intelligence, programming is the driving force behind technological innovation. In essence, it is the language through which humans communicate with machines, giving them the ability to execute precise and intricate tasks. Creativity in Code: Contrary to the common misconception that programming is a purely analytical and logical endeavor, it is, at its core, a highly creative process. Writing code is akin to crafting a symphony, with lines of code serving as musical notes that, when orchestrated skillfully, produce a harmonious and functional piece of software. Programmers are akin to artists, using logic and syntax as their brushes to paint digital masterpieces. 
Problem-Solving and Critical Thinking: Programming fosters a unique approach to problem-solving and critical thinking. Programmers encounter challenges that require meticulous analysis, logical reasoning, and the ability to break down complex problems into manageable components. The iterative nature of coding encourages continuous refinement and optimization, teaching individuals to approach problem-solving with resilience and adaptability. Diversity of Programming Languages: The programming landscape is vast and diverse, offering a plethora of programming languages tailored for specific tasks and preferences. From the versatility of Python and the efficiency of C++ to the web-centric nature of JavaScript, programmers have a rich palette to choose from. Each language has its strengths and weaknesses, catering to different aspects of software development and contributing to the richness of the programming ecosystem. Continuous Learning and Adaptation: Programming is a dynamic field that demands continuous learning and adaptation. The rapid evolution of technology requires programmers to stay abreast of the latest developments, frameworks, and best practices. This perpetual learning process is not only a necessity but also an opportunity for personal and professional growth, making programming a field that rewards curiosity and a thirst for knowledge. Conclusion: In conclusion, programming is a multifaceted discipline that combines creativity, logic, and problem-solving skills. It has evolved from the early days of punch cards and assembly languages to the diverse and dynamic ecosystem of programming languages and frameworks we have today. As our world becomes increasingly reliant on technology, the role of programming in shaping our digital future becomes ever more crucial. It is an art form that empowers individuals to bring their ideas to life, solve complex problems, and contribute to the continual progress of our interconnected world. 
In mastering the craft of programming, individuals not only become proficient coders but also architects of the digital landscape that defines our modern era."""
_test = """Okay, imagine you have a big box of colorful building blocks. JavaScript is like playing with these blocks without worrying too much about the shape or color. You can build cool things, but sometimes it might get a bit confusing if you mix things up. Now, TypeScript is like having another friend who helps you organize these blocks. Before you start playing, you both decide what kind of blocks you'll use and how you'll use them. This way, your friend (TypeScript) makes sure you don't mix up the wrong blocks, and if you make a mistake, it tells you before you start building. So, JavaScript lets you play freely with your building blocks, while TypeScript helps you play in a more organized way by deciding what types of blocks you'll use and making sure everything fits together nicely."""

# Score every sample essay; each line shows the variable name followed by the model output.
print("HUMAN")
for _name, _essay in (
  ("_essayforum", _essayforum),
  ("_human1", _human1),
  ("_human2", _human2),
  ("_human3", _human3),
  ("_human4", _human4),
):
  print(_name, predict(_essay))
print("")
print("AI")
for _name, _essay in (
  ("_short", _short),
  ("_long", _long),
  ("_dogs", _dogs),
  ("_gpt", _gpt),
  ("_test", _test),
):
  print(_name, predict(_essay))

In [None]:
def explain_model():
  """Draw a SHAP beeswarm plot for the model over the training features.

  The explainer is built from ``model`` with a copy of ``train_x`` as its data
  and is then evaluated on that same array. Features are shown in their
  original index order, and ``max_display`` is set to the feature count plus one.
  """
  names = get_feature_names()
  background = np.concatenate([train_x])

  explainer = shap.Explainer(model, background, feature_names=names)

  shap.plots.beeswarm(
    explainer(background),
    order=range(len(names)),
    max_display=len(names) + 1,
  )

explain_model()

In [None]:
def model_correlation():
  """Plot a heatmap of absolute correlations between per-feature SHAP values.

  SHAP values are computed with a TreeExplainer on ``dataset_test``. Rows and
  columns of the heatmap are ordered from the largest to the smallest mean
  absolute SHAP value.
  """
  import matplotlib.pyplot as plt
  import seaborn as sns

  shap_df = pd.DataFrame(
    shap.TreeExplainer(model).shap_values(dataset_test),
    columns=pd.Index(get_feature_names(), name='features'),
  )

  plt.figure(figsize=(9,7), dpi=200)

  # Sort ascending and then reverse, so the most influential features come first.
  mean_abs_shap = shap_df.abs().mean()
  feat_order = list(reversed(mean_abs_shap.sort_values().index.tolist()))

  abs_corr = shap_df.corr().abs()
  sns.heatmap(abs_corr.loc[feat_order, feat_order])

model_correlation()