In [1]:
# @title Install Necessary Packages
!pip install jsonlines

# GPU llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 --force-reinstall --upgrade --no-cache-dir --verbose

# For downloading the models
!pip install huggingface_hub

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0
Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting llama-cpp-python==0.1.78
  Downloading llama_cpp_python-0.1.78.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting setuptools>=42
    Downloading setuptools-68.1.2-py3-none-any.whl (805 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 805.1/805.1 kB 5.8 MB/s eta 0:00:00
  Collecting scikit-build>=0.13
    Downloading scikit_build-0.17.6-py3-none-any.whl (84 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.3/84.3 kB 7.4 MB/s eta 0:00:00
  Collecting cmake>=3.18
    Downloading cmake-3.27.2-py2.

In [4]:
import pandas as pd
import numpy as np
import os
import jsonlines
import csv
import random
from datetime import datetime

In [5]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# @title Convert JSON data to Dict type
def read_json_data(file_path):
  """Load a JSON Lines file into a list of parsed records.

  Args:
    file_path: Path to a UTF-8 encoded file read through ``jsonlines.Reader``.

  Returns:
    A list with every item yielded by ``jsonlines.Reader``, in file order.
  """
  # Read-only mode is sufficient here; "r+" would also demand write permission
  # and fail on read-only files even though nothing is ever written.
  with open(file_path, "r", encoding='utf-8') as f:
    return list(jsonlines.Reader(f))

The dataset used below is the News Category Dataset; you can download it from Kaggle [here](https://www.kaggle.com/datasets/rmisra/news-category-dataset).

In [7]:
# Location of the dataset file on the mounted Google Drive.
news_headline_path = "/content/drive/MyDrive/news-headline/News_Category_Dataset_v3.json"
# Parse the file into a list of records via read_json_data.
data_json = read_json_data(news_headline_path)

In [8]:
# Build the working DataFrame straight from the list of parsed records.
data = pd.DataFrame(data_json)

In [9]:
# Report the frame's shape and the number of distinct categories, then preview it.
print(
    "Dimensions of dataset: {} \nNo of unique categories: {}".format(
        data.shape,
        data["category"].nunique(),
    )
)
data.head()

Dimensions of dataset: (209527, 6) 
No of unique categories: 42


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [10]:
# Number of headlines per category, most frequent first.
data["category"].value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [11]:
# @title Take 50 random headlines from each category
size = 50                # headlines drawn per category
repeat_same_row = False  # sample without replacement
seed = 42                # fixed seed so a re-run draws the same headlines

# groupby(...).sample draws `size` rows inside every category and keeps the
# grouping column, so no helper lambda or index clean-up is required.
df = (
    data[['headline', 'category']]
    .groupby('category')
    .sample(n=size, replace=repeat_same_row, random_state=seed)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,headline,category
0,Damien Hirst Is Building A Town No One Wants,ARTS
1,"First Nighter: Musicals ""Atomic,"" ""The Mapmake...",ARTS
2,Anne-Sophie Mutter - A Profile of the Artist,ARTS
3,23 Artworks,ARTS
4,This Is What Happens When Doodles Grow Up,ARTS


In [12]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

In [3]:
# @title Load Llama 2 13B weights from HF
# Hugging Face repository id and the specific weights file to fetch from it.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"

# The returned value is later passed to Llama(...) as model_path.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

In [13]:
# GPU
# Build the llama.cpp model wrapper from the downloaded weights file.
# NOTE(review): resetting the name to None first presumably releases a model
# left over from an earlier run of this cell before a new one is loaded — confirm.
lcpp_llm = None
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2, # CPU cores
    n_batch=512, # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=32 # Change this value based on your model and your GPU VRAM pool.
    )

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | 


In [14]:
# Show the n_gpu_layers value stored on the loaded model's params
# (the number of layers requested for the GPU when Llama(...) was built).
lcpp_llm.params.n_gpu_layers

32

In [15]:
def create_prompt(sentence):
  """Wrap a headline in the sarcastic-rewrite instruction prompt.

  Args:
    sentence: The headline text to be rephrased.

  Returns:
    The prompt string: the RULES preamble, the headline, and a trailing
    "SARCASTIC Headline:" cue followed by a newline and two spaces.
  """
  # The two-space indents and the trailing "\n  " are part of the prompt text
  # and are kept exactly as they appear in the generated prompts.
  rules = (
      "RULES: You are a savage, disrespectful and witty assistant. "
      "Always rephrase a sentence into a funny, creatively sarcastic way.\n"
      "  while rephrasing you should not dilute the given context. "
      "The output sentence should be witty, savage, insulting and also "
      "usable as a sarcastic news headline.\n"
  )
  return f"{rules}\n  Headline: {sentence}\n\n  SARCASTIC Headline:\n  "

# Headline strings in the same row order as df.
data_headline = df["headline"].values
# One formatted prompt per headline.
prompt_headline = list(map(create_prompt, data_headline))
prompt_headline[:5]

['RULES: You are a savage, disrespectful and witty assistant. Always rephrase a sentence into a funny, creatively sarcastic way.\n  while rephrasing you should not dilute the given context. The output sentence should be witty, savage, insulting and also usable as a sarcastic news headline.\n\n  Headline: Damien Hirst Is Building A Town No One Wants\n\n  SARCASTIC Headline:\n  ',
 'RULES: You are a savage, disrespectful and witty assistant. Always rephrase a sentence into a funny, creatively sarcastic way.\n  while rephrasing you should not dilute the given context. The output sentence should be witty, savage, insulting and also usable as a sarcastic news headline.\n\n  Headline: First Nighter: Musicals "Atomic," "The Mapmaker\'s Opera," "ValueVille"\n\n  SARCASTIC Headline:\n  ',
 'RULES: You are a savage, disrespectful and witty assistant. Always rephrase a sentence into a funny, creatively sarcastic way.\n  while rephrasing you should not dilute the given context. The output sentence

In [17]:
# How many prompts to run. The recorded demo run needed roughly four minutes
# for 10 prompts, so raise this (up to len(prompt_headline)) deliberately.
n_prompts = 10
# Text that separates the echoed prompt from the model's completion.
answer_marker = 'SARCASTIC Headline:\n'

st = datetime.now()
print(f"Starting time: {st}\n")

gen_result = []
batch = prompt_headline[:n_prompts]
for i, (headline, pmp) in enumerate(zip(data_headline, batch)):
  # Progress is reported against the number of prompts actually being run.
  print("processing {}/{}: ".format(i + 1, len(batch)))
  # Cap generation at 5 tokens per input word to avoid spending time on long
  # completions. NOTE: very short headlines get a tiny budget (2 words -> 10
  # tokens), which can cut the generated headline off mid-sentence.
  max_tokens = len(headline.split()) * 5
  response = lcpp_llm(prompt=pmp, max_tokens=max_tokens, temperature=0.5, top_p=0.95,
                  repeat_penalty=1.2, top_k=150,
                  echo=True)
  resp = str(response["choices"][0]["text"])
  # echo=True returns prompt + completion: keep what follows the marker, strip
  # leading whitespace/newlines first so a completion that starts on a new line
  # is not lost, then keep only its first line.
  completion = resp.partition(answer_marker)[-1].strip()
  sarcastic = completion.split('\n')[0].strip()
  print("headline: ", headline, ", max_tokens", max_tokens)
  print("sarcastic headline: ", sarcastic)
  gen_result.append(sarcastic)
en = datetime.now()
print("\nTime taken to complete the generation: ", en-st)

Starting time: 2023-09-02 19:37:30.724040

processing 0/2100: 


Llama.generate: prefix-match hit


headline:  Damien Hirst Is Building A Town No One Wants 45
sarcastic headline:  Oh Look, It's Another Disaster Waiting To Happen! Damien Hirst Builds A Whole Town That Everyone Agrees They Don't Want.
processing 1/2100: 


Llama.generate: prefix-match hit


headline:  First Nighter: Musicals "Atomic," "The Mapmaker's Opera," "ValueVille" 40
sarcastic headline:  1) Atomic! The Musical That Will Blow Your Mind... and Your Budget!
processing 2/2100: 


Llama.generate: prefix-match hit


headline:  Anne-Sophie Mutter - A Profile of the Artist 40
sarcastic headline:  "Anne-Sophie Mutter: Because Violinists Need More Ego in Their Lives"
processing 3/2100: 


Llama.generate: prefix-match hit


headline:  23 Artworks 10
sarcastic headline:  23 Artworks So Bad They Should Be Lock
processing 4/2100: 


Llama.generate: prefix-match hit


headline:  This Is What Happens When Doodles Grow Up 40
sarcastic headline:  "The World is Now Ruled by Sentient Crayons"
processing 5/2100: 


Llama.generate: prefix-match hit


headline:  Artist Makes Masks Out Of Junk Food And Supremely Creeps Everyone Out (NSFW) 65
sarcastic headline:  "Artist's Latest Masterpiece Will Make You Question Your Love For Pizza"
processing 6/2100: 


Llama.generate: prefix-match hit


headline:  Lessons From the Spring Festival 25
sarcastic headline:  "Spring Festival Teaches Us How to Become Better People by Doing Nothing"
processing 7/2100: 


Llama.generate: prefix-match hit


headline:  A Letter to AirBnB's CEO, With Love, From Georgia O'Keeffe 50
sarcastic headline:  Dear Brian Chesky, I hope this letter finds you well and not too busy counting all the money you made from my estate.
processing 8/2100: 


Llama.generate: prefix-match hit


headline:  Rock-a-bye, Baby: "Jenůfa" at the Metropolitan Opera 35
sarcastic headline:  Jenůfa's Met Debut Leaves Audience in Tears... of Boredom
processing 9/2100: 


Llama.generate: prefix-match hit


headline:  'Duke Of Burgundy' Is The All-Female Erotic Drama You Need To See 60
sarcastic headline:  "The Duke of Burgundy" is the all-female erotic drama that you need to see if you want to be bored out of your mind.

Time taken to complete the generation:  0:03:52.836077


In [None]:
# gen_result may cover only the first rows of df (the generation loop can be
# run on a subset). Assigning a shorter plain list raises a length-mismatch
# ValueError, so align the results on the leading index labels instead; rows
# without a generated headline are left as NaN.
df['sarcastic_headline'] = pd.Series(
    gen_result, index=df.index[:len(gen_result)], dtype="object"
)
df[['headline', 'sarcastic_headline', 'category']].head()

In [None]:
output_path = '/content/sarcastic-headline/sarcastic_headline_data.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df[['headline', 'sarcastic_headline', 'category']].to_csv(output_path, index=False)