In [1]:
#
# Check whether cuda is available & the device(s) we have available
#
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_properties(i).name)
else:
    print("cuda not available")

Tesla V100-PCIE-32GB


In [2]:
#
# Install necessary libraries
#

%pip install -U bitsandbytes
%pip install -U peft
%pip install -U trl

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [48]:
#
# Import libraries & disable environment weights/biases tool
#

import pandas as pd
import re
import os

from datasets import load_dataset
from datasets import Dataset
from transformers import LlamaForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

os.environ["WANDB_DISABLED"] = "true" # disable weights and biases tool

In [41]:
import pandas as pd 

df_before = pd.read_csv("data/realDonaldTrump_bf_office.csv", on_bad_lines="skip")
df_during = pd.read_csv("data/realDonaldTrump_in_office.csv", on_bad_lines="skip")

# Combine dataframes & drop the ID column because the ID is always the same
mean_tweets = pd.concat([df_before, df_during], ignore_index=True).drop("id", axis=1)

mean_tweets


Unnamed: 0,time,tweet_url,tweet_text
0,2009-05-04 13:54,https://twitter.com/realDonaldTrump/status/16...,"""Be sure to tune in and watch Donald Trump on..."
1,2009-05-04 20:00,https://twitter.com/realDonaldTrump/status/17...,"""Donald Trump will be appearing on The View t..."
2,2009-05-08 08:38,https://twitter.com/realDonaldTrump/status/17...,"""Donald Trump reads Top Ten Financial Tips on..."
3,2009-05-08 15:40,https://twitter.com/realDonaldTrump/status/17...,"""New Blog Post: Celebrity Apprentice Finale a..."
4,2009-05-12 09:07,https://twitter.com/realDonaldTrump/status/17...,"""'My persona will never be that of a wallflow..."
...,...,...,...
32347,2021-01-06 22:16,https://twitter.com/realDonaldTrump/status/13...,"""Even Mexico uses Voter I.D."""
32348,2021-01-06 23:44,https://twitter.com/realDonaldTrump/status/13...,"""These scoundrels are only toying with the @s..."
32349,2021-01-07 03:38,https://twitter.com/realDonaldTrump/status/13...,"""Please support our Capitol Police and Law En..."
32350,2021-01-07 05:17,https://twitter.com/realDonaldTrump/status/13...,"""https://t.co/Pm2PKV0Fp3"""


In [49]:
#
# Clean the data before prompt engineering
#

text_to_filter = ["Donald Trump", "@realDonaldTrump", "Apprentice", "RT"]
def filter_ad_tweets(text):
    for filt in text_to_filter:
        if filt in text:
            return False
        
    return True

# Remove the beginning and ending quotations
mean_tweets["tweet_text"] = mean_tweets["tweet_text"].str.strip('"')

# Filter tweets that probably aren't mean
mean_tweets = mean_tweets[mean_tweets["tweet_text"].apply(filter_ad_tweets)]

# Replace "‚Äô" with "'"- weird data parsing issue
mean_tweets["tweet_text"] = mean_tweets["tweet_text"].str.replace("‚Äô", "'", case=True, regex=False)

# Remove links from tweet text
mean_tweets["tweet_text"] = mean_tweets["tweet_text"].str.replace(r"http\S+|www\S+", "", case=True, regex=True)

# Remove rows with empty tweet text
mean_tweets = mean_tweets[mean_tweets["tweet_text"].str.strip().astype(bool)]

# mean_tweets.dropna()
mean_tweets

Unnamed: 0,time,tweet_url,tweet_text
4,2009-05-12 09:07,https://twitter.com/realDonaldTrump/status/17...,"""'My persona will never be that of a wallflow..."
6,2009-05-14 11:30,https://twitter.com/realDonaldTrump/status/17...,"""'Strive for wholeness and keep your sense of..."
7,2009-05-15 09:13,https://twitter.com/realDonaldTrump/status/18...,"""Enter the 'Think Like A Champion' signed boo..."
8,2009-05-17 10:00,https://twitter.com/realDonaldTrump/status/18...,"""'Don't be afraid of being unique - it's like..."
9,2009-05-18 09:26,https://twitter.com/realDonaldTrump/status/18...,"""'We win in our lives by having a champion's ..."
...,...,...,...
32343,2021-01-06 13:43,https://twitter.com/realDonaldTrump/status/13...,"""Get smart Republicans. FIGHT!"
32346,2021-01-06 21:06,https://twitter.com/realDonaldTrump/status/13...,"""Sleepy Eyes Chuck Todd is so happy with the ..."
32347,2021-01-06 22:16,https://twitter.com/realDonaldTrump/status/13...,"""Even Mexico uses Voter I.D."
32348,2021-01-06 23:44,https://twitter.com/realDonaldTrump/status/13...,"""These scoundrels are only toying with the @s..."


In [46]:
# TODO: Combine the 'product' and 'category' columns into an 'instruction' column
# The instruction should follow this format: 
# "Create a detailed description for the following product: [product], belonging to category: [category]"
rd_df['instruction'] = "Create a detailed description for the following product: " + rd_df['product'] + ", belonging to category: " + rd_df['category']

# TODO: Create a new DataFrame with only the 'instruction' and 'description' columns
rd_instruction_description_df = rd_df[['instruction', 'description']]

# This is the template for formatting our training data
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

### Response:\n"""

# TODO: Create a 'prompt' column by applying the template to each instruction
rd_instruction_description_df['prompt'] = rd_instruction_description_df['instruction'].apply(lambda x: template.format(x))
rd_instruction_description_df.rename(columns={'description': 'response'}, inplace=True)
rd_instruction_description_df['response'] = rd_instruction_description_df['response'] + "\n### End"

# TODO: Select only the 'prompt' and 'response' columns
rd_instruction_description_df = rd_instruction_description_df[['prompt', 'response']]

# TODO : Combine the 'prompt' and 'response' columns into a single 'text' column
rd_instruction_description_df['text'] = rd_instruction_description_df['prompt'] + rd_instruction_description_df['response']

# TODO : Drop the 'prompt' and 'response' columns from rd_instruction_description_df and reset the index
rd_instruction_description_df = rd_instruction_description_df.drop(columns=['prompt', 'response']).reset_index(drop=True)

# Display the final DataFrame
rd_instruction_description_df

NameError: name 'rd_df' is not defined