In [2]:
import pandas as pd

df = pd.read_csv('trump_data/trumptweets.csv')
print(df.head())

           id                                               link  \
0  1698308935  https://twitter.com/realDonaldTrump/status/169...   
1  1701461182  https://twitter.com/realDonaldTrump/status/170...   
2  1737479987  https://twitter.com/realDonaldTrump/status/173...   
3  1741160716  https://twitter.com/realDonaldTrump/status/174...   
4  1773561338  https://twitter.com/realDonaldTrump/status/177...   

                                             content                 date  \
0  Be sure to tune in and watch Donald Trump on L...  2009-05-04 20:54:25   
1  Donald Trump will be appearing on The View tom...  2009-05-05 03:00:10   
2  Donald Trump reads Top Ten Financial Tips on L...  2009-05-08 15:38:08   
3  New Blog Post: Celebrity Apprentice Finale and...  2009-05-08 22:40:15   
4  "My persona will never be that of a wallflower...  2009-05-12 16:07:28   

   retweets  favorites mentions hashtags  geo  
0       500        868      NaN      NaN  NaN  
1        33        273      NaN 

In [3]:
print(df.columns)

Index(['id', 'link', 'content', 'date', 'retweets', 'favorites', 'mentions',
       'hashtags', 'geo'],
      dtype='object')


In [4]:
max_retweet_row = df.loc[df['retweets'].idxmax()]

# Afficher la ligne
print(max_retweet_row)

id                                          881503147168071680
link         https://twitter.com/realDonaldTrump/status/881...
content         # FraudNewsCNN # FNNpic.twitter.com/WYUnHjjUjg
date                                       2017-07-02 15:21:42
retweets                                                309892
favorites                                               528265
mentions                                                   NaN
hashtags                                                   # #
geo                                                        NaN
Name: 32059, dtype: object


In [5]:
import re
def clean_tweet(tweet):
    # Supprimer les URLs, hashtags, et mentions
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#','', tweet)
    return tweet

In [6]:
df['cleaned_tweet'] = df['content'].apply(clean_tweet)
print(df['cleaned_tweet'].head())
print(len(df))

0    Be sure to tune in and watch Donald Trump on L...
1    Donald Trump will be appearing on The View tom...
2    Donald Trump reads Top Ten Financial Tips on L...
3    New Blog Post: Celebrity Apprentice Finale and...
4    "My persona will never be that of a wallflower...
Name: cleaned_tweet, dtype: object
41122


In [8]:
dataset = df[:]

In [9]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:

# Load the tokenizer and model
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)


# Split the DataFrame into training and validation sets
df_train, df_val = train_test_split(dataset, test_size=0.1, random_state=42)  # 20% for validation

# Prepare the dataset for training and evaluation
train_encodings = tokenizer(df_train['cleaned_tweet'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(df_val['cleaned_tweet'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')

# Create a dataset class
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.encodings['input_ids'][idx])  # Use the same input as labels for language modeling
        }

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create the datasets
train_dataset = TweetDataset(train_encodings)
val_dataset = TweetDataset(val_encodings)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./fine_tuned_llama_trump',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    fp16=True
)

# Initialize the Trainer with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Provide the evaluation dataset here
)

# Start training
print("Training [...]")
trainer.train()
print("Train finished")

# Save the fine-tuned model and tokenizer
print("Saving [...]")
model.save_pretrained("./fine_tuned_llama_trump")
tokenizer.save_pretrained("./fine_tuned_llama_trump")




Training [...]


  0%|          | 0/4 [00:00<?, ?it/s]

  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'labels': torch.tensor(self.encodings['input_ids'][idx])  # Use the same input as labels for language modeling
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.2080397605896, 'eval_runtime': 0.858, 'eval_samples_per_second': 1.166, 'eval_steps_per_second': 1.166, 'epoch': 1.0}


  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'labels': torch.tensor(self.encodings['input_ids'][idx])  # Use the same input as labels for language modeling


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.192970275878906, 'eval_runtime': 1.6361, 'eval_samples_per_second': 0.611, 'eval_steps_per_second': 0.611, 'epoch': 2.0}
{'train_runtime': 281.1882, 'train_samples_per_second': 0.064, 'train_steps_per_second': 0.014, 'train_loss': 2.7757365703582764, 'epoch': 2.0}
Train finished
Saving [...]


('./fine_tuned_llama_trump\\tokenizer_config.json',
 './fine_tuned_llama_trump\\special_tokens_map.json',
 './fine_tuned_llama_trump\\tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_llama_trump"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)


In [22]:
# Define your prompt
prompt = "My economic politics is "

# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate text
output = model.generate(
    input_ids,
    max_length=50,           # Set the maximum length of the generated text
    num_return_sequences=1,   # Number of generated sequences
    no_repeat_ngram_size=2,   # Prevent repetition
    top_k=50,                 # Top-K sampling to increase diversity
    top_p=0.95,               # Nucleus sampling to filter out low-probability tokens
    temperature=0.7           # Control randomness
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


My economic politics is 100% on the side of free enterprise, but I've always been a believer in a free market that is also a democracy.


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F


# Define your prompt
prompt = "I don't like"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate text with step-by-step probability output
generated_text = input_ids
max_length = 50  # Set the maximum length of the generated text

for _ in range(max_length - len(input_ids[0])):
    # Get model output (logits) for the current token
    with torch.no_grad():
        outputs = model(generated_text)
        logits = outputs.logits[:, -1, :]  # Focus on the logits for the last generated token

    # Get probabilities for the top tokens
    probs = F.softmax(logits, dim=-1)
    top_k_probs, top_k_indices = torch.topk(probs, k=10)  # Top 10 tokens

    # Decode the last generated token as context
    context_word = tokenizer.decode(generated_text[0, -1], skip_special_tokens=True)
    print(f"Previous word: '{context_word}'")

    # Decode the top tokens with their probabilities
    top_tokens = tokenizer.convert_ids_to_tokens(top_k_indices[0].tolist())
    top_probs = top_k_probs[0].tolist()

    print("Top Tokens and Probabilities:")
    for token, prob in zip(top_tokens, top_probs):
        print(f"{token}: {prob:.4f}")

    # Sample the next token from the probability distribution
    next_token = top_k_indices[0][0]  # Greedy approach, can also use sampling

    # Append the next token to the generated sequence
    generated_text = torch.cat((generated_text, next_token.unsqueeze(0).unsqueeze(0)), dim=1)

# Decode the final generated text
final_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(final_text)
