In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string, os 
import tensorflow as tf

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem under /content/drive.
drive.mount(mountpoint='/content/drive')

In [None]:
# Read the lyrics CSV from the mounted drive; the path is relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer='drive/MyDrive/lyrics-data.csv')
df.head(n=5)

In [None]:
# Drop the artist-link, song-name and song-link columns.
# Written without `inplace=True` and with `errors='ignore'` so that re-running
# this cell after the columns are already gone does not raise a KeyError.
df = df.drop(columns=['ALink', 'SName', 'SLink'], errors='ignore')

In [None]:
# Number of rows per value of the `language` column.
df.loc[:, 'language'].value_counts()

In [None]:
# Extract just the English-language songs.
# `.copy()` makes `df` an independent frame, so adding new columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()

In [None]:
# Show the column labels currently present in `df`.
df.columns

In [None]:
# Explicit `%pip` magic: the bare `pip install ...` form relies on IPython's
# automagic, which is only applied to single-line cells.
%pip install textblob

In [None]:
from textblob import TextBlob

def sentiment_label(text):
    """Map a lyric to a coarse sentiment label using TextBlob polarity.

    Parameters
    ----------
    text : str
        Lyric text to score. Non-string values (e.g. the float ``NaN`` that
        pandas uses for a missing CSV cell) and blank strings are not scored.

    Returns
    -------
    str
        ``'happy'`` when the polarity is above zero, ``'sad'`` when it is
        below zero, and ``'neutral'`` otherwise (including unscorable input).
    """
    # TextBlob only accepts strings; treat anything it cannot score as
    # neutral instead of letting one bad row abort the whole `.apply`.
    if not isinstance(text, str) or not text.strip():
        return 'neutral'

    sentiment_score = TextBlob(text).sentiment.polarity
    if sentiment_score > 0:
        return 'happy'
    if sentiment_score < 0:
        return 'sad'
    return 'neutral'

# Label every lyric by running it through `sentiment_label`.
df['sentiment'] = df['Lyric'].map(sentiment_label)

In [None]:
# Preview the first five rows of `df`.
df.head(n=5)

In [None]:
# Number of rows per sentiment label in `df`.
df.loc[:, 'sentiment'].value_counts()

In [None]:
# Randomly select 79,000 rows labelled 'happy' for removal.
# A fixed `random_state` makes the selection (and everything computed from it
# afterwards) reproducible across kernel restarts.
rows_to_remove = df[df['sentiment'] == 'happy'].sample(n=79000, random_state=42)

In [None]:
# Sanity check: count the sentiment labels among the sampled rows.
rows_to_remove.loc[:, 'sentiment'].value_counts()

In [None]:
# New frame without the sampled rows; `df` itself is left untouched.
df_modified = df.drop(index=rows_to_remove.index)

In [None]:
# Label distribution after removing the sampled rows.
df_modified.loc[:, 'sentiment'].value_counts()

In [None]:
# Collect every row of `df` whose sentiment label is 'neutral'.
neutral_senti_rows_to_remove = df.loc[df['sentiment'].eq('neutral')]

In [None]:
# Count of the collected neutral rows.
neutral_senti_rows_to_remove.loc[:, 'sentiment'].value_counts()

In [None]:
# Remove the neutral rows from the already reduced frame.
df_modified = df_modified.drop(index=neutral_senti_rows_to_remove.index)

In [None]:
# Label distribution once the neutral rows are gone.
df_modified.loc[:, 'sentiment'].value_counts()

In [None]:
# Preview the first five rows of `df_modified`.
df_modified.head(n=5)

In [None]:
# Prefix each lyric with its label, giving "<sentiment>: <lyric>".
sentiment_prefix = df_modified['sentiment'] + ": "
df_modified['lyrics_with_sentiment'] = sentiment_prefix + df_modified['Lyric']

In [None]:
# Preview the frame including the new `lyrics_with_sentiment` column.
df_modified.head(n=5)

In [None]:
# Explicit `%pip` magic: the bare `pip install ...` form relies on IPython's
# automagic, which is only applied to single-line cells.
%pip install transformers


In [None]:
# Keep only the first 10,000 rows (in existing row order) for training.
df_modified_truncated = df_modified.head(10000)

In [None]:
# Write the training corpus as plain UTF-8 text, one entry after another.
# `to_csv` is avoided on purpose: its default quoting wraps any field that
# contains a comma, quote or newline in double quotes (and doubles embedded
# quotes), so those CSV artefacts would end up in the text the model learns.
with open('lyrics_dataset_modified.txt', 'w', encoding='utf-8') as corpus_file:
    # Missing values are skipped rather than written out as text.
    for song_text in df_modified_truncated['lyrics_with_sentiment'].dropna():
        corpus_file.write(f"{song_text}\n")

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Tokenizer that matches the pre-trained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path="gpt2")

# Tokenize dataset and create TextDataset
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` from a plain-text file.

    Parameters
    ----------
    file_path : str
        Path of the text file to tokenize.
    tokenizer
        Tokenizer instance handed to ``TextDataset``.
    block_size : int, optional
        Value forwarded as ``TextDataset``'s ``block_size`` argument.
        Defaults to 128, the value that was previously hard-coded.

    Returns
    -------
    TextDataset
        The dataset constructed from ``file_path``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

# Tokenize the exported lyrics file into the training dataset.
train_dataset = load_dataset(
    file_path='lyrics_dataset_modified.txt',
    tokenizer=tokenizer,
)

# Batch builder for the Trainer, with masked-language-modelling switched off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments

# Start from the pre-trained "gpt2" checkpoint.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path="gpt2")

# Run configuration: 4 epochs, batch size 4 per device, a checkpoint every
# 100 steps with at most 2 kept, outputs in ./output and logs in ./logs.
training_args = TrainingArguments(
    num_train_epochs=4,
    per_device_train_batch_size=4,
    output_dir="./output",
    overwrite_output_dir=True,
    save_steps=100,
    save_total_limit=2,
    logging_dir='./logs',
)

# Bundle model, configuration, collator and dataset into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Save the fine-tuned model to ./output, the directory the generation
# pipeline loads from.
trainer.save_model(output_dir="./output")

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the model saved in ./output, reusing the
# tokenizer that is already in memory.
generator = pipeline(task='text-generation', model='./output', tokenizer=tokenizer)

# Ask for a single continuation of a sentiment-prefixed prompt.
# NOTE(review): the prompt ends with a space, which may tokenize differently
# from the "sad: <lyric>" training text — confirm this is intended.
generations = generator("sad: ", max_length=100, num_return_sequences=1)
generated_text = generations[0]['generated_text']
print(generated_text)