# Чтение файла и фильтрация слов

In [1]:
import pandas as pd

songs = pd.read_csv('data/songs.csv')
songs = songs[:3000]

allowed_chars = set(list("qwertyuiopasdfghjklzxcvbnm' "))
mapping = lambda s: ''.join(list(filter(lambda c: c in allowed_chars, s)))
songs['Lyrics'] = songs['Lyrics'].apply(lambda s: mapping(str(s)))

# Подсчёт 5% наиболее частых слов

In [2]:
from collections import Counter

words = ' '.join(songs['Lyrics']).split(' ')
counts = Counter(words)
limit = int(0.05 * len(counts.keys()))
ordered = counts.most_common()
most_common = ordered[:limit]
print(most_common)

[('the', 23367), ('i', 22775), ('you', 21562), ('to', 13591), ('and', 13137), ('a', 13079), ('me', 11010), ('my', 8923), ('in', 8384), ('it', 7887), ('of', 6585), ('your', 6173), ("i'm", 6058), ('on', 5588), ('that', 5346), ('all', 4863), ('be', 4765), ('is', 4672), ('we', 4574), ('for', 4531), ('so', 4178), ("don't", 4107), ('but', 3802), ('love', 3692), ('know', 3624), ('no', 3611), ('just', 3370), ('like', 3331), ("it's", 3279), ('this', 3036), ('what', 3001), ('with', 2984), ('when', 2965), ('', 2954), ('up', 2817), ('can', 2625), ('got', 2609), ('if', 2566), ('oh', 2522), ('do', 2491), ("you're", 2476), ('now', 2460), ('are', 2409), ('out', 2404), ('go', 2394), ('down', 2350), ('one', 2212), ('get', 2135), ('not', 2115), ('was', 2087), ('never', 2085), ('yeah', 2033), ('time', 2008), ('come', 1902), ("can't", 1881), ('let', 1850), ('will', 1844), ('want', 1778), ('have', 1770), ('see', 1729), ('che', 1677), ('they', 1622), ('at', 1603), ('say', 1592), ('e', 1577), ('gonna', 1562),

# Удаление из текстов стоп-слов

In [3]:
import nltk

stopwords = nltk.corpus.stopwords.words('english')

filter_significant = lambda s: ' '.join(
    list(filter(lambda w: not ((w, counts[w]) in most_common or (w, counts[w]) in stopwords), s.split(' ')))
)
songs['Lyrics'] = songs['Lyrics'].apply(filter_significant)
pd.DataFrame(songs['Lyrics']).to_csv('data/lyrics.csv', index=False, header=False)

# Разметка текстов песен по настроению (Positive, Neutral, Negative) с помощью GPT-3.5

In [4]:
from openai import OpenAI
import csv
import time

client = OpenAI(api_key='deleted')

header = ['Artist', 'Song', 'Sentiment', 'Sentiment_rate', 'Lyrics']

file = open('data/sentiment_songs.csv', 'a+', encoding='utf-8')
writer = csv.DictWriter(file, fieldnames=header, lineterminator='\n')
writer.writeheader()

for i in range(71, 100):
    for k in range(3):
        request = "What is the sentiment of the following texts:\n"
        for j in range(10):
            request = request + songs['Lyrics'].iloc[i * 30 + k * 10 + j] + "\n"
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system",
                 "content": "You have to determine the sentiment of the listed 10 songs lyrics and answer for each text in one word. The sentiment can be Positive, Neutral, Negative. Your answer must be a list of 10 comma-separated values."},
                {"role": "user", "content": request}
            ]
        )
        answers = response.choices[0].message.content.split(',')
        print(answers)
        for j in range(len(answers)):
            answer = answers[j].strip()
            rate = 0
            if answer == 'Positive':
                rate = 1
            elif answer == 'Negative':
                rate = -1
            writer.writerow({
                'Artist': songs['Artist'].iloc[i * 30 + k * 10 + j],
                'Song': songs['Song'].iloc[i * 30 + k * 10 + j],
                'Sentiment': answer,
                'Sentiment_rate': rate,
                'Lyrics': songs['Lyrics'].iloc[i * 30 + k * 10 + j],
            })
    time.sleep(60)
file.close()


KeyboardInterrupt



# Подготовка к токенизации текстов

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(vocab_size=5000)
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train(['data/lyrics.csv'], trainer)

# Обучение модели для определения настроения текста

In [25]:
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

sentiments = pd.read_csv('data/sentiment_songs.csv')
lyrics_train, lyrics_test, sentiment_train, sentiment_test = (
    train_test_split(sentiments['Lyrics'], sentiments['Sentiment'], test_size=0.3))

classifier = SklearnClassifier(MLPClassifier(hidden_layer_sizes=(10,), activation='tanh', alpha=0.001))
analyzer = SentimentAnalyzer()
train_size = len(lyrics_train)
train_data = [None] * train_size
for i in range(train_size):
    train_data[i] = (
        tokenizer.encode(lyrics_train.iloc[i]).tokens,
        sentiment_train.iloc[i]
    )
unigram_feats = analyzer.unigram_word_feats(analyzer.all_words(train_data))
analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
train_set = analyzer.apply_features(train_data, labeled=True)
analyzer.train(classifier.train, train_set)

Training classifier


<SklearnClassifier(MLPClassifier(activation='tanh', alpha=0.001, hidden_layer_sizes=(10,)))>

# Подсчёт метрики на тренировочном и тестовом множествах

In [27]:
from sklearn.metrics import accuracy_score

train_results = [None] * train_size
for i in range(train_size):
    train_results[i] = analyzer.classify(tokenizer.encode(lyrics_train.iloc[i]).tokens)

test_size = len(lyrics_test)
test_results = [None] * test_size
for i in range(test_size):
    test_results[i] = analyzer.classify(tokenizer.encode(lyrics_test.iloc[i]).tokens)
    
print("Train accuracy score: ", accuracy_score(sentiment_train, train_results))
print("Test accuracy score: ", accuracy_score(sentiment_test, test_results))

Train accuracy score:  0.8867165692131219
Test accuracy score:  0.5774804905239688
