<a href="https://colab.research.google.com/github/Shirley-333/Text_Analysis_Final_Project/blob/main/Text_Analysis_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Location of the raw CSV on GitHub; per its file name it holds Billboard
# top-100 song lyrics for 2012 to 2022.
url="https://raw.githubusercontent.com/Pistonrings/Song-Lyrics-Analysis-2012-to-2022/refs/heads/master/billboard_2012_to_2022_top_100_song_lyrics.csv"

In [None]:
# Download the CSV and parse it into the working DataFrame (needs network access).
df = pd.read_csv(url)

In [None]:
# List the column names so later cells can refer to them exactly.
print(df.columns.tolist())

['Top100Year', 'SongTitle', 'Artist', 'LyricsStatus', 'Lyrics', 'ReleaseYear', 'Genre']


In [None]:

# Peek at the first three rows to sanity-check the load.
df.head(3)

Unnamed: 0,Top100Year,SongTitle,Artist,LyricsStatus,Lyrics,ReleaseYear,Genre
0,2012,Call Me Maybe,Carly Rae Jepsen,True,"I threw a wish in the well\nDon't ask me, I'll...",2012,"R&B/Soul, Pop"
1,2012,Payphone,Maroon 5 Featuring Wiz Khalifa,True,I'm at a payphone trying to call home\nAll of ...,2012,"R&B/Soul, Pop"
2,2012,Somebody That I Used To Know,Gotye Featuring Kimbra,True,Now and then I think of when we were together\...,2011,"R&B/Soul, Pop"


In [None]:
# Keep only rows whose LyricsStatus flag is True.
# NOTE(review): presumably the flag marks songs whose lyrics were retrieved — confirm against the dataset docs.
df = df[df['LyricsStatus'] == True]

In [None]:
# Drop songs that lack either a genre label or lyrics; both are required below.
df = df.dropna(subset=['Genre', 'Lyrics'])

In [None]:
# Number of distinct Genre strings. Each value is a comma-separated list of
# labels, so this counts label combinations rather than individual genres.
print(df['Genre'].nunique())

219


In [None]:
# Number of songs remaining after the filtering steps above.
print(len(df))

962


In [None]:
# Preview title, artist, genre and lyrics for the first five remaining rows.
df[['SongTitle', 'Artist', 'Genre', 'Lyrics']].head(5)

Unnamed: 0,SongTitle,Artist,Genre,Lyrics
0,Call Me Maybe,Carly Rae Jepsen,"R&B/Soul, Pop","I threw a wish in the well\nDon't ask me, I'll..."
1,Payphone,Maroon 5 Featuring Wiz Khalifa,"R&B/Soul, Pop",I'm at a payphone trying to call home\nAll of ...
2,Somebody That I Used To Know,Gotye Featuring Kimbra,"R&B/Soul, Pop",Now and then I think of when we were together\...
3,Wide Awake,Katy Perry,"R&B/Soul, Pop, UK R&B",I'm wide awake\nI'm wide awake\nI'm wide awake...
4,Where Have You Been,Rihanna,"Dance-pop, R&B/Soul, Electro house, Pop, UK R&B","I've been everywhere, man\nLooking for someone..."


In [None]:
# Tally songs per Genre string and expose the result as a two-column frame
# ('Genre', 'Count'), most frequent combination first.
genre_counts = (
    df['Genre']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)

In [None]:
# Show the first ten rows of the genre tally as plain text.
print(genre_counts.head(10))

                                           Genre  Count
0                         Country music, Country    190
1                          R&B/Soul, Hip-Hop/Rap    118
2                                  R&B/Soul, Pop     51
3                         Alternative/Indie, Pop     37
4                      Bachata, Musica tropicale     26
5               Alternative/Indie, R&B/Soul, Pop     18
6                            Hip-Hop/Rap, UK Rap     17
7                          R&B/Soul, Pop, UK R&B     17
8                     R&B/Soul, Pop, Hip-Hop/Rap     12
9  Alternative/Indie, R&B/Soul, Pop, Hip-Hop/Rap     12


In [None]:
# Plotly Express must be imported before its first use; without this import a
# fresh-kernel "Run All" stops here with NameError on `px`.
import plotly.express as px

# Horizontal bar chart of the ten most frequent genre combinations. The rows
# are re-sorted ascending so the largest bar is drawn at the top.
fig = px.bar(
    genre_counts.head(10).sort_values('Count', ascending=True),
    x='Count', y='Genre',
    orientation='h',
    title='Top 10 Genres by Song Count (2012–2022)',
    labels={'Genre': 'Music Genre', 'Count': 'Number of Songs'},
)
fig.update_traces(marker_color='pink')
fig.update_layout(
    plot_bgcolor='whitesmoke',
    paper_bgcolor='white',
    font=dict(size=14))
fig.show()

In [None]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer

# Download the NLTK resources requested for this notebook (same set as in class).
# quiet=True suppresses the per-package progress output.
needed = ["punkt_tab", "stopwords", "wordnet", "averaged_perceptron_tagger_eng"]
for pkg in needed:
    nltk.download(pkg, quiet=True)

In [None]:
def wordnet_pos(tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Only the first character of ``tag`` is inspected:
    "J" -> adjective, "N" -> noun, "V" -> verb, "R" -> adverb.

    Args:
        tag: POS tag string, e.g. the second item of a ``pos_tag`` pair.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``. Unrecognised, empty or missing tags fall back to
        ``wordnet.NOUN``.
    """
    if not tag:
        # tag[0] would raise IndexError on "" (TypeError on None); use the
        # same noun default that unknown tags get.
        return wordnet.NOUN
    return {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }.get(tag[0], wordnet.NOUN)

In [None]:
def tokenize_norm(text):
    """Split ``text`` into lowercase alphanumeric tokens.

    The input is coerced with ``str()`` first. Every maximal run of ASCII
    letters/digits becomes one token, so punctuation (including apostrophes)
    acts as a separator.
    """
    word_tokenizer = RegexpTokenizer(r"[A-Za-z0-9]+")
    raw_tokens = word_tokenizer.tokenize(str(text))
    return list(map(str.lower, raw_tokens))

In [None]:
def remove_stopwords(tokens, stops=None):
    """Return ``tokens`` with stop words removed, preserving order.

    Args:
        tokens: Iterable of token strings. Matching is exact, so tokens
            should already be lowercased to match the default list.
        stops: Optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used, exactly as before. Passing a
            prebuilt collection lets callers extend the list or avoid
            reloading it for every document.

    Returns:
        A new list containing the tokens that are not stop words.
    """
    if stops is None:
        stops = stopwords.words("english")
    # A set gives constant-time membership tests inside the comprehension.
    stop_set = frozenset(stops)
    return [t for t in tokens if t not in stop_set]


In [None]:
def pos_lemmatize(tokens):
    """Lemmatize ``tokens`` using the part of speech assigned by ``pos_tag``.

    Each token is tagged, the tag is translated with ``wordnet_pos``, and the
    WordNet lemmatizer is applied with that part of speech. The result has one
    lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, pos in pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos(pos)))
    return lemmas

In [None]:
def run_pipeline(text):
    """Turn raw text into a list of cleaned lemmas.

    Stages, applied in order: tokenize and lowercase, drop stop words, then
    POS-tag and lemmatize what remains.
    """
    return pos_lemmatize(remove_stopwords(tokenize_norm(text)))

In [None]:
# Smoke-test the pipeline on the first song of the filtered frame.
sample_text = df['Lyrics'].iloc[0]
clean_tokens = run_pipeline(sample_text)

# "Original length" is the number of whitespace-separated words in the raw lyrics.
print("Original length:", len(sample_text.split()))
print("Cleaned token count:", len(clean_tokens))
# Despite the label, this prints the first 30 tokens in lyric order, not the 30 most frequent.
print("Top 30 tokens:", clean_tokens[:30])

Original length: 416
Cleaned token count: 168
Top 30 tokens: ['threw', 'wish', 'well', 'ask', 'never', 'tell', 'look', 'fell', 'way', 'trade', 'soul', 'wish', 'penny', 'dim', 'kiss', 'look', 'way', 'stare', 'holdin', 'rip', 'jean', 'skin', 'showin', 'hot', 'night', 'wind', 'blowin', 'think', 'go', 'baby']


In [None]:
# Run the cleaning pipeline over every song. Limiting this to a prefix of the
# frame (e.g. .head(200)) leaves clean_tokens as NaN for all remaining rows, so
# the per-genre statistics and word counts below would silently describe only
# that slice while their titles claim to cover 2012–2022.
df['clean_tokens'] = df['Lyrics'].apply(run_pipeline)


In [None]:
import nltk
from nltk.probability import FreqDist

def lexical_diversity(tokens):
    """Return the type/token ratio: distinct words divided by total words.

    Non-string items in ``tokens`` are ignored. When no string tokens remain
    the divisor is clamped to 1, so the result is 0.0 rather than an error.
    """
    words = list(filter(lambda item: isinstance(item, str), tokens))
    total = len(words)
    distinct = len(set(words))
    return distinct / (total if total > 0 else 1)

In [None]:
# Score only the rows that hold a token list; index alignment on assignment
# leaves every other row's lexical_diversity as NaN.
df['lexical_diversity'] = df['clean_tokens'].dropna().apply(lexical_diversity)

In [None]:
# Mean lexical diversity per Genre string, ordered from most to least diverse.
genre_diversity = df.groupby('Genre', as_index=False)['lexical_diversity'].mean()
genre_diversity = genre_diversity.sort_values(by='lexical_diversity', ascending=False)

In [None]:
# The ten genre combinations with the highest mean lexical diversity.
genre_diversity.head(10)

Unnamed: 0,Genre,lexical_diversity
10,"Alternative/Indie, Dance/Electronic, Pop, Hip-...",0.660633
49,"Country music, House music, Folktronica, Dance...",0.6
189,"Rap rock, Electronic rock, Alternative/Indie, Pop",0.492172
3,"Alternative rock, Pop rock, Indie rock, Altern...",0.480333
18,"Alternative/Indie, R&B/Soul, Dance/Electronic,...",0.477366
153,"Pop, Country",0.471342
163,"R&B/Soul, Alternative/Indie, Pop",0.470982
76,"Electronic dance music, Electropop, Dance Pop",0.458515
46,"Country music, Country",0.455014
17,"Alternative/Indie, R&B/Soul, Children's Music,...",0.448669


In [None]:
import plotly.express as px

# Horizontal bar chart with one bar per genre combination in genre_diversity.
fig = px.bar(
    data_frame=genre_diversity,
    x='lexical_diversity',
    y='Genre',
    orientation='h',
    title='Average Lexical Diversity by Genre (2012–2022)',
    labels={'lexical_diversity': 'Average Lexical Diversity', 'Genre': 'Music Genre'},
)

# Notebook-wide look: pink bars on a light-grey plot area with a larger font.
fig.update_layout(plot_bgcolor='whitesmoke', paper_bgcolor='white', font={'size': 14})
fig.update_traces(marker_color='pink')

fig.show()

In [None]:
# Same chart restricted to the ten highest-scoring genre combinations. The
# slice is re-sorted ascending so the top scorer is drawn as the uppermost bar.
fig = px.bar(
    data_frame=genre_diversity.iloc[:10].sort_values(by='lexical_diversity'),
    x='lexical_diversity',
    y='Genre',
    orientation='h',
    title='Top 10 Genres by Lexical Diversity (2012–2022)',
    labels={'lexical_diversity': 'Average Lexical Diversity', 'Genre': 'Music Genre'},
)

fig.update_layout(plot_bgcolor='whitesmoke', paper_bgcolor='white', font={'size': 14})
fig.update_traces(marker_color='pink')

fig.show()

In [None]:
from nltk.probability import FreqDist

In [None]:
# Songs whose Genre string contains "pop" anywhere (case-insensitive substring
# match); rows with a missing Genre are treated as non-matches.
subset = df[df['Genre'].str.contains('Pop', case=False, na=False)]


In [None]:
from nltk.probability import FreqDist
import pandas as pd
import plotly.express as px

# Songs whose Genre string contains "pop" anywhere (case-insensitive).
subset = df[df['Genre'].str.contains('Pop', case=False, na=False)]
print("Pop songs found:", len(subset))

# The cleaned tokens are stored in 'clean_tokens'; no 'clean_lyrics' column is
# ever created, so reading it raised KeyError. dropna() skips songs that have
# no token list (NaN), which cannot be iterated.
all_words = [word for tokens in subset['clean_tokens'].dropna() for word in tokens]
fdist = FreqDist(all_words)
top_words = fdist.most_common(20)
word_df = pd.DataFrame(top_words, columns=['Word', 'Count'])

# Horizontal bar chart of the 20 most frequent cleaned tokens.
fig = px.bar(
    word_df,
    x='Count',
    y='Word',
    orientation='h',
    title='Top 20 Words in Pop Lyrics',
    labels={'Count': 'Frequency', 'Word': 'Word'})

fig.update_traces(marker_color='#ff99cc')
fig.update_layout(
    plot_bgcolor='whitesmoke',
    paper_bgcolor='white',
    font=dict(size=14))
fig.show()

Pop songs found: 494


In [None]:
# Code structure refined with the help of ChatGPT
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
# Fetch the VADER lexicon that SentimentIntensityAnalyzer relies on.
nltk.download("vader_lexicon", quiet=True)
sia = SentimentIntensityAnalyzer()

# One sentiment value per song: the 'compound' entry of the analyzer's
# polarity scores, computed on the raw (uncleaned) lyrics text.
df['sentiment'] = df['Lyrics'].map(
    lambda lyric: sia.polarity_scores(str(lyric))['compound']
)

In [None]:
import plotly.express as px

# Work on an explicit copy: adding a column to a filtered slice of df triggers
# pandas' SettingWithCopyWarning and may not behave as intended.
subset = df[df['Genre'].str.contains('Pop', case=False, na=False)].copy()

# Collapse each genre string into one coarse label. The comparison is
# lowercased so it agrees with the case-insensitive filter above; otherwise
# strings containing only lowercase "pop" (e.g. "Electropop") were labelled
# 'Other' and showed up as a third bar.
# NOTE(review): because subset holds only songs whose Genre contains "pop",
# the 'Hip-Hop/Rap' group is limited to hip-hop songs that are also tagged
# pop — confirm this is the intended comparison.
subset['Main_Genre'] = subset['Genre'].astype(str).str.lower().apply(
    lambda g: 'Hip-Hop/Rap' if 'hip-hop' in g
    else 'Pop' if 'pop' in g else 'Other')

# Mean sentiment per coarse label, highest first.
genre_sentiment_compare =(
    subset.groupby('Main_Genre', as_index=False)['sentiment']
    .mean()
    .sort_values('sentiment', ascending=False))

fig = px.bar(
    genre_sentiment_compare,
    x='Main_Genre',
    y='sentiment',
    title='Average Sentiment: Pop vs Hip-Hop/Rap (2012–2022)',
    labels={'sentiment': 'Average Sentiment Score', 'Main_Genre': 'Music Genre'}
)

fig.update_traces(marker_color='pink')
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(size=15),
    xaxis=dict(tickangle=0),
    # Fixed y-axis from -1 to 1 so the bars are read against a symmetric scale.
    yaxis=dict(range=[-1, 1])
)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

