# Sentiment analysis of Formula 1 radio messages

This is the second notebook in the project on analyzing Formula 1 team radio messages and extracting valuable information from them.

For the sentiment analysis, we will first use **nltk** with VADER.

#### Importing necessary libraries and downloading the VADER lexicon

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from tqdm.notebook import tqdm

# Set styling for visualizations.
# Order matters here: the seaborn call runs after the matplotlib style is applied.
# NOTE(review): sns.set() also writes matplotlib rcParams, so it presumably
# overrides at least part of the 'ggplot' style set on the line above —
# confirm which of the two looks is actually intended.
plt.style.use('ggplot')
sns.set(style="whitegrid")

In [11]:
# Download necessary NLTK resources for sentiment analysis.
# 'vader_lexicon' is the only package requested; it is presumably what the
# SentimentIntensityAnalyzer instantiated further down relies on.
# The call's return value is the cell output.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

--- 

### Loading the transcribed messages


In [12]:
# Relative location of the CSV holding the transcribed radio messages
data_path = "../../outputs/week4/radios_raw.csv"
radio_data = pd.read_csv(data_path)

# Report how many rows were loaded, then preview the first few of them
print(f"Number of transcribed radio messages: {radio_data.shape[0]}")
radio_data.head()

Number of transcribed radio messages: 210


Unnamed: 0,driver,filename,file_path,text,duration
0,1,"driver_(1,)_belgium_radio_39.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...",Are we following the ringer or what? You just ...,15.168
1,1,"driver_(1,)_belgium_radio_40.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...","Okay Max, we're expecting rain in about 9 or 1...",15.576
2,1,"driver_(1,)_belgium_radio_60.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...","Mayor Manilow, mayI X TEA ****",5.424
3,1,"driver_(1,)_belgium_radio_62.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...",You might find this lap that you meet a little...,5.088
4,1,"driver_(1,)_belgium_radio_63.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...",Just another two or three minutes to get throu...,5.712


--- 

### Initializing the VADER sentiment analyzer

In [13]:
# Initializing the sentiment analyzer.
# One module-level instance is created here and reused by
# get_sentiment_scores for every message.
sid = SentimentIntensityAnalyzer()

# Get the sentiment scores
def get_sentiment_scores(text):
    """Return the polarity scores of a single radio message.

    Missing (None/NaN) or empty messages are never handed to the analyzer;
    they receive an all-zero score dictionary instead.

    Args:
        text: Transcribed radio message, or a missing value.

    Returns:
        A dict of scores. For missing/empty input it holds the keys
        "neg", "neu", "pos" and "compound", all set to 0; otherwise it is
        whatever ``sid.polarity_scores`` returns for the message.
    """
    has_content = not (pd.isna(text) or text == "")
    if has_content:
        return sid.polarity_scores(text)

    # Nothing to analyze: report zero for every score (fresh dict on each call)
    return dict.fromkeys(("neg", "neu", "pos", "compound"), 0)

---
### Applying VADER


In [14]:
# We apply VADER to each transcription.
# tqdm.pandas() makes `progress_apply` available so the run shows a progress bar.
tqdm.pandas(desc = "Analyzing sentiment")
sentiment_scores = radio_data["text"].progress_apply(get_sentiment_scores)

# Convert sentiment scores to DataFrame columns.
# The score frame is built on radio_data's own index so each score row stays
# attached to its message even if radio_data does not carry a default 0..n-1 index.
sentiment_df = pd.DataFrame(sentiment_scores.to_list(), index = radio_data.index)

# Drop score columns left over from a previous run of this cell before attaching
# the new ones: concatenating twice would create duplicate "compound" columns and
# radio_data["compound"] would then return a DataFrame instead of a Series.
radio_data = pd.concat(
    [radio_data.drop(columns = sentiment_df.columns, errors = "ignore"), sentiment_df],
    axis = 1,
)

# Adding a simple category based on compound score

def categorize_sentiment(compound, threshold=0.05):
    """Map a compound polarity score to a coarse sentiment label.

    Args:
        compound: Compound score of one message.
        threshold: Non-negative cut-off applied symmetrically around zero.
            Defaults to 0.05, the value that used to be hard-coded here.

    Returns:
        "positive" if ``compound >= threshold``, "negative" if
        ``compound <= -threshold``, otherwise "neutral". A NaN score ends up
        as "neutral" because both comparisons evaluate to False.
    """
    if compound >= threshold:
        return "positive"
    if compound <= -threshold:
        return "negative"
    return "neutral"
    
# Label every message from its compound score
radio_data["sentiment"] = radio_data["compound"].map(categorize_sentiment)

# Preview the frame, now including the label column
radio_data.head()

Analyzing sentiment:   0%|          | 0/210 [00:00<?, ?it/s]

Unnamed: 0,driver,filename,file_path,text,duration,neg,neu,pos,compound,sentiment
0,1,"driver_(1,)_belgium_radio_39.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...",Are we following the ringer or what? You just ...,15.168,0.059,0.691,0.25,0.7351,positive
1,1,"driver_(1,)_belgium_radio_40.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...","Okay Max, we're expecting rain in about 9 or 1...",15.576,0.0,0.941,0.059,0.3485,positive
2,1,"driver_(1,)_belgium_radio_60.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...","Mayor Manilow, mayI X TEA ****",5.424,0.0,1.0,0.0,0.0,neutral
3,1,"driver_(1,)_belgium_radio_62.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...",You might find this lap that you meet a little...,5.088,0.0,1.0,0.0,0.0,neutral
4,1,"driver_(1,)_belgium_radio_63.mp3","..\..\f1-strategy\data\audio\driver_(1,)\drive...",Just another two or three minutes to get throu...,5.712,0.0,1.0,0.0,0.0,neutral


---
### Analyzing sentiment distribution

In [None]:
# Analyze sentiment distribution: absolute counts first, then the share of each label
sentiment_counts = radio_data['sentiment'].value_counts()
total_messages = len(radio_data)

print("Sentiment distribution:")
print(sentiment_counts)

# One loop instead of three copy-pasted prints. A label that never occurs counts
# as 0, and an empty frame reports 0.00% instead of raising ZeroDivisionError.
for label in ("positive", "neutral", "negative"):
    share = sentiment_counts.get(label, 0) / total_messages * 100 if total_messages else 0.0
    print(f"Percentage of {label} messages: {share:.2f}%")



In [None]:
# Visualize sentiment distribution: one bar per label, colored by its meaning
sentiment_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=radio_data, x='sentiment', palette=sentiment_colors, ax=ax)
ax.set_title('Sentiment Distribution of F1 Team Radio Messages', fontsize=15)
ax.set_xlabel('Sentiment', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.tick_params(axis='x', labelrotation=0)
fig.tight_layout()
plt.show()

---
### Analyzing sentiment distribution by driver


In [None]:
# Examine sentiment distribution by driver: share (%) of each label within a driver's messages
driver_sentiment = pd.crosstab(radio_data['driver'], radio_data['sentiment'], normalize='index') * 100

# Make sure all three label columns exist: a label that never occurs gets no column
# in the crosstab, and sorting by a missing 'positive' column would raise KeyError.
driver_sentiment = driver_sentiment.reindex(columns=['negative', 'neutral', 'positive'], fill_value=0.0)
driver_sentiment = driver_sentiment.sort_values(by='positive', ascending=False)

# Plot driver sentiment distribution.
# DataFrame.plot opens its own figure unless it is handed an Axes, so the Axes is
# created explicitly and passed in; a separate plt.figure() call beforehand would
# only leave an empty figure behind.
fig, ax = plt.subplots(figsize=(12, 8))
driver_sentiment.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Sentiment Distribution by Driver (%)', fontsize=15)
ax.set_xlabel('Driver', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Sentiment')
fig.tight_layout()
plt.show()