In [2]:
import os
import re
import torch
import torch.nn as nn
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, pipeline, BertConfig
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm  # Import tqdm for progress tracking

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load and Preprocess the Data
nltk.download('stopwords')
nltk.download('punkt')

# Directory containing the .txt files
DATA_DIR = "C:/Users/snigd/Downloads/train_data"

# Function to preprocess text
def preprocess_text(text):
    # Remove non-word characters and extra whitespace
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize words and remove stopwords
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    preprocessed_text = ' '.join(words)
    return preprocessed_text

# Function to load and preprocess the dataset file-by-file
def load_dataset(data_dir):
    all_texts = []
    # Use tqdm to add a progress bar for file processing
    for filename in tqdm(os.listdir(data_dir), desc="Processing files"):
        if filename.endswith(".txt"):
            file_path = os.path.join(data_dir, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    preprocessed_text = preprocess_text(text)
                    all_texts.append(preprocessed_text)  # Append the entire preprocessed text
            except UnicodeDecodeError:
                with open(file_path, 'r', encoding='ISO-8859-1') as file:
                    text = file.read()
                    preprocessed_text = preprocess_text(text)
                    all_texts.append(preprocessed_text)  # Append the entire preprocessed text
    return all_texts

# Load and preprocess the dataset file-by-file with progress tracking
texts = load_dataset(DATA_DIR)

# Assuming all preprocessed texts are loaded into a DataFrame
data = pd.DataFrame({'Commentary': texts})

# Display the first few rows of the DataFrame to verify
print(data.head())




Using device: cuda


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\snigd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\snigd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Processing files: 100%|██████████| 3840/3840 [00:06<00:00, 620.99it/s]

                                          Commentary
0  man claiming associate Chinese tennis star Pen...
1  Lennon brands Rangers favourites Celtics Neil ...
2  Nigerias Blessing Oborududu advanced Final wom...
3  Gerrard happy Anfield Liverpool captain Steven...
4  Williams battles Aussie title Serena Williams ...





In [4]:

#  get BERT embeddings for a given text
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embedding.squeeze()


In [6]:
# Define a list of sport names for extraction
sport_names = ["soccer/football", "american football", "basketball", "cricket", "tennis", "baseball", "athletics"]

# Labels for multi-label encoding
label_list = [
    "score-related", 
    "assist/playmaking", 
    "foul/penalty",  
    "substitution/injury", 
    "defense actions",  
    "game outcome" 
]

label_to_id = {label: i for i, label in enumerate(label_list)}

contextual_words = {
    "soccer/football": [
        "goal post", "corner", "free kick", "penalty", "offside", 
        "crossbar", "goal", "netted", "assist", "yellow card", 
        "red card", "goalkeeper", "defender", "midfielder", "striker", "winger",
        "goal", "header", "equalizer", "stoppage time", "penalty shootout",
        "counter-attack", "corner kick", "soccer", "FIFA", "UEFA Champions League"
    ],
    "basketball": [
        "hoop", "three-pointer", "dribble", "slam dunk", "free throw", "dribble", "guard",
        "alley-oop", "backboard", "dunk", "rebound", "assist", "pass", "block",
        "steal", "turnover", "fast break", "point guard", "shooting guard", 
        "small forward", "power forward", "center", "layup", "buzzer-beater",
        "man-to-man defense", "zone defense", "basketball", "NBA", "NCAA"
    ],
    "american football": [
        "end zone", "field goal", "touchdown", "huddle", "scrimmage", 
        "quarterback sneak", "line of scrimmage", "sack", "fumble", "punt", 
        "interception", "pass interference", "tackle", "field goal", "tackle",
        "quarterback", "running back", "wide receiver", "linebacker", "kickoff",
        "american football", "NFL", "football", "american football", "Super Bowl"
    ],
    "cricket": [
        "pitch", "crease", "boundary", "overs", "wicket", "innings", "bowling", "batting", 
        "stump", "run rate", "batsman", "bowler", "fielder", "wicketkeeper",
        "run out", "century", "no ball", "wide ball", "catch out", "duck",
        "maiden over", "leg spin", "off spin", "googly", "reverse swing", "cricket",
        "ICC Cricket World Cup", "Indian Premier League (IPL)", "ICC T20 World Cup"
    ],
    "tennis": [
        "court", "serve", "racket", "volley", "backhand", "forehand", "deuce", "set", "singles", "doubles",
        "advantage", "tiebreak", "ace", "match point", "double fault", "break point",
        "server", "receiver", "baseline", "drop shot", "lob", "grand slam", "grass court", 
        "tennis", "Wimbledon", "US Open", "French Open", "Australian Open", "rally"
   ],
    "baseball": [
        "pitcher's mound", "home plate", "strike zone", "innings", 
        "dugout", "double play", "home run", "pitcher", "catcher", "baseman",
        "outfielder", "strikeout", "hit", "bunt", "curveball", "fastball",
        "walk-off", "line drive", "baseball", "World Series", "Nippon Series", 
         "College World Series", "World Baseball Classic (WBC)"
    ],
    "athletics": [
        "track", "lane", "field", "relay", "javelin", "high jump", "long jump", 
        "marathon", "hurdle", "race", "sprint", "throw", "jump", "swim", "dive", "gymnastics",
        "sprinter", "marathoner", "runner", "jumper", "thrower", "hurdler", "wrestling", 
        "pole vaulter", "shot putter", "decathlete", "heptathlete", "starting block",
        "photo finish", "personal best", "athletics", "olympics", "athlete", "Olympics", "Commonwealth Games"
    ],
   
}

# Helper function to count and weigh matches
def weighted_count_matches(words, text, boost_keywords=None, boost_factor=1):
    count = 0
    for word in words:
        if word in text:
            count += 1
            # Apply a higher boost if the word is in boost_keywords
            if boost_keywords and word in boost_keywords:
                count += boost_factor  # Apply the boost factor more strongly
    return count

def generate_multi_label(text):
    labels = [0] * len(label_list)
    text = text.lower()
    sports = {}

    # Score-Related Events
    if weighted_count_matches([
        "goal", "netted", "touchdown", "td", "scored", "wicket", "lbw", 
        "six", "boundary", "maximum", "four", "goal line", "conversion", 
        "penalty kick", "field goal", "extra point", "try", "strike", "hit",
        "home run", "grand slam", "run", "free throw", "service game", 
        "inning", "shot", "swing", "drive", "smash", "overhead", "homer",
        "inning", "power play", "hat-trick", "equalizer"
    ], text) > 1:
        labels[label_to_id["score-related"]] = 1

    # Assist/Playmaking
    if weighted_count_matches([
        "assist", "played in", "pass", "setup", "set up", "through ball", 
        "cross", "assist", "feed", "layup", "pass completion", "dummy run",
        "lob", "kick out", "drive", "fast break", "pick and roll", "volley",
        "build-up play", "one-two", "give-and-go", "support"
    ], text, boost_keywords=["assist", "pass", "layup"], boost_factor=3) > 0:
        labels[label_to_id["assist/playmaking"]] = 1

    # Foul/Penalty
    if weighted_count_matches([
        "foul", "penalty", "yellow card", "booked", "red card", "sent off", 
        "dismissed", "ejection", "error", "violation", "handball", "offside", 
        "technical foul", "flag", "encroachment", "false start", "holding",
        "unsportsmanlike conduct", "pass interference", "illegal contact",
        "foul trouble", "flagrant foul", "charging", "blocking foul"
    ], text, boost_keywords=["foul", "penalty", "red card"], boost_factor=3) > 0:
        labels[label_to_id["foul/penalty"]] = 1

    # Substitution/Injury
    if weighted_count_matches([
        "substitution", "replaced", "injury", "hurt", "wounded", "injuries", 
        "substitute", "replacement", "cramp", "knock", "concussion", 
        "withdrawn", "hamstring", "broken", "ligament", "pain", "horrific",
        "limping", "injury time out", "stretchered off", "medical"
    ], text) > 0:
        labels[label_to_id["substitution/injury"]] = 1

    # Defense Actions
    if weighted_count_matches([
        "tackle", "rebound", "save", "block", "defense", "defensive play", 
        "sack", "fumble", "punt", "interception", "clearance", "block shot", 
        "defensive stop", "steal", "turnover", "forced fumble", "goal-line stand",
        "double play", "steal", "throw-in", "corner", "restart", "fast break",
        "pressure", "slide tackle", "marking"
    ], text, boost_keywords=["block", "steal", "interception", "tackle"], boost_factor=3) > 0:
        labels[label_to_id["defense actions"]] = 1

    # Game Outcome
    if weighted_count_matches([
        "victory", "win", "secured", "dominated", "triumph", "conquered", 
        "prevailed", "loss", "defeat", "lost", "outplayed", 
        "clinched", "beat",  "narrow win", "world record",
        "draw", "stalemate", "tie"
    ], text) > 1:
        labels[label_to_id["game outcome"]] = 1

    # Determine the sport based on contextual words
    for sport, words in contextual_words.items():
        match_count = weighted_count_matches(words, text)
        if match_count > 1: 
            sports[sport] = match_count

    # Sort sports based on the count of matching contextual words (desc)
    sorted_sports = sorted(sports.items(), key=lambda x: x[1], reverse=True)
    final_sport = sorted_sports[0][0] if sorted_sports else None

    return labels, final_sport

# Apply the multi-label encoding to the dataset
data['labels'], data['sports'] = zip(*data['Commentary'].apply(generate_multi_label))
print(f"Number of rows after filtering: {len(data)}")


Number of rows after filtering: 3840


In [7]:
# Convert 'labels' column to string to easily compare with a string representation of [0,0,0,0,0,0]
data['labels_str'] = data['labels'].apply(lambda x: str(x))

# Filter out rows where 'labels' is [0,0,0,0,0,0] and 'sports' is None or empty
data = data[(data['labels_str'] != str([0, 0, 0, 0, 0, 0])) | (data['sports'].notna() & data['sports'].astype(bool))]

# Drop the helper column 'labels_str'
data = data.drop(columns=['labels_str'])

# Checking the resulting DataFrame
print(f"Number of rows after filtering: {len(data)}")
print(data.head())


Number of rows after filtering: 2184
                                          Commentary              labels  \
1  Lennon brands Rangers favourites Celtics Neil ...  [1, 0, 0, 0, 0, 1]   
2  Nigerias Blessing Oborududu advanced Final wom...  [0, 0, 0, 0, 0, 0]   
3  Gerrard happy Anfield Liverpool captain Steven...  [1, 0, 0, 0, 0, 0]   
4  Williams battles Aussie title Serena Williams ...  [1, 0, 1, 1, 0, 1]   
5  Redknapps Saints face Pompey tie New Southampt...  [1, 0, 0, 0, 0, 1]   

            sports  
1  soccer/football  
2        athletics  
3  soccer/football  
4           tennis  
5             None  


In [10]:
# Calculate the number of rows labeled for each label
label_counts = {label: 0 for label in label_list}

for labels in data['labels']:
    for i, label in enumerate(labels):
        if label == 1:
            label_counts[label_list[i]] += 1

# Print the number of rows labeled for each label
for label, count in label_counts.items():
    print(f"Number of rows labeled '{label}': {count}")


Number of rows labeled 'score-related': 1251
Number of rows labeled 'assist/playmaking': 629
Number of rows labeled 'foul/penalty': 346
Number of rows labeled 'substitution/injury': 842
Number of rows labeled 'defense actions': 437
Number of rows labeled 'game outcome': 1082


In [12]:
# Step 3: Using Class Weights
labels_df = pd.DataFrame(data['labels'].tolist(), columns=label_list)

# Flatten the labels to compute class weights
flat_labels = np.array(labels_df.values.flatten())

# Step 1: Calculate Initial Class Weights
initial_class_weights = compute_class_weight('balanced', classes=np.unique(flat_labels), y=flat_labels)
class_weights_dict = {i: weight for i, weight in enumerate(initial_class_weights)}
# Calculate class weights
# Define a threshold below which classes are considered as minority
minority_threshold = 500  

adjusted_class_weights_dict = class_weights_dict.copy()
for label, weight in class_weights_dict.items():
    if labels_df[label_list[label]].sum() < minority_threshold:
        adjusted_class_weights_dict[label] = weight * 2.5  # Multiply by 2 to make weights more aggressive

adjusted_class_weights_tensor = torch.tensor(list(adjusted_class_weights_dict.values())).float().to(device)

In [13]:
from datasets import Dataset

# Step 4: Convert the dataset to Dataset objects
X_train, X_test, y_train, y_test = train_test_split(data['Commentary'], labels_df, test_size=0.25, random_state=42)

train_dataset = Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.values.tolist()})
test_dataset = Dataset.from_dict({'text': X_test.tolist(), 'label': y_test.values.tolist()})

# Tokenization function
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_format(batch):
    tokenized_input = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    labels = torch.tensor(batch['label'], dtype=torch.float32)  # Ensure labels are float tensors for BCEWithLogitsLoss
 # Long tensor for multi-class labels
    return {**tokenized_input, 'labels': labels}


train_dataset = train_dataset.map(tokenize_and_format, batched=True)
test_dataset = test_dataset.map(tokenize_and_format, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/1638 [00:00<?, ? examples/s]

Map:   0%|          | 0/546 [00:00<?, ? examples/s]

In [16]:


# Load pre-trained model configuration and modify it
config = BertConfig.from_pretrained(
    'bert-base-cased',
    num_labels=len(label_list),
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.3,  
    attention_probs_dropout_prob=0.3,  
    num_hidden_layers=12  
)

# Load the modified BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-cased', config=config).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Custom loss function with label smoothing
class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing

    def forward(self, outputs, labels):
        # Apply label smoothing
        labels = labels * (1 - self.smoothing) + 0.5 * self.smoothing
        loss_fct = nn.BCEWithLogitsLoss()
        return loss_fct(outputs, labels)

label_smoothing_loss = LabelSmoothingLoss(smoothing=0.1)

# Define Custom Loss Function in Trainer
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.optimizer = None

    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.args.learning_rate)
        return self.optimizer

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = label_smoothing_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Define the compute_metrics function
def compute_metrics(p):
    preds = torch.sigmoid(torch.tensor(p.predictions)).cpu().numpy()
    preds = (preds > 0.5).astype(int)  
    labels = p.label_ids
    accuracy = (preds == labels).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=1)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    save_total_limit=3,
    fp16=True,
    logging_steps=10,
)

# Initialize the trainer with custom loss
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.6024,0.583469,0.729243,0.795513,0.346586,0.410582
1,0.5726,0.573997,0.732295,0.825429,0.310285,0.40891
2,0.5506,0.557105,0.750305,0.78603,0.39153,0.498932
4,0.5288,0.535247,0.765568,0.785032,0.450303,0.549894
5,0.5199,0.52127,0.782967,0.776837,0.528954,0.610379
6,0.5071,0.515042,0.786325,0.78607,0.531547,0.617523
8,0.498,0.506186,0.792125,0.821384,0.520311,0.620068
9,0.5003,0.516038,0.779304,0.84343,0.455488,0.57911
10,0.4882,0.501854,0.790904,0.829144,0.508211,0.614964
12,0.4909,0.491531,0.803114,0.80833,0.570441,0.649272


TrainOutput(global_step=765, training_loss=0.5195182541616602, metrics={'train_runtime': 465.9935, 'train_samples_per_second': 52.726, 'train_steps_per_second': 1.642, 'total_flos': 1608455646038016.0, 'train_loss': 0.5195182541616602, 'epoch': 14.926829268292684})

In [19]:
trainer.evaluate()

{'eval_loss': 0.4901435971260071,
 'eval_accuracy': 0.8073870573870574,
 'eval_precision': 0.8217118512667938,
 'eval_recall': 0.5713050993949871,
 'eval_f1': 0.6565025560188652,
 'eval_runtime': 2.7432,
 'eval_samples_per_second': 199.04,
 'eval_steps_per_second': 25.153,
 'epoch': 14.926829268292684}

In [68]:
import re
import torch
from tqdm import tqdm


sia = SentimentIntensityAnalyzer()

# new data
new_texts = [
'''a big way to end it if ever there was a dream australian open finals match-up the 2017 men's championship decider was 
surely it few could believe the ageing Roger Federer and Rafael Nadal had each won six matches to arrive at the big dance especially
after injury interrupted seasons in 2016. Nadal had survived five setters against Alexander Zverev and Grigor Dimitrov to reach the final
while federer had to see off ki Nishikori and Stan Vavrinka to earn his spot it was the spaniard who led thehead-to-head 23-11 but federer
had the better record on hard courts with four ao crowns already under his belt we joined the action with nadal having opened up a three-love
lead in the second set but they're starting to hit his strap she's making that shot in the first set wasn't he[Applause]the 15 from that side got to 
keep an eye on that shot '''

    
]

# Function to preprocess and tokenize the new texts
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def preprocess_and_tokenize(texts):
    processed_texts = [preprocess_text(text) for text in texts]
    tokenized_inputs = tokenizer(
        processed_texts, padding=True, truncation=True, max_length=128, return_tensors="pt"
    )
    return tokenized_inputs

# Regex pattern to match anomalous team names 
team_patterns = re.compile(r'\b(?:[A-Z][a-z]+(?:\s[A-Z][a-z]+))\s(?:\d+ers)\b', re.IGNORECASE)

# Function to extract entities (players and teams) 
def extract_entities(text):
    doc = nlp(text)
    players = set()
    teams = set()

    # List of articles to remove
    articles = {"the", "a", "an", "of", "for"}

    # Extract standard entities using NLP
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            players.add(ent.text)
        elif ent.label_ in {"ORG", "GPE"}:
           
            filtered_team_name = " ".join([token for token in ent.text.split() if token.lower() not in articles])
            teams.add(filtered_team_name)

    # Apply custom pattern matching for team names 
    matches = team_patterns.findall(text)
    for match in matches:
        # Remove any articles before the matched team names
        match_tokens = match.split()
        filtered_match = " ".join([token for token in match_tokens if token.lower() not in articles])
        teams.add(filtered_match.strip())

    return list(players), list(teams)

# Function to perform sentiment analysis on text
def get_sentiment(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

# Updated function to determine the sport based only on contextual keywords
def extract_sports(text):
    text = text.lower()  # Convert text to lowercase
    sports_priority = []
    contextual_words_detected = {}  

    # Extract sports based on contextual keywords
    for final_sport, words in contextual_words.items():
        matching_words = [word for word in words if word in text]
        if len(matching_words) > 1: 
            sports_priority.append(final_sport)
            contextual_words_detected[final_sport] = matching_words
    
    
    if sports_priority:
        return sports_priority[0]
    else:
        return "Unknown"



new_inputs = preprocess_and_tokenize(new_texts).to(device)


model.eval()

# Disable gradient calculations for inference
with torch.no_grad():
    outputs = model(**new_inputs)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits).cpu().numpy()


predictions = []

for i, text in tqdm(enumerate(new_texts), total=len(new_texts), desc="Processing texts"):
    players, teams = extract_entities(text)
    sentiment = get_sentiment(text)
    label_probs = {}
    predicted_labels = []

    for j, label in enumerate(label_list):
        custom_prob = probabilities[0][j] 
        label_probs[label] = custom_prob 
        # Determine predicted labels based on a threshold (e.g., 0.5)
        if custom_prob > 0.5:
            predicted_labels.append(label)

    # Sort the label probabilities from highest to lowest
    sorted_label_probs = {k: v for k, v in sorted(label_probs.items(), key=lambda item: item[1], reverse=True)}

    sport = extract_sports(text)  

    predictions.append({
        "Text": text,
        "Players": players,
        "Teams": teams,
        "Sentiment": sentiment,
        "Associated Sport": sport,
        "Predicted Labels": predicted_labels,
        "Label Probabilities": sorted_label_probs
    })
    

# Display the predictions along with probabilities and associated sports
for prediction in predictions:
    print(f"Text: {prediction['Text']}")
    print("\n")
    print(f"Players: {', '.join(prediction['Players'])}")
    print(f"Teams: {', '.join(prediction['Teams'])}")
    print(f"Sentiment: {'Positive' if prediction['Sentiment'] > 0 else 'Negative' if prediction['Sentiment'] < 0 else 'Neutral'}")
    print(f"Associated Sport: {prediction['Associated Sport']}")
    print(f"Predicted Labels: {', '.join(prediction['Predicted Labels'])}")
    print("\n")
    print("Label Probabilities:")
    for label, prob in prediction["Label Probabilities"].items():
        print(f"  {label}: {prob:.4f}")
    print("\n")


Processing texts: 100%|██████████| 1/1 [00:00<00:00, 16.38it/s]

Text: a big way to end it if ever there was a dream australian open finals match-up the 2017 men's championship decider was 
surely it few could believe the ageing Roger Federer and Rafael Nadal had each won six matches to arrive at the big dance especially
after injury interrupted seasons in 2016. Nadal had survived five setters against Alexander Zverev and Grigor Dimitrov to reach the final
while federer had to see off ki Nishikori and Stan Vavrinka to earn his spot it was the spaniard who led thehead-to-head 23-11 but federer
had the better record on hard courts with four ao crowns already under his belt we joined the action with nadal having opened up a three-love
lead in the second set but they're starting to hit his strap she's making that shot in the first set wasn't he[Applause]the 15 from that side got to 
keep an eye on that shot 


Players: Grigor Dimitrov, Roger Federer, Stan Vavrinka, Rafael Nadal, Alexander Zverev
Teams: 
Sentiment: Positive
Associated Sport: tennis
Predi




In [None]:
#general commentary

'''Smith is off to a strong start, leading the pack as they approach the first bend. He’s looking confident, 
maintaining a good rhythm over the hurdles. But wait—he seems to be drifting out of his lane! 
Smith is losing track! Oh no, he's clipped the hurdle! He’s stumbled, and now he's losing his balance! 
This is disastrous! The other runners are speeding past him. What a heartbreaking turn of events for Smith—he was in such great form, 
but one small mistake has cost him the race'''

'''With just under two minutes left in the fourth quarter, the Philadelphia 76ers are down by three. 
Embiid grabs the rebound off a missed shot and quickly outlets to Harden. Harden moves up the court, orchestrating the offense.
He dribbles around the perimeter, drawing in the defense with a quick crossover. There’s a pick set by Tucker, giving Harden the space he needs. 
He drives into the lane, but at the last second, dishes a perfect no-look pass to Maxey in the corner. Maxey sets his feet and fires a three... 
It's good! What a brilliant playmaking sequence by Harden! The Philadelphia 76ers tie the game, and the crowd is roaring! 
This is why Harden is considered one of the best playmakers in the game.'''

'''It's the second quarter of the game between the Washington Redskins and the New York Giants. Joe Theismann takes the snap,
looking for an open receiver. Here comes Taylor—he breaks through the line, and... oh no! A devastating hit from Lawrence Taylor! 
Theismann is down! He’s not getting up. The replay shows it all—Taylor came in hard, and Theismann’s leg is caught awkwardly under 
the tackle. That leg just bent in a way it shouldn’t. The medical staff is rushing onto the field, and the players are visibly shaken.
This is a serious injury. Theismann is being placed on a stretcher; he looks to be in excruciating pain. This could be career-ending. 
The crowd is silent, and even Taylor is signaling for the medical team. What a tragic moment for Theismann and the Redskins—an iconic 
player taken down by a horrific injury. '''

'''It's third and short for the Browns, and they’re looking to convert. The ball is snapped, and here comes Harrison, 
blitzing off the edge with incredible speed! The quarterback hands it off to the running back, but Harrison explodes through the line—BOOM!
What a tackle! He drives the runner back with a bone-crunching hit, stopping him dead in his tracks! 
You can hear the impact from the stands—Harrison's timing and power on that tackle were absolutely perfect. 
The ball carrier is down, and he looks shaken up after that hit. The Steelers’ defense comes up big on third down, 
thanks to a monstrous tackle by James Harrison! '''


In [None]:
#live commentary transcripts
'''James catches puts up a three won't gorebound Bosh back out to Allen history point of the thing tie game with five seconds
remaining Spurs do not have a timeout Parker to Neal four to shoot Arkansas who is ready to dunk for anyone so you can kind of see 
how it's affecting Tony Parker oh no we just saw me play them Cowboy Rose misses another.'''

'''a big way to end it if ever there was a dream australian open finals match-up the 2017 men's championship decider was 
surely it few could believe the ageing Roger Federer and Rafael Nadal had each won six matches to arrive at the big dance especially
after injury interrupted seasons in 2016. Nadal had survived five setters against Alexander Zverev and Grigor Dimitrov to reach the final
while federer had to see off ki Nishikori and Stan Vavrinka to earn his spot it was the spaniard who led thehead-to-head 23-11 but federer
had the better record on hard courts with four ao crowns already under his belt we joined the action with nadal having opened up a three-love
lead in the second set but they're starting to hit his strap she's making that shot in the first set wasn't he[Applause]the 15 from that side got to 
keep an eye on that shot '''

'''look we're looking to sweep you guys you wanted us you were crying out that you by passed the the harder team in my roast came down 
Brennan's left foot see him holding on to his knee holding on to his knee and down he was flying and he came down wrong on the left foot now 
whether it was an ankle or knee I do not know Gallants out there all those tweets running in this with the injury we just
talked about this teams he is missed with an assortment of injuries and now holding a knee late in a game that it's already decided for
all intents and purposes I'm sure everyone around the country is going to say wow why was he in it's in the game shall he comes down
on the left leg keep an eye on the leftleg there yep it was winning when he was planted that's when whatever happened to happened it's
here yes before he comes down it's the plant right there on that left leg if there was some give on that knee '''

In [None]:
# bigger text
'''he's charged for the single no there's always drama now they'll go for the single now she got that one's got a 100 or 
hashe's got to wait to see if you're not cool he made his ground he thinks he has[Music]and that's how he plays as Shikhar Dhawan
hoyt's one on to the leg side it's atough catch but easily taken on theboundarythere you gofor the momenthe's got 51 day international 50s
dead straight absolutely dead straight he's got x that's out of the middle doesn't get it [Applause] that's a good shot that is just full
of class [Applause] that's huge absolutely huge conceded in a world up well that is aamazing court and bold one-handed not in his arc 
doesn't get the power ms though he hits the toe of the bat that's very good this time over covers he finds the fielder Virat Kohli has
remained off strike for a long period of time[Applause]finds the get Raul that is brilliant what a magnificent tribals he's faced
 for 11 runsand India they bring up 350 Australia will need 353 for victory Australia of course defending champions and this is what 
 they need to win brilliant shots brilliant from Finch[Applause]that sounded good off the bat they'd be looking for two but uh oh
 we just about get it just about get it[Applause]that's it hard stayed on the back foot the ball was slow he's found the gap for four 
 outside edge the third man fielder doesn't get it[Applause]well that is probably one of the slowest 50s maybe the slowest ever that's 
 gone the distance[Applause]this time over midwicket that's exactly what i was saying it's against the breeze and Warner finds the man 
 at deep mid wicket it's a nice shot there's nobody behind[Applause]that's going to be four that brings up the 50 for this and to man 
 back like he did and that's as log sweep and that's going all the way they call him to deliverand boy it delivers every time oh that 
 should be gone oh god this one coming back in as well he's given a chance some running to do it's a good take it's safe that's a good
 area to hit his neck down the ground and straight ball spinning back into Kerry as he picked him out he has he has it'sa long boundary 
 carly agent gone diving forward Dhoni[Applause] Zampa will be the final wicket on the last ball of the game and that emotive man the 
 captain of India tells us the story it's a win to india by 36 runs you'''


'''Patrick Mahomes this after the touchdown hugs chiefs able to exhale power in light district back in kansas city going crazy with 
those great chiefs and sports fans[Applause]still two more time-outs left for the 49ers but down by 11. Garoppolo gets hit the ball 
comes outlooked like an incomplete pass but theylet them play on pressure by Frank Clark who just got the sack previously and it looked 
like an incomplete pass[Applause]i sure thought it was let's take a look at it it's good that yeah so yeah it's a forward pass a good 
jobby these officials letting it play out and garoppolo just trying to to buy sometime and let those guys get down the field where
he can try to make a play but you know you mentioned Andy Reed Joe, Andone of the most respected and most liked guys in all of 
football and really the history of the game there's so many people that have been touched by him and what he has meant to him i know his 
players love him he loved him in philadelphia they loved him when he was coaching at green bay and they certainly love him in kansas city
and he turned this program around second down in the very first year i know there's a lot of people really happyfor him to short of some 
miracle happening to come away with a super bowl win they did make the announcement that that last play was an incomplete pass not a fumble 
that kansas city chiefs super bowl four winning team 10 members of that team ended up in the hall of fame 17 total from the chiefs and 
the vikings that met at tulane stadium but how many players have come and gone great players great coaches for the chiefs that have not
been able to taste this moment this is pit Kendall Fuller up to get it and that will end this game with under a minute to gohey
patrick mahomes and the kansas city chiefs now with three straight post-season wins after trailing by 10 or more points[Applause]
and Andy Reid gets to celebrate with his team timeout taken by Kyle Shanahan of the 49ers this moment belongs to kansas city the chiefs 
andy reid the hunt familybut  how about the job done on the other sideline a team that won four games last year the number one seed 
getting to the superbowl and coming up shortand kyle shanahan's third year as the head man with san francisco these are tough 
endings when you're notable to win the last game of the season you've had such a great year to get to this point this was a great 
season by san francisco a lot to be proud of but it will be hard for them to look at it that way after this game it'll take some time
but for the kansas city chiefs hats off to them and the man known as big red is going to be a super bowl champ and big red just called
his stud third year quarterback patrick mahomes a reigning mvp over to the sideline great quarterbacks win big games and
patrick mahomes was bottled upmost of the night but came alive latemade plays on the move and he and these chiefs will walk out of here 
winners of superbowl 54.'''