In [None]:
import multiprocessing as mp
import os
import re
import sys
import time
from multiprocessing import Manager

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from emoji import demojize
from flair.data import Sentence
from flair.models import TextClassifier
from joblib import Parallel, delayed
from tabulate import tabulate
from tqdm import tqdm


# Fetch the pre-trained English sentiment model once for the notebook's main
# process and keep it on the CPU; the load time is reported for reference.
print("Loading Flair model...")
start_time = time.time()
classifier = TextClassifier.load('en-sentiment')
classifier.to('cpu')  # Explicitly use CPU
_model_load_seconds = time.time() - start_time
print(f"Model loaded in {_model_load_seconds:.2f} seconds.")

In [None]:
# Read only the columns the analysis needs. comment_id is kept as text and
# comment_score as float64 so that missing scores survive the load as NaN.
start_time = time.time()
usecols = [
    'comment_id',
    'comment_text',
    'comment_author',
    'comment_time',
    'comment_score',
    'match_time',
    'matchday',
    'home_team',
    'away_team',
    'winner',
]
column_dtypes = {'comment_id': 'object', 'comment_score': 'float64'}
df = pd.read_csv('comments_data.csv', usecols=usecols, dtype=column_dtypes)

In [None]:
# Quick look at comment_score before converting it.
print("Inspecting comment_score column:")
print(df['comment_score'].head(10))  # Show first 10 values
print("Data type:", df['comment_score'].dtype)
print("Number of non-numeric values (NaN):", df['comment_score'].isna().sum())
print("Unique values in comment_score:", df['comment_score'].unique()[:20])  # Show first 20 unique values

# Convert comment_score to a 32-bit integer column after loading.
# A plain 'int32' cannot hold NaN, and astype(..., errors='ignore') would then
# silently leave the column as float64. The nullable 'Int32' dtype stores
# missing scores as <NA>, so the conversion always takes effect.
# round() guards against non-integral floats, which astype('Int32') rejects.
df['comment_score'] = (
    pd.to_numeric(df['comment_score'], errors='coerce')
    .round()
    .astype('Int32')
)
print(f"Loaded dataset with {len(df)} rows in {time.time() - start_time:.2f} seconds.")

In [None]:
# ASCII emoticons that demojize leaves untouched, mapped to emoji-style
# ':name:' descriptions. Entries are grouped by the description they share;
# insertion order is smiley, frowning, then angry.
custom_emote_map = {
    **dict.fromkeys((':-)', ':)'), ':smiley:'),
    **dict.fromkeys((':-(', ':('), ':frowning_face:'),
    '>:(': ':angry_face:',
    # Add more as needed based on your dataset
}

def preprocess_emotes_batch(comments):
    """Convert emoticons and emojis in a Series of comments to ':name:' text.

    Parameters
    ----------
    comments : pandas.Series
        Raw comment strings; missing values are allowed.

    Returns
    -------
    pandas.Series
        Same index as the input. ASCII emoticons listed in
        ``custom_emote_map`` are replaced by their description and emoji
        characters are replaced via ``demojize``. Entries that are not
        strings come back as NaN (a consequence of the ``.str`` accessor).
    """
    if custom_emote_map:
        # One regex pass with the longest emoticons first: '>:(' is consumed
        # whole instead of being split by its suffix ':(', and replacement
        # text is never re-scanned by a later emoticon.
        emotes_longest_first = sorted(custom_emote_map, key=len, reverse=True)
        emote_pattern = re.compile('|'.join(re.escape(e) for e in emotes_longest_first))
        comments = comments.str.replace(
            emote_pattern,
            lambda match: custom_emote_map[match.group(0)],
            regex=True,
        )
    # Emoticons are handled before demojize so that a ':name:' code followed
    # by ')' or '(' is not mistaken for a ':)' / ':(' emoticon.
    return comments.apply(lambda x: demojize(x) if isinstance(x, str) else x)

# Function to get sentiment using Flair with three classes, emoji preprocessing, and slang override
def get_flair_sentiment(text, neutral_threshold=0.6):
    """Classify one comment as POSITIVE, NEGATIVE or NEUTRAL.

    Parameters
    ----------
    text : str
        Raw comment. Missing values, non-strings and '[deleted]' are skipped.
    neutral_threshold : float, default 0.6
        Predictions whose confidence is below this value are reported as
        "NEUTRAL".

    Returns
    -------
    tuple
        ``(sentiment, score)``; ``(None, None)`` when the text is skipped or
        the prediction fails.
    """
    # isinstance also covers None/NaN and protects the .strip() call from
    # non-string cells such as numbers.
    if not isinstance(text, str) or text.strip() == '[deleted]':
        return None, None
    # Preprocess text to convert emotes (single text version for consistency)
    processed_text = preprocess_emotes_batch(pd.Series([text]))[0]
    sentence = Sentence(processed_text)
    try:
        classifier.predict([sentence])  # Batch prediction with single sentence
        label = sentence.labels[0]
        score = label.score
        sentiment = label.value
    except Exception as e:
        print(f"Error predicting sentiment for '{text}': {e}")
        return None, None

    # Override for known negative slang. Whole-word matching is required:
    # a plain substring test would fire on "pass", "class" or "assist"
    # because they contain "ass".
    negative_slang = ['ass', 'trash', 'garbage', 'shit', 'awful', 'terrible', 'horrible']
    slang_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(w) for w in negative_slang) + r')\b')
    if sentiment == "POSITIVE" and slang_pattern.search(processed_text.lower()):
        sentiment = "NEGATIVE"

    # Infer NEUTRAL if confidence is below threshold
    if score < neutral_threshold:
        return "NEUTRAL", score
    return sentiment, score

# Function to analyze a newly typed sentence with Flair
def analyze_new_sentence(text):
    """Print the sentiment analysis of a single sentence and return it.

    Returns the ``(sentiment, score)`` pair produced by
    ``get_flair_sentiment``; both are None when the text was skipped or the
    prediction failed.
    """
    sentiment, score = get_flair_sentiment(text)
    print(f"\nAnalysis of new sentence: '{text}'")
    print(f"Processed text: '{preprocess_emotes_batch(pd.Series([text]))[0]}'")
    print(f"Sentiment: {sentiment}")
    # score is None for skipped/failed inputs; ':.4f' would raise TypeError.
    if score is None:
        print("Confidence: n/a")
    else:
        print(f"Confidence: {score:.4f}")
    return sentiment, score

In [None]:
# Pool initializer: gives each worker process its own classifier
def init_worker():
    """Load the Flair sentiment model on CPU and publish it as the global ``classifier``."""
    global classifier
    worker_model = TextClassifier.load('en-sentiment')
    worker_model.to('cpu')
    classifier = worker_model

# Function to process a single batch with Flair batch prediction
def process_batch(batch, neutral_threshold=0.6):
    """Score every comment in one DataFrame batch with the Flair classifier.

    Parameters
    ----------
    batch : pandas.DataFrame
        Slice of the comments table. It must contain the columns copied into
        the result below ('comment_id', 'comment_text', 'comment_author',
        'comment_time', 'comment_score', 'match_time', 'matchday',
        'home_team', 'away_team', 'winner').
    neutral_threshold : float, default 0.6
        Predictions with a lower confidence are labelled "NEUTRAL".

    Returns
    -------
    pandas.DataFrame
        One row per input row, with 'sentiment' and 'confidence' added
        (None/NaN for missing, '[deleted]' or empty comments) and the
        emoji-normalised text in 'processed_text'. An empty DataFrame is
        returned if the batch fails, after printing the error.
    """
    global classifier  # Access the classifier initialized in the worker
    negative_slang = ['ass', 'trash', 'garbage', 'shit', 'awful', 'terrible', 'horrible']
    # Whole-word match: a substring test would flag "pass" or "class" via "ass".
    slang_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(w) for w in negative_slang) + r')\b')
    try:
        # Batches after the first keep the parent frame's index labels
        # (10000, 10001, ...). Everything below is positional, so renumber
        # from 0 to keep rows, texts and sentences aligned.
        batch = batch.reset_index(drop=True)
        # Preprocess all comments in the batch at once
        processed_texts = preprocess_emotes_batch(batch['comment_text'])

        # Build a Sentence only for rows worth scoring, keyed by row
        # position so predictions map back to the correct row.
        sentences_by_pos = {}
        for pos, (comment, processed) in enumerate(zip(batch['comment_text'], processed_texts)):
            if not isinstance(comment, str) or comment.strip() == '[deleted]':
                continue
            if not isinstance(processed, str) or not processed.strip():
                continue
            sentences_by_pos[pos] = Sentence(processed)

        if sentences_by_pos:
            classifier.predict(list(sentences_by_pos.values()))

        sentiments = [None] * len(batch)
        confidences = [None] * len(batch)
        for pos, sentence in sentences_by_pos.items():
            if not sentence.labels:  # no prediction was attached
                continue
            label = sentence.labels[0]
            sentiment, score = label.value, label.score
            if score < neutral_threshold:
                sentiment = "NEUTRAL"
            elif sentiment == "POSITIVE" and slang_pattern.search(processed_texts.iloc[pos].lower()):
                sentiment = "NEGATIVE"
            sentiments[pos] = sentiment
            confidences[pos] = score

        return pd.DataFrame({
            'comment_id': batch['comment_id'],
            'comment_text': batch['comment_text'],
            'sentiment': pd.Series(sentiments, dtype='object'),
            'confidence': pd.Series(confidences, dtype='float64'),
            'comment_author': batch['comment_author'],
            'comment_time': batch['comment_time'],
            'comment_score': batch['comment_score'],
            'match_time': batch['match_time'],
            'matchday': batch['matchday'],
            'home_team': batch['home_team'],
            'away_team': batch['away_team'],
            'winner': batch['winner'],
            # Kept so later steps can reuse the normalised text.
            'processed_text': processed_texts,
        })
    except Exception as e:
        # Best effort: report the failure and drop this batch rather than
        # aborting the whole run.
        print(f"Error processing batch: {e}")
        return pd.DataFrame()
    
# Parallel processing of batches using multiprocessing
# NOTE(review): under the 'spawn' start method (Windows/macOS default),
# functions defined in a notebook cell may not be importable by workers — confirm on the target platform.
print("Processing dataset with multiprocessing...")
start_time = time.time()
batch_size = 10000  # Optimized for memory and speed
batches = [df.iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
print(f"Number of batches: {len(batches)}")

# Every worker loads its own copy of the model in init_worker, so never start
# more workers than there are batches to process.
num_processes = max(1, min(mp.cpu_count(), len(batches)))

# Use multiprocessing Pool with worker initialization
with mp.Pool(processes=num_processes, initializer=init_worker) as pool:
    # imap yields results in submission order; tqdm reports progress as each
    # batch completes, so no manual counter or sleep is needed.
    results = list(tqdm(pool.imap(process_batch, batches),
                        total=len(batches), desc="Processing sentiment",
                        position=0, leave=True))

# Failed batches come back as empty frames, and pd.concat raises on an empty
# list, so only concatenate batches that produced rows.
non_empty_results = [result for result in results if not result.empty]
results_df = pd.concat(non_empty_results, ignore_index=True) if non_empty_results else pd.DataFrame()
print(f"Processing completed in {time.time() - start_time:.2f} seconds.")

In [None]:
def detect_teams_vectorized(comment_series, home_team_series, away_team_series, processed_series):
    """Find which teams each comment mentions.

    Parameters
    ----------
    comment_series : iterable
        Raw comments; a missing comment yields ``[None]``.
    home_team_series, away_team_series : iterable
        Home and away team names for the match each comment belongs to.
    processed_series : iterable
        Emoji-normalised comment text. If an entry is not a string the raw
        comment is used instead.

    Returns
    -------
    list of list
        One list per comment with the matched team names, or ``[None]`` when
        nothing matched. The match's own teams come first (matched on full
        name or the first word of the name), followed by any team whose
        keyword appears.

    Notes
    -----
    All matching is on whole words. Substring matching produced false
    positives such as 'cfc' inside 'lcfc' or 'city' inside 'capacity'. The
    four-character name prefix is no longer used as a keyword, because
    prefixes like 'ever' (Everton) and 'live' (Liverpool) are ordinary
    English words.
    """
    team_keywords = {
        'Arsenal': ['arsenal', 'afc', 'gooners'],
        'Aston Villa': ['aston villa', 'villa', 'avfc'],
        'Bournemouth': ['bournemouth', 'afcb', 'cherries'],
        'Brentford': ['brentford', 'bfc', 'bees'],
        'Brighton & Hove Albion': ['brighton', 'bha', 'seagulls'],
        'Chelsea': ['chelsea', 'cfc', 'blues'],
        'Crystal Palace': ['crystal palace', 'palace', 'cpfc'],
        'Everton': ['everton', 'efc', 'toffees'],
        'Fulham': ['fulham', 'ffc', 'cottagers'],
        'Ipswich Town': ['ipswich', 'itfc', 'tractor boys'],
        'Leicester City': ['leicester', 'lcfc', 'foxes'],
        'Liverpool': ['liverpool', 'lfc', 'reds'],
        'Manchester City': ['manchester city', 'city', 'mcfc'],
        'Manchester United': ['manchester united', 'united', 'man utd', 'mufc'],
        'Newcastle United': ['newcastle', 'nufc', 'magpies'],
        'Nottingham Forest': ['nottingham forest', 'forest', 'nffc'],
        'Southampton': ['southampton', 'saints', 'sfc'],
        'Tottenham Hotspur': ['tottenham', 'spurs', 'thfc'],
        'West Ham United': ['west ham', 'whu', 'hammers'],
        'Wolverhampton Wanderers': ['wolves', 'wwfc', 'wanderers']
    }

    def whole_word_pattern(terms):
        # Lookarounds instead of \b so terms containing '&' or spaces still
        # get a boundary check at both ends.
        alternatives = '|'.join(re.escape(t) for t in sorted(terms, key=len, reverse=True))
        return re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')

    # Compiled once per call instead of rebuilding the table for every row.
    keyword_patterns = {team: whole_word_pattern(kws) for team, kws in team_keywords.items()}
    fixture_patterns = {}  # team name -> pattern for its full name / first word

    def fixture_pattern(team):
        if team not in fixture_patterns:
            lowered = team.lower()
            fixture_patterns[team] = whole_word_pattern({lowered, lowered.split()[0]})
        return fixture_patterns[team]

    def detect_single(comment, home_team, away_team, processed_text):
        if pd.isna(comment):
            return [None]
        text = processed_text if isinstance(processed_text, str) else str(comment)
        text = text.lower()
        teams = []
        for fixture_team in (home_team, away_team):
            # Skip missing or blank team names instead of crashing on .lower().
            if not isinstance(fixture_team, str) or not fixture_team.strip():
                continue
            if fixture_team not in teams and fixture_pattern(fixture_team).search(text):
                teams.append(fixture_team)

        for team, pattern in keyword_patterns.items():
            if team not in teams and pattern.search(text):
                teams.append(team)
        return teams if teams else [None]

    return [detect_single(comment, home_team, away_team, processed)
            for comment, home_team, away_team, processed in zip(comment_series, home_team_series, away_team_series, processed_series)]

# Apply team detection
print("Detecting teams...")
start_time = time.time()
# results_df only carries 'processed_text' if process_batch emitted it;
# rebuild it from comment_text otherwise so the lookup below cannot KeyError.
if 'processed_text' not in results_df.columns:
    results_df['processed_text'] = preprocess_emotes_batch(results_df['comment_text'])
results_df['teams_mentioned'] = detect_teams_vectorized(results_df['comment_text'], results_df['home_team'], results_df['away_team'], results_df['processed_text'])
# One row per (comment, mentioned team); comments with no team keep a single row.
results_exploded = results_df.explode('teams_mentioned')
print(f"Team detection completed in {time.time() - start_time:.2f} seconds.")

# Time-based preprocessing (parallelizable with joblib for large datasets)
print("Performing time-based preprocessing...")
start_time = time.time()
def process_time_chunk(chunk):
    """Return a copy of ``chunk`` with parsed timestamps and minutes since kick-off.

    Adds three columns:
      * 'comment_datetime' - 'comment_time' parsed as seconds since the epoch
      * 'match_datetime'   - 'match_time' parsed as a datetime (NaT if unparseable)
      * 'time_diff'        - comment time minus match time, in minutes

    The input frame is left untouched: callers pass slices of a larger frame,
    and assigning columns on a slice triggers SettingWithCopyWarning.
    """
    # NOTE(review): this subtraction assumes both columns end up with the same
    # timezone awareness (epoch seconds parse as naive) — confirm match_time
    # carries no UTC offset.
    return chunk.assign(
        comment_datetime=lambda frame: pd.to_datetime(frame['comment_time'], unit='s'),
        match_datetime=lambda frame: pd.to_datetime(frame['match_time'], errors='coerce'),
        time_diff=lambda frame: (frame['comment_datetime'] - frame['match_datetime']).dt.total_seconds() / 60,
    )

chunk_size = 10000
chunks = [results_exploded.iloc[i:i + chunk_size] for i in range(0, len(results_exploded), chunk_size)]
# num_processes was never defined before this point; use one job per CPU core.
num_processes = mp.cpu_count()
with Parallel(n_jobs=num_processes) as parallel:
    results_chunks = parallel(delayed(process_time_chunk)(chunk) for chunk in chunks)
if results_chunks:
    results_exploded = pd.concat(results_chunks, ignore_index=True)
else:
    # pd.concat raises on an empty list; process the (empty) frame directly
    # so the derived columns still exist.
    results_exploded = process_time_chunk(results_exploded).reset_index(drop=True)
print(f"Time preprocessing completed in {time.time() - start_time:.2f} seconds.")

In [None]:
def analyze_by_team(df):
    """Report sentiment counts and mean confidence for the ten most-mentioned teams.

    Prints a grid table of sentiment counts per team, shows a grouped bar
    chart of the same data, then prints the average confidence per team.
    Expects 'teams_mentioned', 'sentiment' and 'confidence' columns.
    """
    print("\nSentiment by Team (Top 10 Mentioned):")
    counts_per_team = df.groupby('teams_mentioned')['sentiment'].value_counts().unstack(fill_value=0)
    top_teams = counts_per_team.sum(axis=1).nlargest(10).index
    team_sentiment = counts_per_team.loc[top_teams].reset_index()
    print(tabulate(team_sentiment, headers='keys', tablefmt='grid', showindex=False))

    # Grouped bars: one cluster per team, one bar per sentiment label.
    ax = team_sentiment.set_index('teams_mentioned').plot.bar(figsize=(10, 6), colormap='viridis')
    ax.set_title('Sentiment Distribution by Top 10 Teams')
    ax.set_xlabel('Team')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    ax.legend(title='Sentiment')
    plt.tight_layout()
    plt.show()

    print("\nAverage Confidence by Team (Top 10):")
    mean_confidence = df.groupby('teams_mentioned')['confidence'].mean().dropna()
    confidence_table = mean_confidence.loc[top_teams].reset_index()
    confidence_table['confidence'] = confidence_table['confidence'].round(4)
    print(tabulate(confidence_table, headers=['Team', 'Average Confidence'], tablefmt='grid', showindex=False))

def analyze_by_matchday(df):
    """Print and plot sentiment counts for every matchday.

    Expects 'matchday' and 'sentiment' columns; shows a grid table followed
    by a grouped bar chart.
    """
    print("\nSentiment by Matchday:")
    per_matchday = df.groupby('matchday')['sentiment'].value_counts()
    matchday_sentiment = per_matchday.unstack(fill_value=0).reset_index()
    print(tabulate(matchday_sentiment, headers='keys', tablefmt='grid', showindex=False))

    ax = matchday_sentiment.set_index('matchday').plot.bar(figsize=(10, 6), colormap='plasma')
    ax.set_title('Sentiment Distribution by Matchday')
    ax.set_xlabel('Matchday')
    ax.set_ylabel('Count')
    plt.xticks(rotation=0)
    ax.legend(title='Sentiment')
    plt.tight_layout()
    plt.show()

def analyze_temporal_trends(df):
    """Print and plot sentiment counts in 15-minute bins of time since match start.

    Adds a 'time_bin' column to ``df`` (left-closed bins covering 0 to 120
    minutes of 'time_diff'), then tabulates and charts sentiment per bin.
    """
    print("\nSentiment by Time Since Match Start (15-min intervals):")
    bin_edges = range(0, 121, 15)
    df['time_bin'] = pd.cut(df['time_diff'], bins=bin_edges, right=False)
    per_bin = df.groupby('time_bin')['sentiment'].value_counts()
    time_sentiment = per_bin.unstack(fill_value=0).reset_index()
    print(tabulate(time_sentiment, headers='keys', tablefmt='grid', showindex=False))

    ax = time_sentiment.set_index('time_bin').plot.bar(figsize=(10, 6), colormap='magma')
    ax.set_title('Sentiment Distribution by Time Since Match Start')
    ax.set_xlabel('Time Bin (minutes)')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    ax.legend(title='Sentiment')
    plt.tight_layout()
    plt.show()

def analyze_winner_effect(df):
    """Print and plot sentiment counts grouped by the match winner.

    Expects 'winner' and 'sentiment' columns; shows a grid table followed by
    a grouped bar chart.
    """
    print("\nSentiment by Match Winner:")
    per_winner = df.groupby('winner')['sentiment'].value_counts()
    winner_sentiment = per_winner.unstack(fill_value=0).reset_index()
    print(tabulate(winner_sentiment, headers='keys', tablefmt='grid', showindex=False))

    ax = winner_sentiment.set_index('winner').plot.bar(figsize=(10, 6), colormap='inferno')
    ax.set_title('Sentiment Distribution by Match Winner')
    ax.set_xlabel('Winner')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    ax.legend(title='Sentiment')
    plt.tight_layout()
    plt.show()

def analyze_by_scenarios(df):
    """Report sentiment for four scenarios: goal proximity, halftime, outcome, matchday.

    Adds two boolean columns to ``df``: 'near_goal' (within 5 minutes of the
    assumed goal times 15/30/60/75/90) and 'is_halftime' ('time_diff' between
    45 and 60 inclusive). Each scenario prints a grid table and shows a chart.
    """
    sentiment_labels = ['POSITIVE', 'NEGATIVE', 'NEUTRAL']

    def report_flagged(flag_column, pie_title):
        # Count sentiments among rows where the flag is set, in a fixed label
        # order so the pie colours always line up with the labels.
        flagged_rows = df[df[flag_column] == True]
        table = flagged_rows['sentiment'].value_counts().reindex(sentiment_labels, fill_value=0).reset_index()
        table.columns = ['Sentiment', 'Count']
        print(tabulate(table, headers='keys', tablefmt='grid', showindex=False))

        plt.figure(figsize=(6, 6))
        plt.pie(table['Count'], labels=table['Sentiment'], autopct='%1.1f%%', colors=['green', 'red', 'gray'])
        plt.title(pie_title)
        plt.show()

    def report_grouped(group_column, colormap, chart_title, x_label, tick_rotation):
        # Sentiment counts per group value, as a table and a grouped bar chart.
        table = df.groupby(group_column)['sentiment'].value_counts().unstack(fill_value=0).reset_index()
        print(tabulate(table, headers='keys', tablefmt='grid', showindex=False))

        ax = table.set_index(group_column).plot.bar(figsize=(10, 6), colormap=colormap)
        ax.set_title(chart_title)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Count')
        plt.xticks(rotation=tick_rotation)
        ax.legend(title='Sentiment')
        plt.tight_layout()
        plt.show()

    print("\nSentiment Distribution by Scenarios:")

    # Scenario 1: Goal Proximity
    goal_times = [15, 30, 60, 75, 90]
    goal_window = 5

    def is_near_goal(minute):
        return any(abs(minute - goal_minute) <= goal_window for goal_minute in goal_times)

    df['near_goal'] = df['time_diff'].apply(is_near_goal)
    print("\nScenario 1: Comments Near Likely Goal Times (within 5 minutes):")
    report_flagged('near_goal', 'Sentiment Distribution Near Goal Times')

    # Scenario 2: Halftime
    df['is_halftime'] = df['time_diff'].between(45, 60)
    print("\nScenario 2: Comments During Halftime (45-60 minutes):")
    report_flagged('is_halftime', 'Sentiment Distribution During Halftime')

    # Scenario 3: Match Outcome
    print("\nScenario 3: Sentiment by Match Outcome (Winner):")
    report_grouped('winner', 'cividis', 'Sentiment Distribution by Match Winner', 'Winner', 45)

    # Scenario 4: Matchday Trends
    print("\nScenario 4: Sentiment by Matchday (Summary):")
    report_grouped('matchday', 'cool', 'Sentiment Distribution by Matchday', 'Matchday', 0)


In [None]:
# Execute analysis with tables and visualizations
print("Overall Sentiment Distribution:")
overall_sentiment = results_exploded['sentiment'].value_counts().reset_index()
overall_sentiment.columns = ['Sentiment', 'Count']
print(tabulate(overall_sentiment, headers='keys', tablefmt='grid', showindex=False))

# Visualization: Pie chart for overall sentiment
# value_counts orders rows by frequency, so a positional colour list could
# paint NEGATIVE green whenever it is the most common label. Look each
# colour up by label instead.
sentiment_colors = {'POSITIVE': 'green', 'NEGATIVE': 'red', 'NEUTRAL': 'gray'}
pie_colors = [sentiment_colors.get(label, 'lightgray') for label in overall_sentiment['Sentiment']]
plt.figure(figsize=(6, 6))
plt.pie(overall_sentiment['Count'], labels=overall_sentiment['Sentiment'], autopct='%1.1f%%', colors=pie_colors)
plt.title('Overall Sentiment Distribution')
plt.show()

analyze_by_team(results_exploded)
analyze_by_matchday(results_exploded)
analyze_temporal_trends(results_exploded)
analyze_winner_effect(results_exploded)
analyze_by_scenarios(results_exploded)

In [None]:
# Try the classifier on a hand-written example; the cell's last expression
# shows the returned (sentiment, confidence) pair.
new_sentence = (
    "Good pass Licha Brilliant from Amad in terms of the run, rounding the keeper, and finishing from a tight angle IDK what Ederson is doing (as usual) 😊"
)
analyze_new_sentence(new_sentence)