# Statistical Analysis

## Inference Speed


In [None]:
import pandas as pd
from transformers import pipeline
import time

# Benchmark: measure how many messages per second each sentiment model
# classifies when run over the same list of texts.

# Read CSV file
file_path = "https://raw.githubusercontent.com/Xintong1122/Decentraland_LLM/main/Data/message_with_sentiment.csv"
data = pd.read_csv(file_path)
# Rows with a missing 'Content' value are dropped before benchmarking.
data = data.dropna(subset=['Content'])

# Extract text data
content_data = data['Content'].tolist()

# Define sentiment analysis models to use
models = ["cardiffnlp/twitter-roberta-base-sentiment-latest",
          "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
          "cardiffnlp/twitter-xlm-roberta-base-sentiment"]

# Maps model name -> {'inference_speed': samples per second}
results = {}

for model_name in models:
    # The pipeline is constructed before the timer starts, so building it is
    # not counted in the measured interval.
    sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)

    # Calculate inference time.
    # time.perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals; time.time() follows the wall clock, which can jump
    # (e.g. NTP adjustments) and distort the measurement.
    start_time = time.perf_counter()
    predictions = sentiment_pipeline(content_data)
    end_time = time.perf_counter()

    # Calculate inference speed (samples processed per second)
    inference_speed = len(content_data) / (end_time - start_time)

    # Store results
    results[model_name] = {
        'inference_speed': inference_speed
    }

# Print results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Inference Speed: {metrics['inference_speed']} samples/sec")
    print()


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Model: cardiffnlp/twitter-roberta-base-sentiment-latest
Inference Speed: 6.207552047745031 samples/sec

Model: lxyuan/distilbert-base-multilingual-cased-sentiments-student
Inference Speed: 11.395455680685487 samples/sec

Model: cardiffnlp/twitter-xlm-roberta-base-sentiment
Inference Speed: 6.044647620063329 samples/sec



## Sentiment Classification Bar Chart

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the per-message sentiment predictions from GitHub
file_path = "https://raw.githubusercontent.com/Xintong1122/Decentraland_LLM/main/Data/message_with_sentiment.csv"
data = pd.read_csv(file_path)

# Sentiment categories (x-axis) and the models being compared (one bar each)
labels = ['positive', 'neutral', 'negative']
llms = ['LLM1', 'LLM2', 'LLM3']

# counts[label][llm] = number of rows whose '<llm>_Label' column equals label
counts = {
    label: {llm: (data[llm+'_Label'] == label).sum() for llm in llms}
    for label in labels
}

# Bar colour used for each model
bar_colors = {
    'LLM1': '#6d93cb',
    'LLM2': '#f3776d',
    'LLM3': '#57c3be',
}

# Build one bar trace per model; each bar shows its count as white text
traces = []
for llm in llms:
    per_label_counts = [counts[label][llm] for label in labels]
    traces.append(
        go.Bar(
            name=llm,
            x=labels,
            y=per_label_counts,
            text=[f'{count}' for count in per_label_counts],
            textposition='auto',
            textfont_color='white',
            marker_color=bar_colors[llm],
        )
    )

# Assemble the grouped bar chart
fig = go.Figure(data=traces)

# Layout: grouped bars, axis titles, fixed size, and spacing between bars
# belonging to the same group
fig.update_layout(
    title='Distribution of the Number of Sentiment Categories',
    xaxis_title='Sentiment Label',
    yaxis_title='Number',
    barmode='group',
    bargroupgap=0.1,
    width=550,
    height=500,
)

# Render the figure
fig.show()


## Sentiment Score Heatmap

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import make_colorscale

# Load the per-message sentiment predictions
file_path = "https://raw.githubusercontent.com/Xintong1122/Decentraland_LLM/main/Data/message_with_sentiment.csv"
data = pd.read_csv(file_path)

# Sentiment categories, model names, and the label/score column of each model
labels = ['positive', 'neutral', 'negative']
llms = ['LLM1', 'LLM2', 'LLM3']
llm_labels = [f'{llm}_Label' for llm in llms]
llm_scores = [f'{llm}_Score' for llm in llms]

# For every sentiment label, collect the mean score of each model over the
# rows that the model assigned to that label (one list entry per model).
mean_scores = {
    label: [
        data.loc[data[label_col] == label, score_col].astype(float).mean()
        for label_col, score_col in zip(llm_labels, llm_scores)
    ]
    for label in labels
}

# Wide table: rows are models, columns are sentiment labels.
# A model/label pair with no rows has a NaN mean, which is replaced by 0.
heatmap_data = pd.DataFrame(mean_scores, index=llms, columns=labels)
heatmap_data = heatmap_data.astype(float).fillna(0)

# Reshape to long form (one row per model/label pair) for Plotly Express
heatmap_data = (
    heatmap_data
    .rename_axis('LLM')
    .reset_index()
    .melt(id_vars=['LLM'], var_name='Label', value_name='Score')
)

# Three-stop colour scale: blue -> white -> orange
colorscale = make_colorscale(["#2256a7","#ffffff", "#f47b00"])

# Draw the heatmap; each cell is annotated with its value
fig = px.density_heatmap(
    heatmap_data,
    x='Label',
    y='LLM',
    z='Score',
    title='Average Sentiment Scores by LLM and Label',
    color_continuous_scale=colorscale,
    range_color=[0.4, 0.8],
    text_auto=True,
)

# Fixed figure size
fig.update_layout(autosize=False, width=550, height=500)

# Render the figure
fig.show()


## Sentiment Distribution Box Plot

In [None]:
import pandas as pd
import plotly.express as px

# Load the per-message sentiment predictions
file_path = "https://raw.githubusercontent.com/Xintong1122/Decentraland_LLM/main/Data/message_with_sentiment.csv"
data = pd.read_csv(file_path)

# Models being compared and the sentiment categories
llms = ['LLM1', 'LLM2', 'LLM3']
labels = ['positive', 'neutral', 'negative']

# Long-form records: one entry per (label, model, score), where the scores
# are taken from the rows that the model assigned to that label.
rows = [
    {'LLM': llm, 'Label': label, 'Score': score}
    for label in labels
    for llm in llms
    for score in data.loc[data[f"{llm}_Label"] == label, f"{llm}_Score"]
]

# Assemble the records into the frame that is plotted
df_box = pd.DataFrame(rows)

# One colour per model
custom_colors = ['#6d93cb', '#f3776d', '#57c3be']

# Box plot of scores per sentiment label, split by model, with every
# individual data point drawn alongside its box
fig = px.box(
    df_box,
    x='Label',
    y='Score',
    color='LLM',
    title='Sentiment Score Distribution',
    category_orders={'Label': ['positive', 'neutral', 'negative']},
    points='all',
    color_discrete_sequence=custom_colors,
)

# Axis/legend titles and fixed figure size
fig.update_layout(
    xaxis_title='Sentiment Label',
    yaxis_title='Score',
    legend_title='LLM',
    width=550,
    height=500,
)

# Render the figure
fig.show()


## Daily Average Sentiment Score Line Chart

In [None]:
import pandas as pd
import plotly.express as px

# Load the daily sentiment scores from GitHub
url = 'https://raw.githubusercontent.com/Xintong1122/Decentraland_LLM/main/Data/daily_sentiment_scores.csv'
df = pd.read_csv(url)

# Line colour for each model; the keys are also the short column names
line_colors = {
    'LLM1': '#6d93cb',
    'LLM2': '#f3776d',
    'LLM3': '#57c3be',
}

# Shorten '<model>_Sentiment_Score' column names to just '<model>'
df = df.rename(columns={f'{name}_Sentiment_Score': name for name in line_colors})

# One line per model, plotted against the 'Date' column
fig = px.line(
    df,
    x='Date',
    y=list(line_colors),
    title='Sentiment Scores over Time',
    labels={'value': 'Sentiment Score', 'variable': 'LLM Model'},
)

# Apply the chosen colour to the trace of each model
for name, color in line_colors.items():
    fig.update_traces(line={'color': color}, selector={'name': name})

# Fixed figure size
fig.update_layout(width=550, height=500)

# Render the figure
fig.show()
