In [52]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [53]:
# Load the sentiment scores, keep the paper id as a string so it can be used
# as a merge key, and drop the single extreme subjectivity value.
df_sentiment = (
    pd.read_csv('gpt_sentiment_analysis.csv')
    .assign(number=lambda scores: scores['number'].astype(str))
    .query("combine_subjectivity < 0.66")
)

In [54]:
# Word counts per review section, for GPT reviews (one sheet per tone) and
# for the human ("real") reviews.
df_normal = pd.read_excel('gpt_answer.xlsx', sheet_name='Sheet2')
df_kind = pd.read_excel('gpt_answer.xlsx', sheet_name='Sheet3')
df_harsh = pd.read_excel('gpt_answer.xlsx', sheet_name='Sheet4')
df_real = pd.read_csv('real_reviews.csv')[['number', 'review_id', 'summary', 'strengths_and_weaknesses', 'questions', 'limitations']]

word_frames = []
for gpt_sheet in (df_normal, df_kind, df_harsh):
    long_form = gpt_sheet.melt(id_vars=['confid'])
    # Column names look like "<tone>_<section>": split once on the first underscore.
    name_parts = long_form['variable'].str.split('_', n=1)
    word_frames.append(
        long_form.assign(
            tone=name_parts.str[0],
            section=name_parts.str[1],
            count=long_form['value'].str.split().str.len(),
            number=long_form['confid'].astype(str),
            # every GPT sheet is given a single review id of 1
            review_id=1,
        )
    )

real_long = df_real.melt(id_vars=['number', 'review_id'])
word_frames.append(
    real_long.assign(
        tone='real',
        section=real_long['variable'],
        count=real_long['value'].str.split().str.len(),
        number=real_long['number'].astype(str),
    )
)

df_word = pd.concat(word_frames)
# Cells with no text yield NaN counts; treat them as zero words.
df_word['count'] = df_word['count'].fillna(0)
df_word = df_word[['number', 'review_id', 'tone', 'section', 'count']]
# 'rating' and 'confidence' are excluded from the word counts.
df_word = df_word[~df_word['section'].isin(['rating', 'confidence'])]
df_word

Unnamed: 0,number,review_id,tone,section,count
20,3025,1,normal,summary,45.0
21,5769,1,normal,summary,63.0
22,6510,1,normal,summary,69.0
23,6640,1,normal,summary,66.0
24,7169,1,normal,summary,77.0
...,...,...,...,...,...
131,8594,3,real,limitations,51.0
132,6640,1,real,limitations,15.0
133,6640,2,real,limitations,1.0
134,6640,3,real,limitations,16.0


In [55]:
# Total word count per (paper, review, tone).
# Only the numeric 'count' column is summed: relying on the implicit
# numeric_only default of DataFrameGroupBy.sum raises a deprecation warning
# and, once the default changes, would also try to sum the string 'section'
# column.
df_word_total = (
    df_word
    .groupby(['number', 'review_id', 'tone'])['count']
    .sum()
    .reset_index()
)
df_word_total


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,number,review_id,tone,count
0,10162,1,harsh,251.0
1,10162,1,kind,199.0
2,10162,1,normal,185.0
3,10162,1,real,651.0
4,10162,2,real,426.0
...,...,...,...,...
59,9378,1,normal,174.0
60,9378,1,real,466.0
61,9378,2,real,454.0
62,9378,3,real,1090.0


In [56]:
# recommendation

In [57]:
# Recommendation results. Later cells read the 'number', 'human_recommendation',
# 'final_result' and 'gpt_<tone>' (normal / kind / harsh) columns of this table.
df_recom = pd.read_csv('recommendation_res.csv')

In [58]:
# Heatmap of human vs. GPT recommendation counts, one figure per GPT tone.
# Shared row / column order for every matrix.
sorter = ['accept', 'weak accept', 'reject']
for tone in ['normal', 'kind', 'harsh']:
    fig = go.Figure()
    gpt_col = f'gpt_{tone}'
    t = df_recom.groupby(['human_recommendation', gpt_col]).agg({'number': 'count'}).reset_index()
    t = t.pivot(index='human_recommendation', columns=gpt_col, values='number')
    # Put the expected labels first and keep any unexpected GPT label instead
    # of silently dropping it.
    column_order = sorter + [label for label in t.columns if label not in sorter]
    # Reindex rows and columns together, then fill: categories that never
    # occur become 0 (not NaN), and real "reject" counts are no longer
    # overwritten with 0.
    t = t.reindex(index=sorter, columns=column_order).fillna(0).astype(int)
    fig.add_trace(go.Heatmap(z=t.values, x=t.columns, y=t.index, colorscale='Viridis_r', zmin=0, zmax=20, text=t.values, texttemplate="%{text}", showscale=False))
    fig.update_layout(xaxis_title=f'GPT recommendation ({tone})', yaxis_title='Human recommendation', template='plotly_white', width=350, height=350)
    # reverse y axis so the first category is drawn at the top
    fig.update_yaxes(autorange="reversed")
    fig.show()

In [59]:
# Share of rows where the GPT recommendation equals the human one, per tone.
human_labels = df_recom['human_recommendation']
for tone in ['normal', 'kind', 'harsh']:
    agree = human_labels.eq(df_recom[f'gpt_{tone}'])
    print(f"Agreement rate ({tone}): {agree.mean()}")

Agreement rate (normal): 0.47058823529411764
Agreement rate (kind): 0.5882352941176471
Agreement rate (harsh): 0.35294117647058826


In [60]:
def _stacked_recommendation_bar(subset, title):
    """Draw a stacked bar chart of GPT recommendations per tone for one final result."""
    bar_fig = px.bar(subset, x='tone', y='number', color='gpt_recommendation', barmode='stack', text='number', title=title, template='plotly_white')
    bar_fig.update_layout(yaxis_title='Number of papers', xaxis_title='Tone', width=400, height=380)
    bar_fig.show()
    return bar_fig


# Number of distinct papers per (final result, GPT recommendation), per tone.
ts = []
for tone in ['normal', 'kind', 'harsh']:
    gpt_col = f'gpt_{tone}'
    per_tone = (
        df_recom.groupby(['final_result', gpt_col])
        .agg({'number': 'nunique'})
        .reset_index()
        .rename(columns={'final_result': 'human_recommendation', gpt_col: 'gpt_recommendation'})
    )
    per_tone['tone'] = tone
    ts.append(per_tone)

t = pd.concat(ts)

# plot by human recommendation
print(t)
data = t.query("human_recommendation == 'Accept'")
print(data)
fig = _stacked_recommendation_bar(data, 'Accept')

data = t.query("human_recommendation == 'Reject'")
fig = _stacked_recommendation_bar(data, 'Reject')

  human_recommendation gpt_recommendation  number    tone
0               Accept             accept       4  normal
1               Accept        weak accept       1  normal
2               Reject             accept       4  normal
3               Reject        weak accept       1  normal
0               Accept             accept       4    kind
1               Accept        weak accept       1    kind
2               Reject             accept       3    kind
3               Reject        weak accept       2    kind
0               Accept             accept       4   harsh
1               Accept        weak accept       1   harsh
2               Reject             accept       5   harsh
  human_recommendation gpt_recommendation  number    tone
0               Accept             accept       4  normal
1               Accept        weak accept       1  normal
0               Accept             accept       4    kind
1               Accept        weak accept       1    kind
0             

In [61]:
# overall subjectivity & polarity

In [62]:
# Scatter of the overall subjectivity / polarity score of every review,
# one x position per paper, coloured by tone.
for metric in ["subjectivity", "polarity"]:

    t = df_sentiment[["number", "review_id", "tone", f"combine_{metric}"]]
    # append word count
    t = t.merge(df_word_total, on=["number", "review_id", "tone"], how="left")
    # append final result
    t = t.merge(
        df_recom[["number", "final_result"]]
        .drop_duplicates()
        .astype({"number": "str"}),
        on=["number"],
        how="left",
    )

    # sort t by final result so papers are grouped by outcome
    t = t.sort_values(["final_result", "tone"])
    print(t.head())

    # One tick per paper (not per review row), labelled "<final result> <k>"
    # where k counts papers within each final result. Deriving the labels
    # from the data keeps tickvals and ticktext the same length.
    papers = t.drop_duplicates("number")
    outcome = papers["final_result"].astype(str)
    rank_in_outcome = papers.groupby(outcome).cumcount() + 1
    tick_text = (outcome + " " + rank_in_outcome.astype(str)).tolist()

    title = "Overall sentiment" if metric == "polarity" else "Overall subjectivity"
    fig = px.scatter(
        t,
        x="number",
        y=f"combine_{metric}",
        color="tone",
        symbol="tone",
        title=title,
    )
    fig.update_traces(marker=dict(size=12, opacity=0.7))

    fig.update_layout(
        template="simple_white",
        width=800,
        height=400,
        font=dict(size=12, color="black"),
        legend_title_text=None,
    )
    fig.update_xaxes(
        title=None,
        # Force a categorical axis in the sorted paper order: plotly may
        # otherwise infer a numeric axis from digit-only ids, and category
        # order would depend on which trace is drawn first.
        type="category",
        categoryorder="array",
        categoryarray=papers["number"].tolist(),
        tickvals=np.arange(len(tick_text)),
        ticktext=tick_text,
    )
    fig.show()

   number  review_id   tone  combine_subjectivity  count final_result
11   5769          1  harsh              0.535385  264.0       Accept
12   6510          1  harsh              0.540888  203.0       Accept
14   7169          1  harsh              0.458665  391.0       Accept
16   9378          1  harsh              0.497588  273.0       Accept
17  10162          1  harsh              0.438651  251.0       Accept


   number  review_id   tone  combine_polarity  count final_result
11   5769          1  harsh          0.129496  264.0       Accept
12   6510          1  harsh          0.221841  203.0       Accept
14   7169          1  harsh          0.065995  391.0       Accept
16   9378          1  harsh          0.210526  273.0       Accept
17  10162          1  harsh          0.133730  251.0       Accept


In [63]:
# Accumulator for the overall mixed-model estimates (one entry per metric/tone fit).
result = {key: [] for key in ("metric", "tone", "coef", "conf_int")}

In [64]:
# significance test
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Mixed model per GPT tone: overall subjectivity of GPT reviews (x = 1)
# against human reviews (x = 0), grouped by paper.
for gpt_tone in ["kind", "normal", "harsh"]:
    print(f"Significance test for {gpt_tone}")
    is_compared = df_sentiment['tone'].isin([gpt_tone, 'real'])
    t = df_sentiment.loc[is_compared, ["number", 'review_id', 'tone', "combine_subjectivity"]]
    # append word count
    t = t.merge(df_word_total, on=["number", 'review_id', "tone"], how="left")

    t = t.assign(
        y=t['combine_subjectivity'],
        x=(t['tone'] != 'real').astype(int),
    )

    mdf = smf.mixedlm("y ~ x", t, groups=t['number']).fit()

    # extract coef and confidence interval of the GPT-vs-human indicator
    coef = mdf.params['x']
    conf_int = mdf.conf_int().loc['x'].values
    for key, value in (
        ('metric', 'subjectivity'),
        ('tone', gpt_tone),
        ('coef', coef),
        ('conf_int', conf_int),
    ):
        result[key].append(value)
    


Significance test for kind
Significance test for normal
Significance test for harsh



The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.



In [65]:
# significance test
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Mixed model per GPT tone: overall polarity of GPT reviews (x = 1)
# against human reviews (x = 0), grouped by paper.
for gpt_tone in ["kind", "normal", "harsh"]:
    print(f"Significance test for {gpt_tone}")
    is_compared = df_sentiment['tone'].isin([gpt_tone, 'real'])
    t = df_sentiment.loc[is_compared, ["number", 'review_id', 'tone', "combine_polarity"]]
    # append word count
    t = t.merge(df_word_total, on=["number", 'review_id', "tone"], how="left")

    t = t.assign(
        y=t['combine_polarity'],
        x=(t['tone'] != 'real').astype(int),
    )

    mdf = smf.mixedlm("y ~ x", t, groups=t['number']).fit()
    print(mdf.summary())

    # extract coef and confidence interval of the GPT-vs-human indicator
    coef = mdf.params['x']
    conf_int = mdf.conf_int().loc['x'].values
    for key, value in (
        ('metric', 'sentiment'),
        ('tone', gpt_tone),
        ('coef', coef),
        ('conf_int', conf_int),
    ):
        result[key].append(value)

Significance test for kind
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0023 
Min. group size:  3       Log-Likelihood:     63.6039
Max. group size:  5       Converged:          Yes    
Mean group size:  4.3                                
-----------------------------------------------------
            Coef. Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept   0.084    0.008 10.670 0.000  0.069  0.100
x           0.045    0.017  2.628 0.009  0.011  0.078
Group Var   0.000                                    

Significance test for normal



Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.


The Hessian matrix at the estimated parameter values is not positive definite.



        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0018 
Min. group size:  3       Log-Likelihood:     68.6800
Max. group size:  5       Converged:          Yes    
Mean group size:  4.3                                
-----------------------------------------------------
            Coef. Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept   0.084    0.007 11.470 0.000  0.070  0.099
x           0.039    0.015  2.526 0.012  0.009  0.068
Group Var   0.000                                    

Significance test for harsh
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0021 
Min. group size:  3       Log-Likelihood:     65.2596
Max. group size


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.


The Hessian matrix at the estimated parameter values is not positive definite.


The MLE may be on the boundary of the parameter space.



In [66]:
# Coefficient plot: GPT-vs-human effect per tone with its confidence interval,
# one figure per metric.
# Keep only the latest estimate per (metric, tone) so that re-running the
# fitting cells, which append to `result`, does not draw duplicate markers.
data = pd.DataFrame(result).drop_duplicates(subset=['metric', 'tone'], keep='last')
colors = px.colors.qualitative.Plotly

# plot line
for i, metric in enumerate(['subjectivity', 'sentiment']):
    fig = go.Figure()
    t = data.query(f'metric == "{metric}"')
    # error_x expects the half-width of the interval around the coefficient
    half_width = t['conf_int'].apply(lambda ci: (ci[1] - ci[0]) / 2)
    # The trace holds all three tones, so it is named after the metric rather
    # than after a `tone` variable left over from an earlier cell.
    fig.add_trace(go.Scatter(y=t['tone'], x=t['coef'], mode='markers', name=metric, error_x=dict(type='data', array=half_width), marker=dict(color=colors[i])))
    fig.update_layout(template='plotly_white', width=400, height=300, title=metric)
    fig.update_xaxes(range=[-0.2, 0.2], title="coef. (GPT - human)", zeroline=True, zerolinewidth=1, zerolinecolor='black')
    fig.update_yaxes(categoryorder="array", categoryarray=["harsh", "normal", "kind", ])
    fig.show()

In [67]:
# by sections

In [68]:
# Accumulator for the per-section mixed-model estimates (one entry per section/metric/tone fit).
result = {key: [] for key in ("section", "metric", "tone", "coef", "conf_int")}

In [69]:
sections = ['summary', 'strengths_and_weaknesses', 'limitations']
metrics = ['subjectivity', 'polarity']

# One mixed model per (section, metric, GPT tone): GPT reviews (x = 1)
# against human reviews (x = 0), grouped by paper.
for section in sections:
    section_words = df_word.query(f'section == "{section}"')
    for metric in metrics:
        score_col = f"{section}_{metric}"
        for tone in ["kind", "normal", "harsh"]:
            print(f"Section: {section}, Metric: {metric}", f"Tone: {tone}")

            t = (
                df_sentiment.query(f'tone == "real" or tone == "{tone}"')[["number", 'review_id', 'tone', score_col]]
                .rename(columns={score_col: 'y'})
                .assign(section=section)
                .merge(section_words, on=["number", 'review_id', 'tone', 'section'], how="left")
            )
            t['x'] = (t['tone'] != 'real').astype(int)

            mdf = smf.mixedlm("y ~ x", t, groups=t['number']).fit()
            print(mdf.summary())

            # extract coef and confidence interval of the GPT-vs-human indicator
            coef = mdf.params['x']
            conf_int = mdf.conf_int().loc['x'].values
            result['section'].append(section)
            # polarity is reported under the label 'sentiment'
            result['metric'].append('subjectivity' if metric == 'subjectivity' else 'sentiment')
            result['tone'].append(tone)
            result['coef'].append(coef)
            result['conf_int'].append(conf_int)

Section: summary, Metric: subjectivity Tone: kind
       Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y     
No. Observations: 43      Method:             REML  
No. Groups:       10      Scale:              0.0386
Min. group size:  3       Log-Likelihood:     4.4292
Max. group size:  5       Converged:          Yes   
Mean group size:  4.3                               
----------------------------------------------------
            Coef. Std.Err.   z   P>|z| [0.025 0.975]
----------------------------------------------------
Intercept   0.373    0.038 9.723 0.000  0.298  0.448
x           0.012    0.071 0.174 0.862 -0.127  0.151
Group Var   0.003    0.032                          

Section: summary, Metric: subjectivity Tone: normal
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0266 
Min. gro


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.



        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0152 
Min. group size:  3       Log-Likelihood:     22.2391
Max. group size:  5       Converged:          Yes    
Mean group size:  4.3                                
-----------------------------------------------------
           Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept   0.090    0.027  3.355 0.001  0.038  0.143
x          -0.005    0.045 -0.116 0.908 -0.093  0.082
Group Var   0.003    0.024                           

Section: summary, Metric: polarity Tone: harsh
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0167 
Min. group size:  3       Log-Likelihood:     21.9


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg


Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 0.221759


The MLE may be on the boundary of the parameter space.


Random effects covariance is singular


The MLE may be on the boundary of the parameter space.



        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0051 
Min. group size:  3       Log-Likelihood:     47.0019
Max. group size:  5       Converged:          No     
Mean group size:  4.3                                
-----------------------------------------------------
           Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept   0.440    0.013 34.597 0.000  0.415  0.465
x          -0.060    0.026 -2.316 0.021 -0.111 -0.009
Group Var   0.000    0.063                           

Section: strengths_and_weaknesses, Metric: subjectivity Tone: normal
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0050 
Min. group size:  3       Lo


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.


The Hessian matrix at the estimated parameter values is not positive definite.



        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0068 
Min. group size:  3       Log-Likelihood:     41.1321
Max. group size:  5       Converged:          Yes    
Mean group size:  4.3                                
-----------------------------------------------------
             Coef. Std.Err.   z   P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept    0.098    0.014 6.852 0.000  0.070  0.126
x            0.034    0.030 1.150 0.250 -0.024  0.093
Group Var    0.000                                   

Section: strengths_and_weaknesses, Metric: polarity Tone: normal
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0069 
Min. group size:  3       Log-Li


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.


The Hessian matrix at the estimated parameter values is not positive definite.


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.



        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0068 
Min. group size:  3       Log-Likelihood:     41.2298
Max. group size:  5       Converged:          Yes    
Mean group size:  4.3                                
-----------------------------------------------------
             Coef. Std.Err.   z   P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept    0.098    0.015 6.612 0.000  0.069  0.127
x            0.067    0.030 2.245 0.025  0.009  0.126
Group Var    0.000    0.032                          

Section: limitations, Metric: subjectivity Tone: kind
       Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y     
No. Observations: 43      Method:             REML  
No. Groups:       10      Scale:              0.0360
Min. group size:  3       Log-Likelihood:     7


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The MLE may be on the boundary of the parameter space.


Random effects covariance is singular


The MLE may be on the boundary of the parameter space.



       Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y     
No. Observations: 43      Method:             REML  
No. Groups:       10      Scale:              0.0411
Min. group size:  3       Log-Likelihood:     4.3367
Max. group size:  5       Converged:          Yes   
Mean group size:  4.3                               
----------------------------------------------------
           Coef. Std.Err.   z    P>|z| [0.025 0.975]
----------------------------------------------------
Intercept  0.370    0.036 10.404 0.000  0.301  0.440
x          0.000    0.073  0.005 0.996 -0.143  0.144
Group Var  0.000    0.027                           

Section: limitations, Metric: subjectivity Tone: harsh
       Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y     
No. Observations: 43      Method:             REML  
No. Groups:       10      Scale:              0.0369
Min. group size:  3       Log-Likelihood:     6.5722
Max. g


Random effects covariance is singular


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.



        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0140 
Min. group size:  3       Log-Likelihood:     26.4220
Max. group size:  5       Converged:          No     
Mean group size:  4.3                                
-----------------------------------------------------
             Coef. Std.Err.   z   P>|z| [0.025 0.975]
-----------------------------------------------------
Intercept    0.086    0.021 4.136 0.000  0.045  0.127
x            0.009    0.043 0.213 0.832 -0.075  0.093
Group Var    0.000    0.013                          

Section: limitations, Metric: polarity Tone: harsh
        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y      
No. Observations: 43      Method:             REML   
No. Groups:       10      Scale:              0.0133 
Min. group size:  3       Log-Likelihood:     


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg


Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 0.201466


The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.



In [70]:
# Coefficient plots per review section: GPT-vs-human effect per tone with its
# confidence interval, one figure per (section, metric).
# Keep only the latest estimate per (section, metric, tone) so that re-running
# the fitting cell, which appends to `result`, does not draw duplicate markers.
data = pd.DataFrame(result).drop_duplicates(subset=['section', 'metric', 'tone'], keep='last')
colors = px.colors.qualitative.Plotly

# plot line
for section in sections:
    for i, metric in enumerate(['subjectivity', 'sentiment']):
        fig = go.Figure()
        t = data.query(f'section == "{section}" and metric == "{metric}"')
        # error_x expects the half-width of the interval around the coefficient
        half_width = t['conf_int'].apply(lambda ci: (ci[1] - ci[0]) / 2)
        # The trace holds all three tones, so it is named after the metric
        # rather than after a `tone` variable left over from an earlier cell.
        fig.add_trace(go.Scatter(y=t['tone'], x=t['coef'], mode='markers', name=metric, error_x=dict(type='data', array=half_width), marker=dict(color=colors[i])))
        # Include the section in the title; otherwise the figures for
        # different sections cannot be told apart.
        fig.update_layout(template='plotly_white', width=400, height=300, title=f"{section}: {metric}")
        fig.update_xaxes(range=[-0.2, 0.2], title="coef. (GPT - human)", zeroline=True, zerolinewidth=1, zerolinecolor='black')
        fig.update_yaxes(categoryorder="array", categoryarray=["harsh", "normal", "kind", ])
        fig.show()

In [75]:
# validity

# Compare human-annotated scores with the computed ones.
df_anno = pd.read_excel("human_annotated.xlsx")

# check correlation
x1, x2 = df_anno['human_subjectivity'], df_anno['human_polarity']
y1, y2 = df_anno['computer_subjectivity'], df_anno['computer_polarity']

# spearman correlation: subjectivity first, then polarity
from scipy.stats import spearmanr
for human_scores, computer_scores in ((x1, y1), (x2, y2)):
    print(spearmanr(human_scores, computer_scores))


SignificanceResult(statistic=0.582615585816471, pvalue=0.07715007428966163)
SignificanceResult(statistic=0.6796711425041244, pvalue=0.030612470938029875)
