In [84]:
from scipy.stats import pearsonr
import pandas as pd

## Loading preprocessed data with sentiment scores

In [53]:
df_senti4sd_racket = pd.read_csv('predictions_senti4SD_racket.csv', index_col="ID");
df_senti4sd_closure = pd.read_csv('predictions_senti4SD_closure.csv', index_col="ID");
df_senti4sd_python = pd.read_csv('predictions_senti4SD_python.csv', index_col="ID");
df_senti4sd_golang = pd.read_csv('predictions_senti4SD_golang.csv', index_col="ID");

In [54]:
df_sentiCR_racket = pd.read_csv('predictions_sentiCR_racket.csv', index_col="ID");
df_sentiCR_closure = pd.read_csv('predictions_sentiCR_closure.csv', index_col="ID");
df_sentiCR_python = pd.read_csv('predictions_sentiCR_python.csv', index_col="ID");
df_sentiCR_golang = pd.read_csv('predictions_sentiCR_golang.csv', index_col="ID");

In [60]:
df_racket = pd.read_csv('full_racket_df.csv', index_col="Unnamed: 0");
df_racket['sentiCR'] = df_sentiCR_racket['sentiCR']
df_racket['senti4SD'] = df_senti4sd_racket['PREDICTED'].replace({'neutral': 0, 'negative': -1, 'positive': 1})
df_racket.to_csv('df_racket_sentiment.csv')

df_clojure = pd.read_csv('full_clojur_df.csv', index_col="Unnamed: 0");
df_clojure['sentiCR'] = df_sentiCR_closure['sentiCR']
df_clojure['senti4SD'] = df_senti4sd_closure['PREDICTED'].replace({'neutral': 0, 'negative': -1, 'positive': 1})
df_clojure.to_csv('df_clojure_sentiment.csv')

df_python = pd.read_csv('full_python_df.csv', index_col="Unnamed: 0");
df_python['sentiCR'] = df_sentiCR_python['sentiCR']
df_python['senti4SD'] = df_senti4sd_python['PREDICTED'].replace({'neutral': 0, 'negative': -1, 'positive': 1})
df_python.to_csv('df_python_sentiment.csv')

df_golang = pd.read_csv('full_golang_df.csv', index_col="Unnamed: 0");
df_golang['sentiCR'] = df_sentiCR_golang['sentiCR']
df_golang['senti4SD'] = df_senti4sd_golang['PREDICTED'].replace({'neutral': 0, 'negative': -1, 'positive': 1})
df_golang.to_csv('df_golang_sentiment.csv')

In [59]:
df_sentiCR_racket

Unnamed: 0_level_0,Text,sentiCR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Game Jam has started! https://itch.io/jam/rack...,0
2,@notjack @Deleted User `Multiple modules can b...,-1
3,Honestly I'm so proud that I remembered all th...,-1
4,@notjack for when you're next around. Will try...,0
5,gotta love macros xD,0
...,...,...
4971,(It certainly has technical challenges),0
4972,Can I suggest an alternative? Set up a github ...,0
4973,oh that's cool idea,1
4974,I can't recall one. would be fun. 2htdp/univer...,-1


# Sentiment correlations
##  Correlation between nr of messages and the sentiment of first message in the conversation

In [85]:
def get_nr_message_conv_df(df):
    conversation_count = df['conversation_id'].value_counts()\
     .rename_axis('conversation_id').to_frame('nr_messages').sort_index(ascending=True)

    first_message_df = df.sort_values(by=['ts']).groupby("conversation_id").first()
    return response_df_golang.join(first_message_df)\
    .rename(columns={"text": "first_message", "user": "user_first_message", "ts": "ts_first_message"})

def calculate_pvalues(df):
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            tmp = df[df[r].notnull() & df[c].notnull()]
            pvalues[r][c] = round(pearsonr(tmp[r], tmp[c])[1], 4)
    return pvalues

In [62]:
first_msg_golang_df = get_nr_message_conv_df(df_golang)
first_msg_python_df = get_nr_message_conv_df(df_python)
first_msg_clojure_df = get_nr_message_conv_df(df_clojure)
first_msg_racket_df = get_nr_message_conv_df(df_racket)

Unnamed: 0_level_0,nr_messages,ts_first_message,user_first_message,first_message,sentiCR,senti4SD
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,80,2019-11-01T04:49:13.533000,Naif,`2019/11/01 15:41:41 http: panic serving 127.0...,-1,-1
2.0,19,2019-11-01T04:58:50.798000,Leo,oh boy what do we have here,0,0
3.0,56,2019-11-01T05:18:06.297000,Piya,jesus christ global variables,0,0
4.0,207,2019-11-01T05:49:05.209000,Leo,single quote?,0,0
5.0,23,2019-11-01T06:06:15.482000,Lathan,Should I have my struct fields as pointers? Fe...,0,0
...,...,...,...,...,...,...
2618.0,89,2020-07-31T20:17:04.704000,Suhar,would having an `interface{}` as a db driver a...,0,0
2619.0,10,2020-07-31T20:44:31.662000,Suhar,although i think withreadonly would work better,1,0
2620.0,11,2020-07-31T22:26:03.468000,Abriella,I'm trying to make a switch that goes off ever...,1,0
2621.0,2,2020-07-31T23:21:35.024000,Normandy,anyone here who used firestore before?,0,0


In [93]:
def print_corr_first_message(df, dataset=''):
    corr=df['nr_messages'].corr(df['sentiCR'])
    corrSenti4SD=df['nr_messages'].corr(df['senti4SD'])
    pvalue= calculate_pvalues(df[['nr_messages','sentiCR']])['sentiCR']['nr_messages']
    print(f"The correlation between the nr messages inside a conversation of {dataset} \n"\
    f"and the sentiCR score of the first message is {corr}\n"\
         f"and with the senti4SD score {corrSenti4SD} "\
         f"with p-value {pvalue} \n")

In [94]:
print_corr_first_message(first_msg_golang_df, 'golang')
print_corr_first_message(first_msg_python_df, 'python')
print_corr_first_message(first_msg_clojure_df, 'clojure')
print_corr_first_message(first_msg_racket_df, 'racket')

The correlation between the nr messages inside a conversation of golang 
and the sentiCR score of the first message is 0.0018923772564124464
and with the senti4SD score -0.0036025482150494697 with p-value 0.9228 

The correlation between the nr messages inside a conversation of python 
and the sentiCR score of the first message is 0.01065717958805894
and with the senti4SD score -0.0053083425537289826 with p-value 0.6078 

The correlation between the nr messages inside a conversation of clojure 
and the sentiCR score of the first message is 0.1497640326746519
and with the senti4SD score 0.06500917507327052 with p-value 0.2992 

The correlation between the nr messages inside a conversation of racket 
and the sentiCR score of the first message is 0.07352123639365025
and with the senti4SD score -0.03746318219554819 with p-value 0.3045 



## Correlation between avg sentiment and nr of messages in conversation

In [102]:
def calculate_avg_sentiment(df):    
    conversation_count = df['conversation_id'].value_counts()\
     .rename_axis('conversation_id').to_frame('nr_messages').sort_index(ascending=True)

    avg_senti_df = df[['conversation_id', 'sentiCR', 'senti4SD']].copy().groupby('conversation_id').mean()
    avg_senti_df['nr_messages'] = conversation_count
    return avg_senti_df

avg_sentiment_racket = calculate_avg_sentiment(df_racket)
avg_sentiment_clojure = calculate_avg_sentiment(df_clojure)
avg_sentiment_python = calculate_avg_sentiment(df_python)
avg_sentiment_golang = calculate_avg_sentiment(df_golang)

In [103]:
print_corr_first_message(avg_sentiment_golang, 'golang')
print_corr_first_message(avg_sentiment_python, 'python')
print_corr_first_message(avg_sentiment_clojure, 'clojure')
print_corr_first_message(avg_sentiment_racket, 'racket')

The correlation between the nr messages inside a conversation of golang 
and the sentiCR score of the first message is 0.005365340094997646
and with the senti4SD score -0.029355069143320443 with p-value 0.7836 

The correlation between the nr messages inside a conversation of python 
and the sentiCR score of the first message is -0.03912172427340643
and with the senti4SD score -0.12279663930776336 with p-value 0.0595 

The correlation between the nr messages inside a conversation of clojure 
and the sentiCR score of the first message is -0.0962683321804972
and with the senti4SD score -0.09479712861698329 with p-value 0.506 

The correlation between the nr messages inside a conversation of racket 
and the sentiCR score of the first message is 0.0428193119869374
and with the senti4SD score -0.0597330696019449 with p-value 0.5502 

