In [1]:
import pandas as pd
from SentiCR.SentiCR import SentiCR

SentiCR/data/so-train.pkl


## Preprocessing

In [2]:
# Load the raw export; the working frame `df` is sliced out of this one.
df_original = pd.read_xml(path_or_buffer='test.xml')

In [3]:
# Skip the first four rows and discard the four export-level columns.
# NOTE(review): presumably the first four rows carry the channel metadata
# (one per dropped column) rather than messages — confirm against test.xml.
df = df_original.iloc[4:].drop(
    columns=['team_domain', 'channel_name', 'start_date', 'end_date']
)

### SentiCR preprocessing

In [4]:
import numpy as np
from sklearn.metrics import classification_report
import time
# Single shared analyzer instance used by predict_sentiCR below.
# NOTE(review): the recorded cell output suggests construction trains the
# classifier (several seconds) — confirm before instantiating it repeatedly.
sentiment_analyzer = SentiCR()
def predict_sentiCR(df, column='first_message'):
    """Run the shared SentiCR analyzer over one text column of ``df``.

    Parameters
    ----------
    df : DataFrame holding the messages.
    column : name of the column whose values are scored one by one.

    Returns
    -------
    list with one ``get_sentiment_polarity`` result per row, in row order.
    Callers in this notebook take element 0 of each result as the label.

    Prints the wall-clock time spent predicting.
    """
    started_at = time.time()
    pred = [
        sentiment_analyzer.get_sentiment_polarity(message)
        for message in df[column]
    ]
    elapsed = time.time() - started_at
    print('Prediction used {:.2f} seconds'.format(elapsed))
    return pred

StackOverflow Reading data from oracle..
Training classifier model..


  % sorted(inconsistent)


Training used 8.74 seconds


In [5]:
from os.path import exists
sentiCR_path = 'preprocessed_sentiCR.csv'

# Scoring every message is slow, so the result is cached on disk and only
# recomputed when the cache file is missing.
if exists(sentiCR_path):
    df_sentiCR = pd.read_csv(sentiCR_path)
else:
    pred_full = predict_sentiCR(df, 'text')
    df_sentiCR = df.copy()
    # Take the first element of each prediction inline: the Extract() helper
    # is only defined in a later cell, so calling it here raises NameError on
    # a fresh kernel whenever the cache does not exist yet.
    df_sentiCR['sentiCR'] = [item[0] for item in pred_full]
    # Same remapping the notebook applies to the first-message predictions:
    # label 2 is treated as negative and stored as -1.
    df_sentiCR['sentiCR'] = df_sentiCR['sentiCR'].replace({2: -1})
    df_sentiCR.to_csv(sentiCR_path)

### Senti4SD preprocessing
For Senti4SD, the sentiment scores are calculated outside this notebook. Make sure you save the resulting CSV as predictions_senti4SD.csv.

In [25]:
# Name the index *before* copying. Setting it after the copy (as before) left
# the exported frame without the 'ID' label on a first run, so the CSV handed
# to Senti4SD had no 'ID' header even though the predictions are later sorted
# and indexed by 'ID'. `df` keeps the name too; later cells rely on it.
df.index.name = 'ID'
df_senti4SD = df.copy()
# Senti4SD only needs the message text, in a column called "Text".
df_senti4SD = df_senti4SD.drop(['ts', 'conversation_id', 'user'], axis=1).rename(columns={"text": "Text"})
print('Use this dataframe to feed to the senti4SD repo')
df_senti4SD.to_csv('df_senti4SD.csv',sep=';')
df_senti4SD.head()

Use this dataframe to feed to the senti4SD repo


Unnamed: 0_level_0,Text
ID,Unnamed: 1_level_1
4,{{ with .Data }} also works
5,if .Data is invalid it won't be rendered
6,ah
7,true
8,okay okay thank you


In [7]:
# Load the externally produced Senti4SD predictions, ordered and keyed by ID.
senti4SD = (
    pd.read_csv('predictions_senti4SD.csv')
    .sort_values(by='ID')
    .set_index('ID')
)
# Turn the textual labels into the numeric scale used for the correlations.
senti4SD['PREDICTED'] = senti4SD['PREDICTED'].replace(
    {'negative': -1, 'neutral': 0, 'positive': 1}
)

In [8]:
# Attach the Senti4SD score to a copy of the messages (aligned on the index),
# persist the result, and preview it.
senti4SD_df = df.assign(senti4SD=senti4SD['PREDICTED'])
senti4SD_df.to_csv('preprocessed_senti4SD.csv')
senti4SD_df.head()

Unnamed: 0_level_0,conversation_id,ts,user,text,senti4SD
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,1.0,2020-08-01T00:01:15.595000,Samarie,{{ with .Data }} also works,0
5,1.0,2020-08-01T00:01:22.227000,Samarie,if .Data is invalid it won't be rendered,0
6,1.0,2020-08-01T00:02:50.586000,Cephas,ah,0
7,1.0,2020-08-01T00:02:51.116000,Cephas,true,0
8,1.0,2020-08-01T00:02:55.357000,Cephas,okay okay thank you,1


### Calculating the number of messages per conversation

In [44]:
# One row per conversation with its message count, ordered by conversation id.
response_df = (
    df['conversation_id']
    .value_counts()
    .rename_axis('conversation_id')
    .to_frame('nr_messages')
    .sort_index()
)
response_df.head()

Unnamed: 0_level_0,nr_messages
conversation_id,Unnamed: 1_level_1
1.0,7
2.0,7
3.0,2
4.0,85
5.0,1


# SentiCR correlations
##  Correlation between nr of messages and the sentiment of first message in the conversation

In [10]:
# Earliest message (by timestamp) of each conversation.
first_message_df = df.sort_values(by='ts').groupby("conversation_id").first()
# Put the message count next to that first message and label its columns.
count_and_text_df = response_df.join(first_message_df).rename(
    columns={
        "text": "first_message",
        "user": "user_first_message",
        "ts": "ts_first_message",
    }
)

In [11]:
# Preview the first five conversations with their first message.
count_and_text_df.head(5)

Unnamed: 0_level_0,nr_messages,ts_first_message,user_first_message,first_message
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,7,2020-08-01T00:01:15.595000,Samarie,{{ with .Data }} also works
2.0,7,2020-08-01T01:09:01.202000,Reeves,Hwllo
3.0,2,2020-08-01T01:27:32.834000,Treysen,The built in proxy stuff is nice
4.0,85,2020-08-01T02:00:57.155000,Josselin,probably
5.0,1,2020-08-01T03:17:19.489000,Maiana,i've codegened > 600 go files that all wrap in...


In [12]:
print("Predicting sentiment with sentiCR on first message of every conversation")
# Score the opening message of each conversation.
predictions = predict_sentiCR(count_and_text_df, column='first_message')
def Extract(lst):
    """Return a list with the first element of every item in ``lst``, in order."""
    firsts = []
    for entry in lst:
        firsts.append(entry[0])
    return firsts
# Copy the frame and add the predicted label of each first message.
df_with_sentiment = count_and_text_df.assign(sentiCR=Extract(predictions))
# Replace 2 with -1 as 2 is negative according to this model
df_with_sentiment['sentiCR'] = df_with_sentiment['sentiCR'].replace({2: -1})
df_with_sentiment.head()

Predicting sentiment with sentiCR on first message of every conversation
Prediction used 4.19 seconds


Unnamed: 0_level_0,nr_messages,ts_first_message,user_first_message,first_message,sentiCR
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,7,2020-08-01T00:01:15.595000,Samarie,{{ with .Data }} also works,1
2.0,7,2020-08-01T01:09:01.202000,Reeves,Hwllo,0
3.0,2,2020-08-01T01:27:32.834000,Treysen,The built in proxy stuff is nice,0
4.0,85,2020-08-01T02:00:57.155000,Josselin,probably,0
5.0,1,2020-08-01T03:17:19.489000,Maiana,i've codegened > 600 go files that all wrap in...,0


In [13]:
# Pearson correlation: conversation length vs. sentiCR label of its first message.
corr = df_with_sentiment['nr_messages'].corr(
    df_with_sentiment['sentiCR'], method='pearson'
)
print(f"The correlation between the nr messages inside a conversation \nand the sentiCR score of the first message is {corr}")

The correlation between the nr messages inside a conversation 
and the sentiCR score of the first message is -0.0455387783444905


## Correlation between avg sentiment and nr of messages in conversation

In [16]:
avg_senti_df = df_sentiCR.copy()
# Shift the labels by +2 (kept from the original analysis so the displayed
# means stay comparable). A constant shift does not change the Pearson
# correlation computed from these means.
avg_senti_df['sentiCR'] += 2
# numeric_only=True: the frame still holds string columns (ts, user, text);
# pandas >= 2.0 raises TypeError when asked to average them instead of
# silently dropping them as older versions did.
avg_senti_df = avg_senti_df.groupby('conversation_id').mean(numeric_only=True)
# Assign the count column explicitly (aligned on conversation_id) rather than
# relying on a whole one-column DataFrame being accepted as a column value.
avg_senti_df['nr_messages'] = response_df['nr_messages']
avg_senti_df

Unnamed: 0_level_0,ID,sentiCR,nr_messages
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,7.000000,2.000000,7
2.0,14.000000,1.857143,7
3.0,18.500000,2.000000,2
4.0,63.529412,1.905882,85
5.0,33.000000,2.000000,1
...,...,...,...
1911.0,59922.000000,1.000000,1
1912.0,60023.500000,2.000000,4
1913.0,60034.500000,1.916667,12
1914.0,60054.076923,2.038462,26


In [17]:
# Pearson correlation: conversation length vs. mean sentiCR label.
corr = avg_senti_df['nr_messages'].corr(avg_senti_df['sentiCR'], method='pearson')
corr

-0.013144482517522031

# Senti4SD correlations
##  Correlation between nr of messages and the sentiment of first message in the conversation

In [23]:
# Earliest message (by timestamp) of each conversation, with its Senti4SD score.
first_msg_senti4SD = senti4SD_df.sort_values(by='ts').groupby("conversation_id").first()
# Put the message count next to that first message and label its columns.
SD_conv_text_df = response_df.join(first_msg_senti4SD).rename(
    columns={
        "text": "first_message",
        "user": "user_first_message",
        "ts": "ts_first_message",
    }
)
SD_conv_text_df.head()

Unnamed: 0_level_0,nr_messages,ts_first_message,user_first_message,first_message,senti4SD
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,7,2020-08-01T00:01:15.595000,Samarie,{{ with .Data }} also works,0
2.0,7,2020-08-01T01:09:01.202000,Reeves,Hwllo,0
3.0,2,2020-08-01T01:27:32.834000,Treysen,The built in proxy stuff is nice,1
4.0,85,2020-08-01T02:00:57.155000,Josselin,probably,0
5.0,1,2020-08-01T03:17:19.489000,Maiana,i've codegened > 600 go files that all wrap in...,-1


In [24]:
# Pearson correlation: conversation length vs. Senti4SD score of its first message.
corr = SD_conv_text_df['nr_messages'].corr(
    SD_conv_text_df['senti4SD'], method='pearson'
)
print(f"The correlation between the nr messages inside a conversation \nand the senti4SD score of the first message is {corr}")

The correlation between the nr messages inside a conversation 
and the senti4SD score of the first message is -0.02342402019049213


## Correlation between avg sentiment and nr of messages in conversation

In [21]:
avg_4sd_df = senti4SD_df.copy()
# Shift the scores by +2 (kept from the original analysis so the displayed
# means stay comparable). A constant shift does not change the Pearson
# correlation computed from these means.
avg_4sd_df['senti4SD'] += 2
# numeric_only=True: the frame still holds string columns (ts, user, text);
# pandas >= 2.0 raises TypeError when asked to average them instead of
# silently dropping them as older versions did.
avg_4sd_df = avg_4sd_df.groupby('conversation_id').mean(numeric_only=True)
# Assign the count column explicitly (aligned on conversation_id) rather than
# relying on a whole one-column DataFrame being accepted as a column value.
avg_4sd_df['nr_messages'] = response_df['nr_messages']
avg_4sd_df.head()

Unnamed: 0_level_0,senti4SD,nr_messages
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2.285714,7
2.0,2.142857,7
3.0,3.0,2
4.0,2.023529,85
5.0,1.0,1


In [22]:
# Pearson correlation: conversation length vs. mean Senti4SD score.
corr = avg_4sd_df['nr_messages'].corr(avg_4sd_df['senti4SD'], method='pearson')
print(f"The correlation between the average senti4SD and the number of messages in the conversation is {corr}")

The correlation between the average senti4SD and the number of messages in the conversation is -0.02096483351462585


# Per question

In [30]:
# Flag messages containing a literal question mark (1 = question, 0 = not).
# na=False: df_sentiCR may have been read back from the CSV cache, where empty
# messages become NaN; without it str.contains yields NaN for those rows and
# the integer cast fails. Missing text is treated as "not a question".
questions = df_sentiCR["text"].str.contains("?", regex=False, na=False).astype(int)
print("Showing only questions within the sentiCR df:")
df_sentiCR_questions = df_sentiCR[questions == 1]
df_sentiCR_questions.head()

Showing only questions within the sentiCR df:


Unnamed: 0,ID,conversation_id,ts,user,text,sentiCR
9,13,2.0,2020-08-01T01:09:29.986000,Reeves,I am finding imposible chaining proxies with n...,0
29,33,5.0,2020-08-01T03:17:19.489000,Maiana,i've codegened > 600 go files that all wrap in...,0
47,51,4.0,2020-08-01T03:40:44.702000,Maiana,@_diamondburned_ do you think squeezing the go...,0
48,52,6.0,2020-08-01T03:41:52.495000,Cephas,im displaying this in a HTML where there\'s a ...,0
49,53,4.0,2020-08-01T03:42:55.236000,Cephas,(the struct has no ID and some data pointers a...,0


In [58]:
# Expose conversation_id as a regular column so it can serve as the merge key.
response_df_reset = response_df.reset_index()
# Attach the size of its conversation to every question.
question_sentiCR_df = pd.merge(
    df_sentiCR_questions,
    response_df_reset,
    how='inner',
    on='conversation_id',
)

In [59]:
# Pearson correlation: conversation length vs. sentiCR label of each question.
corr = question_sentiCR_df['nr_messages'].corr(
    question_sentiCR_df['sentiCR'], method='pearson'
)
print(f"The correlation between a questions sentiCR and the number of messages in the conversation is {corr}")

The correlation between a questions sentiCR and the number of messages in the conversation is 0.013647948199097187
