#### This notebook contains all the code to pre-process and pickle the inputs for model_1 and model_2 to use

#### A new user should run the cells below, in order, to generate all the pickle files

#### First clone the repository with its submodules: `git clone --recurse-submodules https://github.com/PriyaDCosta/coefficientofconflict`


In [2]:
from conflict_utils import *

In [3]:
# Load the complete raw datasets.
# REPLACE REPO_ROOT WITH YOUR OWN PATH: it should point at your local clone of
# the coefficientofconflict repository. Every input file below is resolved
# relative to this single location, so it is the only line that needs editing.
REPO_ROOT = '/Users/priyadcosta/Documents/GitHub/coefficientofconflict'

# The directory spelling ("anotation") matches the original path; do not "correct" it.
ANNOTATION_DIR = f'{REPO_ROOT}/tpm-data-anotation'
FULL_DATA_DIR = f'{ANNOTATION_DIR}/conflict_reddit_data/full_data'

winning_df = pd.read_csv(f'{FULL_DATA_DIR}/winning_conversations.csv')
awry_df = pd.read_csv(f'{FULL_DATA_DIR}/conversations_gone_awry.csv')

# Hand-labeled dataset
data = pd.read_csv(f'{ANNOTATION_DIR}/CONFLICT_CONVO_LABELING_LOG.csv')

In [4]:
# Prepare the hand-labeled ratings held in `data`.

#Get the dataset
# NOTE(review): the return value is discarded, so dataset_labels presumably
# modifies `data` in place (using the CONV_ID column) — confirm in conflict_utils.
dataset_labels(data,on_column='CONV_ID')

#Convert the text labels to numeric labels
# NOTE(review): return value is likewise discarded; presumably mutates `data` in place — confirm.
convert_labels(data)

#Get the average rating for each chat
# `data` is rebound to the result of average_labels, so re-running this cell
# without first re-running the load cell feeds the already-processed frame
# back into dataset_labels/convert_labels.
numeric_cols = ['d_content', 'd_expression', 'oi_content', 'oi_expression']
data = average_labels(data,numeric_cols,'CONV_ID')

In [5]:
# Combine the processed hand labels (`data`) with the two raw datasets.
# The call returns two objects, unpacked in this order:
#   hand_labeled     - human labeled data with text
#   non_hand_labeled - the other data, formatted to generate synthetic labels
# NOTE(review): the recorded cell output suggests merge_raw_data also prints
# debugging information (a sample row and a column index) — confirm that is intended.
hand_labeled, non_hand_labeled = merge_raw_data(data,winning_df,awry_df)

CONV_ID                                                          d0vwxn6
text                   &gt;How are you equating those two?\n\nDiversi...
speaker                                                  ICouldBeAKiller
id                                                               d0whl51
timestamp                                                   1457733716.0
meta.score                                                           1.0
reply_to                                                         d0vyjdq
conversation_length                                                    4
dataset_numeric                                                        1
Name: 61152, dtype: object
&gt;How are you equating those two?

Diversity means more people. More people equals more children. More children... Younger workforce...
Index(['CONV_ID', 'id', 'rating_directness_content',
       'rating_directness_expression', 'rating_OI_content',
       'rating_OI_expression', 'rater_id', 'status', 'last_update

In [6]:
# Format both datasets for TPM (the team-process-map feature engine used further down).
# NOTE(review): later cells read `message` and `conversation_num` columns from
# these results; presumably format_for_tpm creates them — confirm in conflict_utils.
tpm_hand_labeled = format_for_tpm(hand_labeled)
tpm_non_hand_labeled = format_for_tpm(non_hand_labeled)

30
6851


In [7]:
# Generate SBERT embeddings from the 'text' column, reduce dimensions and pickle.
# n_components=4 is passed to both calls; presumably the number of dimensions
# kept after reduction — confirm in conflict_utils.
bert_embeddings_non_hand_labeled = generate_sbert_embeddings(non_hand_labeled,'text',n_components=4)
bert_embeddings_hand_labeled = generate_sbert_embeddings(hand_labeled,'text',n_components=4)

#Pickle the embeddings
# NOTE(review): these output paths are relative to the working directory, while
# the feature-builder cells write to '../sandbox/embeddings/initial_inputs/'.
# They only refer to the same folder if the notebook runs from inside `sandbox` — confirm.
pickle_embeddings(bert_embeddings_non_hand_labeled,'embeddings/initial_inputs/embeddings_non_hand_labeled.pickle')
pickle_embeddings(bert_embeddings_hand_labeled,'embeddings/initial_inputs/embeddings_hand_labeled.pickle')

In [8]:
# Write the TPM-formatted data to CSV. The FeatureBuilder cells further down
# read files with these names via '../sandbox/csv/', so this cell must run first.
# NOTE(review): assuming these are pandas DataFrames, to_csv writes the index
# as an extra unnamed column by default — confirm that is intended.
tpm_non_hand_labeled.to_csv('csv/tpm_non_hand_labeled.csv')
tpm_hand_labeled.to_csv('csv/tpm_hand_labeled.csv')

In [9]:
# Add a `word_counts` column (number of whitespace-separated tokens in each
# `message`) to both TPM frames and print the largest count for each,
# hand-labeled first, then non-hand-labeled.
for _tpm_frame in (tpm_hand_labeled, tpm_non_hand_labeled):
    _message_tokens = _tpm_frame['message'].str.split()
    _tpm_frame['word_counts'] = _message_tokens.apply(len)
    print(_tpm_frame['word_counts'].max())

196
200


In [10]:
# Print the number of rows in each TPM frame: hand-labeled first, then non-hand-labeled.
for _tpm_frame in (tpm_hand_labeled, tpm_non_hand_labeled):
    print(len(_tpm_frame))

418
76889


In [11]:
# Count the distinct conversation IDs in each TPM frame and how many IDs
# appear in both.
hand_labeled_ids = tpm_hand_labeled['conversation_num'].unique()
non_hand_labeled_ids = tpm_non_hand_labeled['conversation_num'].unique()

for _label, _ids in (
    ("Hand labeled IDs:", hand_labeled_ids),
    ("Non-hand labeled IDs:", non_hand_labeled_ids),
):
    print(_label, len(_ids))

# IDs present in both groups (set intersection of the two ID collections)
common_elements = set(hand_labeled_ids) & set(non_hand_labeled_ids)
number_of_similar_elements = len(common_elements)

print("Number of common IDs:", number_of_similar_elements)

Hand labeled IDs: 48
Non-hand labeled IDs: 8436
Number of common IDs: 0


In [14]:
# Make the feature engine importable by putting its directory first on sys.path.
# NOTE(review): the path is relative to the current working directory — confirm
# the notebook is launched from a directory that sits next to `team-process-map`.
import sys
sys.path.insert(0,'../team-process-map/feature_engine')
from feature_builder import FeatureBuilder

# Configure featurization of the hand-labeled chats.
# NOTE(review): the input file is presumably the CSV written earlier as
# 'csv/tpm_hand_labeled.csv'; that only holds if the working directory is `sandbox` — confirm.
feature_builder_hand_labeled = FeatureBuilder(
    input_file_path = '../sandbox/csv/tpm_hand_labeled.csv',
    vector_directory = "feature_engine_outputs/vector_data/",
    output_file_path_chat_level = "../sandbox/csv/tpm_hand_labeled_chat_features.csv",
    output_file_path_user_level = "../sandbox/csv/tpm_hand_labeled_user_labeled.csv",
    output_file_path_conv_level = "../sandbox/csv/tpm_hand_labeled_conv_labeled.csv",
    turns = False,
)

# Run the feature engine; the chat-level output CSV is read back on the next line.
feature_builder_hand_labeled.featurize(col='text')
# Reload the chat-level features from CSV and pickle them.
# NOTE(review): `tpm_hand_labeled` is rebound to whatever pickle_embeddings
# returns. Other calls to pickle_embeddings in this notebook discard the return
# value, so this may be None — confirm.
tpm_hand_labeled = pickle_embeddings(pd.read_csv("../sandbox/csv/tpm_hand_labeled_chat_features.csv"),"../sandbox/embeddings/initial_inputs/tpm_hand_labeled.pickle")

Initializing Featurization for ../sandbox/csv/tpm_hand_labeled.csv ...
Confirmed that data has `conversation_num`, `message`, and `speaker_nickname` columns!
Chat Level Features ...
Generating features for the first 100.0% of messages...
All Done!


In [15]:
# Generate TPM features for the non-hand-labeled chats and pickle them.
# The input CSV and all three output CSVs live under '../sandbox/csv/'.
feature_builder_non_hand_labeled = FeatureBuilder(
    input_file_path = "../sandbox/csv/tpm_non_hand_labeled.csv",
    vector_directory = "feature_engine_outputs/vector_data/",
    output_file_path_chat_level = "../sandbox/csv/tpm_non_hand_labeled_chat_features.csv",
    output_file_path_user_level = "../sandbox/csv/tpm_non_hand_labeled_user_labeled.csv",
    output_file_path_conv_level = "../sandbox/csv/tpm_non_hand_labeled_conv_labeled.csv",
    turns = False,
)

# Run the feature engine; the chat-level output CSV is read back below.
feature_builder_non_hand_labeled.featurize(col='text')

# Reload the chat-level features from CSV and pickle them.
# The result is bound to `tpm_non_hand_labeled`: binding it to `tpm_hand_labeled`
# (a copy-paste slip) would overwrite the hand-labeled variable with
# non-hand-labeled data.
# NOTE(review): the bound value is whatever pickle_embeddings returns — confirm
# it returns the data rather than None.
tpm_non_hand_labeled = pickle_embeddings(pd.read_csv("../sandbox/csv/tpm_non_hand_labeled_chat_features.csv"),"../sandbox/embeddings/initial_inputs/tpm_non_hand_labeled.pickle")

Initializing Featurization for ../sandbox/csv/tpm_non_hand_labeled.csv ...
Confirmed that data has `conversation_num`, `message`, and `speaker_nickname` columns!
Chat Level Features ...
Generating features for the first 100.0% of messages...
All Done!
