In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import pickle
import pandas as pd

def load_pickle(file_path):
    """Unpickle and return the single object stored in ``file_path``.

    The file is opened in binary read mode and closed again before the
    function returns.

    Note: unpickling can run arbitrary code, so only point this at files
    you produced yourself or otherwise trust.
    """
    with open(file_path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

def compare_dataframes_in_lists(list1, list2):
    """Return True when two lists hold pairwise-equal pandas DataFrames.

    The result is False if the lists differ in length, if any element on
    either side is not a ``pd.DataFrame``, or if any positional pair fails
    ``DataFrame.equals``. Two empty lists compare as True.
    """
    # Different lengths can never match position by position.
    if len(list1) != len(list2):
        return False

    def _same_frame(left, right):
        # Both sides must be DataFrames before their contents are compared.
        both_frames = isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame)
        return both_frames and left.equals(right)

    # all() stops at the first mismatching pair, like an early return would.
    return all(_same_frame(left, right) for left, right in zip(list1, list2))

def compare_pickles(file1, file2):
    """Report whether two pickle files hold equal lists of DataFrames.

    Both files are unpickled with ``load_pickle`` (``file1`` first). If either
    unpickled object is not a ``list`` the result is False; otherwise the
    verdict is whatever ``compare_dataframes_in_lists`` returns for the pair.
    """
    first_obj, second_obj = load_pickle(file1), load_pickle(file2)

    # Anything other than a pair of lists is reported as "not identical".
    if not (isinstance(first_obj, list) and isinstance(second_obj, list)):
        return False

    return compare_dataframes_in_lists(first_obj, second_obj)

In [3]:
# Compare the SNAP messaging pickle under the IPV-Disagreements tree with the
# copy stored under the "IPV Project - SATH Lab" tree.
# NOTE(review): both paths are absolute and tied to one user's machine.
file1 = '/Users/maheshbabu/Documents/IPV-Disagreements/data/processed/SNAP/Messaging Data/preprocessed_dfs.pkl'
file2 = '/Users/maheshbabu/Documents/IPV Project - SATH Lab/preprocessed_data/SNAP Study/all_participants/preprocessed_dfs.pkl'

# compare_pickles yields True only when both files unpickle to lists whose
# DataFrames are equal pair by pair.
are_equal = compare_pickles(file1, file2)
print("The pickle files contain identical data:", are_equal)

The pickle files contain identical data: True


In [4]:
# Same pickle comparison as for SNAP, this time for the SOCIAL messaging data.
# NOTE(review): file1/file2/are_equal are rebound here, so their values depend
# on which comparison cell ran last.
file1 = '/Users/maheshbabu/Documents/IPV-Disagreements/data/processed/SOCIAL/Messaging Data/preprocessed_dfs.pkl'
file2 = '/Users/maheshbabu/Documents/IPV Project - SATH Lab/preprocessed_data/SOCIAL Study/all_participants/preprocessed_dfs.pkl'

are_equal = compare_pickles(file1, file2)
print("The pickle files contain identical data:", are_equal)

The pickle files contain identical data: True


In [5]:
# Compare the two copies of the SNAP preprocessed EMA CSV.
# NOTE(review): absolute, machine-specific paths.
file1 = '/Users/maheshbabu/Documents/IPV-Disagreements/data/processed/SNAP/EMA Data/preprocessed_ema.csv'
file2 = '/Users/maheshbabu/Documents/IPV Project - SATH Lab/preprocessed_data_ema/SNAP Study/all_participants/preprocessed_ema.csv'

# Both files are parsed with read_csv defaults, so the comparison is between
# the parsed frames rather than the raw file bytes.
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# DataFrame.equals is an exact check (no float tolerance).
are_equal = df1.equals(df2)
print("The csv files contain identical data:", are_equal)

The csv files contain identical data: True


In [6]:
# Compare the two copies of the SOCIAL preprocessed EMA CSV.
# NOTE(review): absolute, machine-specific paths; df1/df2/are_equal are
# rebound here and overwrite the values from the SNAP comparison.
file1 = '/Users/maheshbabu/Documents/IPV-Disagreements/data/processed/SOCIAL/EMA Data/preprocessed_ema.csv'
file2 = '/Users/maheshbabu/Documents/IPV Project - SATH Lab/preprocessed_data_ema/SOCIAL Study/all_participants/preprocessed_ema.csv'

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Exact equality of the parsed frames (no float tolerance).
are_equal = df1.equals(df2)
print("The csv files contain identical data:", are_equal)

The csv files contain identical data: True


In [8]:
from scripts.modeling_preparation.snap_modeling_prep import SnapProcessor
from scripts.modeling_preparation.social_modeling_prep import SocialProcessor
from scripts.config import SAVED_EMBEDDINGS_SNAP_DIR, PROCESSED_SNAP_MESSAGING_DIR, PROCESSED_SNAP_EMA_DIR
from scripts.config import SAVED_EMBEDDINGS_SOCIAL_DIR, PROCESSED_SOCIAL_MESSAGING_DIR, PROCESSED_SOCIAL_EMA_DIR

In [22]:
# Configure the SNAP processor for the 'Romantic Partner' relationship.
# NOTE(review): `processor` is also assigned by the SocialProcessor cell, so
# the object it refers to depends on which cell ran last.
processor = SnapProcessor(
    model_name='distilbert-base-uncased', 
    relationship='Romantic Partner', 
    save_dir=SAVED_EMBEDDINGS_SNAP_DIR
)

# Build the SNAP modeling data with embeddings turned off and sentiment
# analysis turned on. With return_train_test=True the call's result is
# unpacked into two frames (train, test).
# NOTE(review): how the split is made is internal to SnapProcessor — confirm
# it is seeded if reproducibility matters.
snap_train_df, snap_test_df = processor.prepare_modeling_df(
    messaging_dir=PROCESSED_SNAP_MESSAGING_DIR, 
    ema_dir=PROCESSED_SNAP_EMA_DIR, 
    embeddings=False, 
    sentiment_analysis=True,
    return_train_test=True
)

Index(['Participant ID', 'Notification Time', 'Message_Action', 'Text',
       'Relationship', 'Type', 'Word_Count', 'Date', 'Time'],
      dtype='object')


In [9]:
# Configure the SOCIAL processor with the same model name and relationship
# used for SNAP, saving to the SOCIAL embeddings directory.
# NOTE(review): this rebinds `processor`, replacing any SnapProcessor instance.
processor = SocialProcessor(
    model_name='distilbert-base-uncased', 
    relationship='Romantic Partner', 
    save_dir=SAVED_EMBEDDINGS_SOCIAL_DIR
)

# Build the SOCIAL modeling data with both embeddings and sentiment analysis
# enabled. return_train_test is not passed, so the result is kept as one object.
# The recorded run shows a progress bar of roughly three minutes for this call.
social_modeling_df = processor.prepare_modeling_df(
    messaging_dir=PROCESSED_SOCIAL_MESSAGING_DIR, 
    ema_dir=PROCESSED_SOCIAL_EMA_DIR, 
    embeddings=True, 
    sentiment_analysis=True
)

Processing batches: 100%|██████████| 3814/3814 [02:53<00:00, 21.96it/s]


In [10]:
# Preview the first rows of the SOCIAL modeling frame.
social_modeling_df.head()

Unnamed: 0,Participant ID,Date,AvgResponseTime,AvgWordCount,Dim_0,Dim_1,Dim_2,Dim_3,Dim_4,Dim_5,...,Dim_760,Dim_761,Dim_762,Dim_763,Dim_764,Dim_765,Dim_766,Dim_767,NegativeSentimentPercentage,Disagreement
0,103,2020-02-19,914,4.444444,-0.113072,-0.003375,0.053374,-0.244417,-0.142045,-0.10197,...,0.079683,-0.099956,0.14979,-0.087321,-0.076694,-0.088578,0.222138,0.288298,0.222222,0
1,103,2020-02-20,-315,7.176471,-0.101816,0.030201,0.065782,-0.236305,-0.105142,-0.131367,...,0.065369,-0.118624,0.162341,-0.027412,-0.116137,-0.071079,0.275762,0.33343,0.088235,1
2,103,2020-02-21,-11252,12.148148,-0.055996,0.069645,0.026944,-0.225493,-0.137399,-0.124928,...,0.03918,-0.181462,0.130563,-0.018528,-0.101603,-0.094399,0.306199,0.298381,0.407407,0
3,103,2020-02-22,4103,6.944444,-0.10783,-0.009808,0.106004,-0.251404,-0.089122,-0.117409,...,0.060928,-0.130858,0.150742,-0.013144,-0.1178,-0.122147,0.274156,0.297791,0.111111,0
4,103,2020-02-23,-3133,7.130435,-0.043433,-0.001964,0.043961,-0.214035,-0.113714,-0.101207,...,0.077895,-0.120512,0.166863,-0.056596,-0.08362,-0.085471,0.250109,0.287579,0.152174,0


In [11]:
# Class balance of the Disagreement column; normalize=True reports proportions
# instead of raw counts.
social_modeling_df['Disagreement'].value_counts(normalize=True)

Disagreement
0    0.757143
1    0.242857
Name: proportion, dtype: float64

In [24]:
# Preview snap_train_df, the first of the two frames unpacked from the SNAP
# prepare_modeling_df(..., return_train_test=True) call.
snap_train_df.head()

Unnamed: 0,Participant ID,Date,AvgResponseTime,AvgWordCount,NegativeSentimentPercentage,Disagreement
230,1048,2021-05-15,1075,4.92623,0.07377,0
317,1086,2022-06-09,5210,2.787234,0.06383,0
316,1086,2022-06-08,656,3.111111,0.0,0
3,1030,2020-07-15,-476,3.591611,0.075055,0
281,1084,2022-05-04,3764,4.285714,0.047619,0


In [25]:
# Preview snap_test_df, the second of the two frames unpacked from the SNAP
# prepare_modeling_df(..., return_train_test=True) call.
snap_test_df.head()

Unnamed: 0,Participant ID,Date,AvgResponseTime,AvgWordCount,NegativeSentimentPercentage,Disagreement
194,1046,2021-04-26,700,5.910448,0.179104,1
258,1065,2021-09-27,706,3.043478,0.028986,0
137,1042,2021-01-19,-60,6.183468,0.155242,1
358,1136,2023-06-14,18332,5.875,0.1875,1
323,1103,2022-09-30,216,4.976974,0.078947,0


In [16]:
# Preview the first rows of snap_modeling_df.
# NOTE(review): no cell visible in this notebook assigns `snap_modeling_df`;
# presumably it is left over from an earlier kernel state — confirm, since a
# fresh "Restart & Run All" would raise NameError here.
snap_modeling_df.head()

Unnamed: 0,Participant ID,Date,AvgResponseTime,AvgWordCount,Dim_0,Dim_1,Dim_2,Dim_3,Dim_4,Dim_5,...,Dim_760,Dim_761,Dim_762,Dim_763,Dim_764,Dim_765,Dim_766,Dim_767,NegativeSentimentPercentage,Disagreement
0,1021,2020-03-19,-262,4.0,-0.111646,-0.006789,0.030004,-0.212288,-0.120487,-0.047895,...,0.042025,-0.091847,0.128726,-0.024542,-0.056725,-0.101384,0.218742,0.313225,0.066667,1
1,1021,2020-03-21,-24,5.833333,-0.11518,0.033125,0.043796,-0.255073,-0.159728,0.009796,...,0.040946,-0.046417,0.180149,-0.037716,-0.074713,-0.129381,0.28377,0.252939,0.166667,0
2,1024,2020-03-04,83,3.971549,-0.113479,-0.029992,0.059973,-0.201995,-0.1136,-0.100916,...,0.095793,-0.077714,0.159703,-0.057345,-0.069986,-0.076425,0.209191,0.30055,0.048472,0
3,1030,2020-07-15,-476,3.591611,-0.136828,-0.026637,0.054273,-0.211849,-0.105161,-0.084979,...,0.092372,-0.077779,0.14873,-0.066659,-0.087728,-0.075931,0.208917,0.288417,0.075055,0
4,1030,2020-07-23,16,4.82866,-0.101195,-0.003865,0.060046,-0.196323,-0.084259,-0.091046,...,0.089443,-0.0779,0.156399,-0.06022,-0.095593,-0.068311,0.195183,0.277352,0.070613,1


In [10]:
# Class balance of the Disagreement column in snap_modeling_df, as proportions.
# NOTE(review): `snap_modeling_df` is not assigned in any visible cell — confirm
# where it comes from.
snap_modeling_df['Disagreement'].value_counts(normalize=True)

Disagreement
0    0.780127
1    0.219873
Name: proportion, dtype: float64

In [11]:
# Load a previously saved SNAP modeling table from disk, parsing the 'Date'
# column as datetimes. It is compared against snap_modeling_df further down.
# NOTE(review): absolute, machine-specific path. Despite the `_test` suffix
# this looks like a reference copy rather than a test split — confirm.
snap_modeling_df_test = pd.read_csv('/Users/maheshbabu/Documents/IPV Project - SATH Lab/preprocessed_data/SNAP Study/all_participants/snap_modeling_df.csv',
                                    parse_dates=['Date'])

In [12]:
# Preview the first rows of the CSV-loaded reference frame.
snap_modeling_df_test.head()

Unnamed: 0,Participant ID,Date,AvgResponseTime,AvgWordCount,Dim_0,Dim_1,Dim_2,Dim_3,Dim_4,Dim_5,...,Dim_759,Dim_760,Dim_761,Dim_762,Dim_763,Dim_764,Dim_765,Dim_766,Dim_767,Disagreement
0,1021,2020-03-19,-262,4.0,-0.111646,-0.00679,0.030004,-0.212288,-0.120487,-0.047895,...,-0.206069,0.042025,-0.091847,0.128726,-0.024542,-0.056725,-0.101384,0.218742,0.313225,1
1,1021,2020-03-21,-24,5.833333,-0.11518,0.033125,0.043796,-0.255073,-0.159728,0.009796,...,-0.182156,0.040946,-0.046417,0.180149,-0.037716,-0.074713,-0.129381,0.28377,0.252939,0
2,1024,2020-03-04,83,3.971549,-0.113479,-0.029992,0.059973,-0.201995,-0.1136,-0.100916,...,-0.183716,0.095793,-0.077713,0.159703,-0.057345,-0.069986,-0.076425,0.209191,0.30055,0
3,1030,2020-07-15,-476,3.591611,-0.136828,-0.026637,0.054273,-0.211849,-0.105161,-0.084979,...,-0.199268,0.092372,-0.077779,0.14873,-0.066659,-0.087728,-0.075931,0.208917,0.288418,0
4,1030,2020-07-23,16,4.82866,-0.101195,-0.003865,0.060046,-0.196323,-0.084259,-0.091046,...,-0.218308,0.089442,-0.0779,0.156399,-0.06022,-0.095593,-0.068311,0.195183,0.277352,1


In [13]:
# Exact equality check between the in-memory frame and the CSV-loaded one.
# NOTE(review): in the recorded run the dtypes listings show 774 columns for
# snap_modeling_df (it has NegativeSentimentPercentage) versus 773 for
# snap_modeling_df_test. DataFrame.equals needs matching shape, so the result is
# False whatever the values are. To check value-level agreement, restrict both
# frames to their shared columns and compare with a float tolerance.
snap_modeling_df.equals(snap_modeling_df_test)

False

In [14]:
# Column dtypes (and column count) of the CSV-loaded reference frame.
snap_modeling_df_test.dtypes

Participant ID              int64
Date               datetime64[ns]
AvgResponseTime             int64
AvgWordCount              float64
Dim_0                     float64
                        ...      
Dim_764                   float64
Dim_765                   float64
Dim_766                   float64
Dim_767                   float64
Disagreement                int64
Length: 773, dtype: object

In [15]:
# Column dtypes (and column count) of snap_modeling_df, for side-by-side
# comparison with snap_modeling_df_test.dtypes.
snap_modeling_df.dtypes

Participant ID                          int64
Date                           datetime64[ns]
AvgResponseTime                         int64
AvgWordCount                          float64
Dim_0                                 float64
                                    ...      
Dim_765                               float64
Dim_766                               float64
Dim_767                               float64
NegativeSentimentPercentage           float64
Disagreement                            int64
Length: 774, dtype: object