In [None]:
%load_ext autoreload
%autoreload 2

In [None]:

from hydra import initialize, initialize_config_module, initialize_config_dir, compose

from src.item_processing import  *
from src.utils.stats_utils import *
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Compose the Hydra configuration `main.yaml` found under `../configuration`.
with initialize(version_base="1.1", config_path="../configuration"):
    config = compose(config_name="main.yaml")

In [None]:
# Build the item-level feature processor from the composed configuration.
# Later cells read its `df_item` frame and call its helper methods.
features_class = ItemFeatureProcessing(config)

In [None]:
# Bind the processor to `self` (the following cells call its methods through
# this name) and keep a direct handle on its item-level DataFrame.
self = features_class
df_item = self.df_item

# Answer Selected ITEM Level Processing

In [None]:
# Earlier approach, kept for reference:
#pivot_table, index_col = self.get_clean_pivot_table('f__answer_selected',remove_low_freq_col=True)
feature_name = 'f__answer_selected'
score_name = self.rename_feature(feature_name)

# Work on an independent copy restricted to the rows where the feature is present.
has_feature = self.df_item[feature_name].notna()
df = self.df_item.loc[has_feature].copy()

# Keep only the variables that pass the frequency filter
# (frequency=100 records, min_unique_values=3 distinct values).
valid_variables = self.filter_variable_name_by_frequency(
    df,
    feature_name,
    frequency=100,
    min_unique_values=3,
)


Identify anomalies in the percentage of selected answers and plot them. ECOD is used because we are interested in detecting an unusually high or unusually low number of item selections.

In [None]:
# Per-variable ECOD anomaly detection on `feature_name`.
#
# For every variable in `valid_variables` an ECOD model is fitted on that
# variable's values. Rows the model labels 0 are treated as "good"; rows whose
# value lies below the smallest good value are flagged in `score_name1`
# (lower anomalies) and rows above the largest good value in `score_name2`
# (upper anomalies). A histogram of the three groups is drawn per variable.
score_name1 = score_name + '_lower'
score_name2 = score_name + '_upper'

# The contamination estimate does not depend on the loop variable, so it is
# computed once. The original cell computed it on every iteration and then
# passed a hard-coded 0.11 to ECOD instead.
# NOTE(review): assumes get_contamination_parameter returns a value ECOD
# accepts as `contamination` - confirm.
contamination = self.get_contamination_parameter(feature_name, method='medfilt', random_state=42)

for var in valid_variables:
    mask = (df['variable_name'] == var)
    values = df.loc[mask, [feature_name]]

    model = ECOD(contamination=contamination)
    model.fit(values)
    # Keep the 0/1 labels in a local array instead of a temporary DataFrame
    # column that has to be dropped again at the end of every iteration.
    labels = model.predict(values)

    # Range spanned by the values the model considers normal.
    inliers = values.loc[labels == 0, feature_name]
    min_good_value = inliers.min()
    max_good_value = inliers.max()

    df.loc[mask, score_name1] = 0
    df.loc[mask, score_name2] = 0

    # Compare on the full column so both operands of `&` share df's index;
    # no implicit alignment between a full and a filtered Series is needed.
    df.loc[mask & (df[feature_name] < min_good_value), score_name1] = 1
    df.loc[mask & (df[feature_name] > max_good_value), score_name2] = 1

    # Shared bin edges so the three histograms are directly comparable.
    bins = np.histogram_bin_edges(values[feature_name], bins=48)
    data_true = inliers
    data_lower = df.loc[mask & (df[score_name1] == 1), feature_name]
    data_upper = df.loc[mask & (df[score_name2] == 1), feature_name]

    plt.hist(data_true, bins=bins, alpha=0.5, color='blue', label='Inlier')
    plt.hist(data_lower, bins=bins, alpha=0.5, color='red', label='Lower anomaly')
    plt.hist(data_upper, bins=bins, alpha=0.5, color='orange', label='Upper anomaly')
    plt.title(var)
    # The labels above are only rendered when a legend is requested.
    plt.legend()
    plt.show()

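Box plots of the selected-answer feature for each variable, drawn in batches of 50 variables and split by the lower-anomaly flag.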

In [None]:
# Draw the box plots 50 variables at a time, one figure per batch.
for batch_start in range(0, len(valid_variables), 50):
    batch_end = batch_start + 50
    variables = valid_variables[batch_start:batch_end]
    in_batch = df['variable_name'].isin(variables)

    plt.figure(figsize=(15, 6))
    sns.boxplot(
        data=df[in_batch],
        x='variable_name',
        y=feature_name,
        hue=score_name1,
    )
    plt.xticks(rotation=90)
    plt.show()

# Answer Selected UNIT Level Processing


In [None]:
# Unit level: average the lower/upper anomaly flags over each interview.
data = (
    df.groupby(['interview__id'])
    .agg({score_name1: 'mean', score_name2: 'mean'})
    .reset_index()
)

# One histogram per score column, titled with the column name.
for score_column in (score_name1, score_name2):
    data[score_column].hist()
    plt.title(score_column)
    plt.show()

In [None]:
# Overall summary: number of units and the mean of each per-unit anomaly share.
mean_value1, mean_value2 = (data[column].mean() for column in (score_name1, score_name2))
total_unit = data['interview__id'].count()
print(f" Total UNITS: {total_unit}, with an average of lower anomalies in selected items {mean_value1} and upper one {mean_value2}")

In [None]:
# Average the lower/upper anomaly flags per interview, keeping the responsible
# alongside so the units can be summarised per responsible below.
data = (
    df.groupby(['interview__id', 'responsible'])
    .agg({score_name1: 'mean', score_name2: 'mean'})
    .reset_index()
)

# Per responsible: count the units and average the per-unit anomaly shares.
resp_means = {}
for resp in data['responsible'].unique():
    mask = (data['responsible'] == resp)
    resp_units = data[mask]

    total_unit = resp_units['interview__id'].count()
    mean_value1 = resp_units[score_name1].mean()
    mean_value2 = resp_units[score_name2].mean()
    resp_means[resp] = [mean_value1, mean_value2]
    print(f"{resp} - Total UNITS: {total_unit}, with an average of lower anomalies in selected items {mean_value1} and upper one {mean_value2}")

# One row per responsible, ordered by sorted key.
resp_df = pd.DataFrame(
    [[name, *resp_means[name]] for name in sorted(resp_means)],
    columns=['responsible', 'mean_value1', 'mean_value2'],
)

# Bar chart of each mean, titled with the score column it summarises.
by_responsible = resp_df.set_index('responsible')
for mean_column, chart_title in (('mean_value1', score_name1), ('mean_value2', score_name2)):
    by_responsible[mean_column].plot(kind='bar')
    plt.title(chart_title)
    plt.show()