In [1]:
## Validity - ensuring that the measure actually captures what it claims to.
 # Face Validity - does the measure look valid at face level?
   # Is subjective but important
 # Content Validity - does the measure have the appropriate breadth?
   # Also subjective. Might consider other words that should be included or might consider removing some that don't belong.
 # Criterion Validity - does the variable correlate in ways that a good measure should?
   # It presumes that you have picked a good set of outcomes to correlate it with & that those are also measured validly.
   # This will be demonstrated below.
    

# Import/load packages to be used

import pandas as pd
# Use the 'Unnamed: 0' column as the row index rather than as a data column.
data = pd.read_csv(
    "datasets/validity.csv",
    index_col="Unnamed: 0",
)

In [2]:
# Briefly explore data

# Column dtypes first, to confirm how each variable was parsed.
print(data.dtypes)

# Then the first five rows, shown as the cell's last expression.
data.head(5)

sent        int64
WC          int64
rating      int64
purchase    int64
dtype: object


Unnamed: 0,sent,WC,rating,purchase
1,5,3,5,2
2,10,1,6,4
3,8,2,7,3
4,6,4,6,3
5,9,2,7,4


In [3]:
# Assess the correlations to see if the sentiment measure is criterion valid.

# Pairwise correlations between all columns, rounded to 2 decimals.
corr_mat = data.corr()
corr_mat = corr_mat.round(2)
corr_mat

Unnamed: 0,sent,WC,rating,purchase
sent,1.0,0.3,0.69,0.55
WC,0.3,1.0,0.15,0.05
rating,0.69,0.15,1.0,0.32
purchase,0.55,0.05,0.32,1.0


In [4]:
# Above - Sentiment variable correlates at 0.30 w/ word count (WC), 0.69 w/ product rating, & 0.55 w/ purchase likelihood.
 # Those correlations are larger than the ones among the other measures (e.g., rating vs. purchase is only 0.32).
    # In other words, rating does not correlate more strongly w/ the other variables than sentiment does.
    
# Compute the confidence intervals (CIs) of these correlation coefficients.
  # Remember from the association notebook...
  # 1) Transform correlation from initial space which we call r to a transformed space z, w/ a Normal distribution of errors.
  # 2) Compute the CI in the transformed space.
  # 3) Transform back to the original space.


import numpy as np
import scipy.stats as ss
import math

def r_z(r):
    """Fisher z-transform of a correlation coefficient.

    Maps r in the open interval (-1, 1) to
    z = atanh(r) = 0.5 * ln((1 + r) / (1 - r)).

    Args:
        r: correlation coefficient, strictly between -1 and 1.

    Returns:
        The transformed value z as a float.

    Raises:
        ValueError: if r is outside (-1, 1), including exactly -1 or 1.
    """
    # math.atanh is mathematically the same as the explicit log ratio, but it
    # keeps precision for r very close to 0 (where 1 + r rounds to 1.0) and
    # raises the same exception type at both ends of the domain.
    return math.atanh(r)

def z_r(z):
    """Inverse Fisher transform: map z back to a correlation coefficient.

    Computes r = tanh(z) = (exp(2z) - 1) / (exp(2z) + 1).

    Args:
        z: value in Fisher z space (any finite float).

    Returns:
        The correlation r as a float in [-1, 1].
    """
    # math.tanh saturates to +/-1.0 for large |z|, whereas evaluating
    # exp(2 * z) directly raises OverflowError once 2 * z exceeds ~709.
    return math.tanh(z)

def r_conf_int(r, alpha, n):
    """Two-sided confidence interval for a correlation coefficient.

    The interval is built in Fisher z space and mapped back to r space.

    Args:
        r: observed correlation coefficient.
        alpha: significance level; the interval has coverage 1 - alpha.
        n: number of observations the correlation was computed from.

    Returns:
        A (lo, hi) tuple of interval bounds in r space.
    """
    # Centre of the interval in z space
    z_center = r_z(r)
    # Standard error in z space and the two-sided Normal critical value
    std_err = 1.0 / math.sqrt(n - 3)
    critical = ss.norm.ppf(1 - alpha/ 2)
    half_width = critical * std_err

    # Map both endpoints back to r space
    lower = z_r(z_center - half_width)
    upper = z_r(z_center + half_width)
    return (lower, upper)

def print_cis(corr_mat, var1, var2, idx1, idx2, alpha=0.05, n=1000):
    """Print one correlation from corr_mat together with its confidence interval.

    Args:
        corr_mat: correlation matrix indexable as corr_mat[idx1, idx2]
            (e.g. a 2-D numpy array).
        var1: label of the first variable, used only in the printed header.
        var2: label of the second variable, used only in the printed header.
        idx1: row position of the correlation in corr_mat.
        idx2: column position of the correlation in corr_mat.
        alpha: significance level passed to r_conf_int. Defaults to 0.05,
            the value that was previously hard-coded.
        n: sample size passed to r_conf_int. Defaults to 1000, the value that
            was previously hard-coded.
            NOTE(review): presumably the number of rows in the dataset --
            confirm, and pass n=len(data) if it differs.
    """
    print('\nFor ' + var1 + ' vs. ' + var2)
    r = corr_mat[idx1, idx2]
    lo, hi = r_conf_int(r, alpha, n)
    print('Correlation = %4.3f with CI of %4.3f to %4.3f' % (r, lo, hi))

# print_cis indexes the matrix positionally, so convert it to a plain array.
corr_mat = np.array(corr_mat)

# (label of the other variable, row position, column position) for each pair
for other_var, row_idx, col_idx in (('WC', 1, 0), ('rating', 0, 2), ('purchase', 0, 3)):
    print_cis(corr_mat, 'sent', other_var, row_idx, col_idx)



For sent vs. WC
Correlation = 0.300 with CI of 0.243 to 0.355

For sent vs. rating
Correlation = 0.690 with CI of 0.656 to 0.721

For sent vs. purchase
Correlation = 0.550 with CI of 0.505 to 0.592


In [5]:
# The confidence intervals for the correlations w/ sentiment are all narrow and none of them includes zero, so all three correlations appear to be significant.