In [1]:
#education level, age, household income, race, (metropolitan or non-metropolitan)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate 
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
import warnings

warnings.filterwarnings("ignore")

In [3]:
def computeSum(df):
    # Choosing questions that have the keyword 'privacy' in them and have binary responses
    privacy_colns = ["ptdtrace", "hefaminc","peeduca", "gtmetsta","prtage", "hepspre1", "hepspre2", "hepspre3", "hepspre4", "hepspre5", "hepscon1", "hepscon2", "hepscon3", "hepscon4", "hepscon5", "hepscon6", "hepscon8", "henohm7"]#, "heprinoh"]

    # Original: -1 = blank, 1 = yes, 2 = no
    # Adjusted: -1 = no, 0 = blank, 1 = yes
    df = df[privacy_colns]
    dict_values = {-1:0, 2:0}
    df = df.replace(dict_values)
    # Var set is the base questions for all subquestions
    var_set = set()

    # Gets the base question (e.g. hepspre is a base question of hepspre1) string by removing digits of unique column names and append them to set
    for col in privacy_colns:
        non_digit_q = ''.join(c for c in col if not c.isdigit())
        var_set.add(non_digit_q)

    # Calculates the sums for each row based on the base question and adds the result as a column to the dataframe
    for var in var_set:
        cols_to_sum = [col for col in df.columns if var in col]
        df[f'{var}_sum'] = df.loc[:, cols_to_sum].sum(axis=1)
        
    return df
def computeAvg(df):
    cols_to_avg = [col for col in df.columns if "sum" in col]
    avg_vals = []
    for col in cols_to_avg:
        avg = sum(df[col]) / len(df[col])
        avg_vals.append(avg)
#         display(col + " average: " + str(avg))
    return avg_vals[:3]

In [4]:
df_21y = pd.read_csv("nov21-cps.csv")
df_21y = computeSum(df_21y)

df_19y = pd.read_csv("nov19-cps.csv")
df_19y = computeSum(df_19y)

df_17y = pd.read_csv("nov17-cps.csv")
df_17y = computeSum(df_17y)

In [5]:
df_21y

Unnamed: 0,ptdtrace,hefaminc,peeduca,gtmetsta,prtage,hepspre1,hepspre2,hepspre3,hepspre4,hepspre5,...,hepscon8,henohm7,hepspre_sum,hefaminc_sum,peeduca_sum,hepscon_sum,henohm_sum,gtmetsta_sum,prtage_sum,ptdtrace_sum
0,1,10,37,1,24,0,0,0,0,0,...,0,0,0,10,37,0,0,1,24,1
1,1,10,40,1,28,0,0,0,0,0,...,0,0,0,10,40,0,0,1,28,1
2,1,10,0,1,7,0,0,0,0,0,...,0,0,0,10,0,0,0,1,7,1
3,1,10,0,1,3,0,0,0,0,0,...,0,0,0,10,0,0,0,1,3,1
4,1,10,0,1,1,0,0,0,0,0,...,0,0,0,10,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127370,1,6,39,0,65,1,0,0,0,0,...,0,0,1,6,39,1,0,0,65,1
127371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
127372,1,12,37,0,61,0,0,0,0,0,...,0,0,0,12,37,0,0,0,61,1
127373,1,12,39,0,58,0,0,0,0,0,...,0,0,0,12,39,0,0,0,58,1


In [6]:
df_21_x = df_21y[["ptdtrace_sum", "hefaminc_sum", "peeduca_sum", "gtmetsta_sum", "prtage_sum"]]

def logistic(df):
    X = df[["ptdtrace_sum", "hefaminc_sum", "peeduca_sum", "gtmetsta_sum", "prtage_sum"]]
    y = df[['henohm_sum', 'hefaminc_sum', 'hepspre_sum']]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    
    clf_1 = LogisticRegression(random_state=0).fit(X_train, y_train['henohm_sum'])
    clf_2 = LogisticRegression(random_state=0).fit(X_train, y_train['hefaminc_sum'])
    clf_3 = LogisticRegression(random_state=0).fit(X_train, y_train['hepspre_sum'])
    
    clf_list = [clf_1, clf_2, clf_3]
    
    y_pred = [clf_1.predict(X_test), clf_2.predict(X_test), clf_3.predict(X_test)]
    
    df_index = y.columns
    df_index = df_index.tolist()
    
    for col in range(len(y.columns)):
        print("Accuracy_score q" + str((col + 1)) + ": ", accuracy_score(y_test[df_index[col]], y_pred[col]))
        print("Precision_score q" + str((col + 1)) + ": ", precision_score(y_test[df_index[col]], y_pred[col], average="macro"))
        print("Recall_score q" + str((col + 1)) + ": ", recall_score(y_test[df_index[col]], y_pred[col], average="macro"))
        print("Coefficients q" + str((col + 1)) + ": ", np.mean(np.array(clf_list[col].coef_), axis=0))
        print()


print("NTIA 2021 Internet Use Survey")
print("-----------------------------")
logistic(df_21y)
print()

print("NTIA 2019 Internet Use Survey")
print("-----------------------------")
logistic(df_19y)
print()

print("NITA 2017 Internet Use Survey")
print("-----------------------------")
logistic(df_17y)
print()

NTIA 2021 Internet Use Survey
-----------------------------
Accuracy_score q1:  0.997958799145836
Precision_score q1:  0.498979399572918
Recall_score q1:  0.5
Coefficients q1:  [ 0.00910951 -0.01998913  0.00928748 -0.2673722   0.03220045]

Accuracy_score q2:  0.4220889335510614
Precision_score q2:  0.33192967761815917
Recall_score q2:  0.3128606307180487
Coefficients q2:  [-3.29597460e-16 -2.02615702e-15 -2.23952801e-15  3.67761377e-16
 -3.06070273e-15]

Accuracy_score q3:  0.7863961813842482
Precision_score q3:  0.14403153834060925
Recall_score q3:  0.16680508795064697
Coefficients q3:  [-2.54426110e-17 -2.95249936e-15 -4.32755683e-15  8.00285764e-15
 -5.33832238e-15]


NTIA 2019 Internet Use Survey
-----------------------------
Accuracy_score q1:  0.996485466539913
Precision_score q1:  0.4982427332699565
Recall_score q1:  0.5
Coefficients q1:  [ 0.04717326 -0.05397662  0.03708343 -0.11399839  0.01173746]

Accuracy_score q2:  0.3906605594445885
Precision_score q2:  0.33464740238065505