In [11]:
import pandas as pd
import os

# Read the raw survey export (one row per respondent).
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/AI%20Fundamentals%20(MMAI%205000S)/AI-Human%20Pew%20Data.csv")

# Survey questions (SMALG*) that are combined into the Trust Score below.
smalg_columns = ['SMALG2_W99', 'SMALG4_a_W99', 'SMALG4_b_W99', 'SMALG4_c_W99',
                 'SMALG4_d_W99', 'SMALG7_W99', 'SMALG11_W99', 'SMALG12_W99']

# Demographic (F_*) columns kept as candidate predictors.
demographic_columns = ['F_INTFREQ', 'F_RELCOM3CAT', 'F_METRO', 'F_CREGION', 'F_CDIVISION', 'F_AGECAT',
                       'F_GENDER', 'F_EDUCCAT', 'F_EDUCCAT2', 'F_HISP',
                       'F_YEARSINUS', 'F_RACECMB', 'F_RACETHNMOD', 'F_CITIZEN', 'F_BIRTHPLACE',
                       'F_MARITAL', 'F_RELIG', 'F_BORN', 'F_RELIGCAT1', 'F_ATTEND', 'F_RELIMP',
                       'F_PRAY', 'F_PARTY_FINAL', 'F_PARTYSUM_FINAL',
                       'F_PARTYSUMIDEO_FINAL', 'F_INC_SDT1', 'F_REG', 'F_IDEO', 'F_VOLSUM', 'F_INC_TIER2']

# Keep only the columns of interest and prefix every F_* column with "DEM_".
selected_columns = smalg_columns + demographic_columns
rename_dict = {col: f"DEM_{col}" for col in selected_columns if col.startswith('F_')}
df_selected = df[selected_columns].rename(columns=rename_dict)

# Keep only respondents with a usable answer to every SMALG question:
# first drop nulls, then drop rows where any SMALG answer is "Refused".
df_selected = df_selected.dropna(subset=smalg_columns)
answered_all = (df_selected[smalg_columns] != 'Refused').all(axis=1)
# .copy() makes df_selected an independent frame, so the column assignments
# below never write into a slice of another frame (SettingWithCopyWarning).
df_selected = df_selected[answered_all].copy()

# Replace nulls and "Refused"/"DK/Ref" in selected DEM columns with a fixed fallback category.
df_selected['DEM_F_INTFREQ'] = df_selected['DEM_F_INTFREQ'].replace(['Refused', None, ''], 'Several times a day')
df_selected['DEM_F_RELCOM3CAT'] = df_selected['DEM_F_RELCOM3CAT'].replace('DK/Ref', 'Medium')
df_selected['DEM_F_AGECAT'] = df_selected['DEM_F_AGECAT'].replace('Refused', '30-49')
df_selected['DEM_F_GENDER'] = df_selected['DEM_F_GENDER'].replace('Refused', 'In some other way')

# Collapse DEM_F_PARTY_FINAL to three levels: anything that is not
# "Republican" or "Democrat" (including nulls) becomes "Other".
party = df_selected['DEM_F_PARTY_FINAL']
df_selected['DEM_F_PARTY_FINAL'] = party.where(party.isin(['Republican', 'Democrat']), 'Other')

# Numeric encoding of each SMALG answer on a 0-1 scale.
# SMALG4_a/b map "Definitely happening" to 0, while SMALG4_c/d map it to 1.
mapping = {
    'SMALG2_W99': {'Good idea for society': 1, 'Not sure': 0.5, 'Bad idea for society': 0},
    'SMALG4_a_W99': {'Definitely happening': 0, 'Probably happening': 0.33, 'Probably NOT happening': 0.67, 'Definitely NOT happening': 1},
    'SMALG4_b_W99': {'Definitely happening': 0, 'Probably happening': 0.33, 'Probably NOT happening': 0.67, 'Definitely NOT happening': 1},
    'SMALG4_c_W99': {'Definitely happening': 1, 'Probably happening': 0.67, 'Probably NOT happening': 0.33, 'Definitely NOT happening': 0},
    'SMALG4_d_W99': {'Definitely happening': 1, 'Probably happening': 0.67, 'Probably NOT happening': 0.33, 'Definitely NOT happening': 0},
    'SMALG7_W99': {'A great deal of confidence': 1, 'A fair amount of confidence': 0.67, 'Not too much confidence': 0.33, 'No confidence at all': 0},
    'SMALG11_W99': {'Mostly made by computer programs': 1, 'Not sure': 0.5, 'A mix of both people and computer programs': 0.5, 'Mostly made by people': 0},
    'SMALG12_W99': {'A better job than humans': 1, 'About the same job as humans': 0.5, 'Not sure': 0.5, 'A worse job than humans': 0}
}

# Apply the mappings.
# NOTE(review): Series.map yields NaN for any answer text missing from the
# dictionaries above, which would make that row's Trust Score NaN -- confirm
# the dictionaries cover every response present in the data.
for col, map_dict in mapping.items():
    df_selected[col] = df_selected[col].map(map_dict)

# Trust Score = weighted sum of the encoded SMALG answers.
# The weights as given add up to 1.001, so the maximum score is marginally above 1.
weights = {
    'SMALG2_W99': 0.2143,
    'SMALG4_a_W99': 0.0476,
    'SMALG4_b_W99': 0.0476,
    'SMALG4_c_W99': 0.0476,
    'SMALG4_d_W99': 0.0476,
    'SMALG7_W99': 0.2391,
    'SMALG11_W99': 0.1667,
    'SMALG12_W99': 0.1905
}

df_selected['Trust Score'] = sum(df_selected[col] * weight for col, weight in weights.items())

# The raw SMALG answers are no longer needed once the score exists.
df_selected = df_selected.drop(columns=smalg_columns)

# Final dataset: DEM_* predictors plus the 'Trust Score' target.
TrustScoreDataset = df_selected

Testing Column Relevance

In [15]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

def calculate_chi_squared(data, target, column):
    """Chi-squared test of independence between two columns of ``data``.

    Cross-tabulates ``data[column]`` (rows) against ``data[target]``
    (columns) and passes the resulting table to
    ``scipy.stats.chi2_contingency``.

    Args:
        data: DataFrame holding both columns.
        target: Name of the column used for the table's columns.
        column: Name of the column used for the table's rows.

    Returns:
        Tuple ``(chi2, p_value)``: the first two values returned by
        ``chi2_contingency``.
    """
    observed = pd.crosstab(data[column], data[target])
    test_result = chi2_contingency(observed)
    # Only the statistic and p-value are needed; the degrees of freedom
    # and expected frequencies are discarded.
    return test_result[0], test_result[1]

def bin_numeric_column(data, column, n_bins=5):
    """Return ``data`` with an extra quantile-binned version of ``column``.

    The new column is named ``f'{column}_binned'`` and holds the integer
    bin index produced by ``pd.qcut`` (``labels=False``). Duplicate bin
    edges are dropped, so fewer than ``n_bins`` bins may result.

    The input frame is NOT modified: a new DataFrame is returned, so
    callers must use the return value.

    Args:
        data: DataFrame containing ``column``.
        column: Name of the numeric column to bin.
        n_bins: Number of quantile bins requested (default 5).

    Returns:
        A new DataFrame with the additional ``<column>_binned`` column.
    """
    binned = pd.qcut(data[column], q=n_bins, labels=False, duplicates='drop')
    # assign() builds a new frame instead of writing into the caller's
    # object, which may be aliased elsewhere in the notebook.
    return data.assign(**{f'{column}_binned': binned})

# Start from a private copy of the prepared dataset. This cell previously
# relied on `trust_score_data` being defined by a later cell, and any
# binned helper columns added here must not leak into TrustScoreDataset.
trust_score_data = TrustScoreDataset.copy()

results = []

# NOTE(review): 'Trust Score' is cross-tabulated as-is, so every distinct
# score value becomes its own category in the contingency table -- confirm
# that this is the intended test.
for column in trust_score_data.columns:
    if column == 'Trust Score':
        continue

    test_column = column
    if trust_score_data[column].dtype in ['int64', 'float64']:
        # Numeric predictors are quantile-binned first; the test then runs
        # on the derived '<column>_binned' column.
        trust_score_data = bin_numeric_column(trust_score_data, column)
        test_column = f'{column}_binned'

    chi2, p_value = calculate_chi_squared(trust_score_data, 'Trust Score', test_column)
    results.append({'Column': test_column, 'Chi-squared': chi2, 'p-value': p_value})

# Convert results to a DataFrame, largest chi-squared statistic first
results_df = pd.DataFrame(results).sort_values('Chi-squared', ascending=False)

# Format p-value to 3 decimal places (the column becomes strings)
results_df['p-value'] = results_df['p-value'].map('{:.3f}'.format)

# Display results
print(results_df.to_string(index=False))

                  Column  Chi-squared p-value
             DEM_F_RELIG 11668.485931   0.000
          DEM_F_INC_SDT1  8265.676077   0.110
         DEM_F_CDIVISION  7242.214657   0.386
              DEM_F_PRAY  6735.416337   0.000
              DEM_F_IDEO  6027.527380   0.000
           DEM_F_MARITAL  5855.121928   0.000
          DEM_F_EDUCCAT2  5694.021003   0.003
            DEM_F_ATTEND  5666.322554   0.007
DEM_F_PARTYSUMIDEO_FINAL  5370.341251   0.000
           DEM_F_RACECMB  5326.624219   0.000
        DEM_F_BIRTHPLACE  5126.720120   0.000
        DEM_F_RACETHNMOD  5110.747652   0.000
            DEM_F_RELIMP  5067.717981   0.000
         DEM_F_YEARSINUS  5037.104450   0.000
         DEM_F_RELIGCAT1  3936.469359   0.000
           DEM_F_INTFREQ  3768.264001   0.028
               DEM_F_REG  3462.375837   0.000
    DEM_F_PARTYSUM_FINAL  3300.752524   0.000
            DEM_F_AGECAT  3051.289332   0.000
           DEM_F_CITIZEN  2934.110644   0.000
         DEM_F_INC_TIER2  2870.210

Modelling

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error, r2_score
from tabulate import tabulate

# Loading the data
trust_score_data = TrustScoreDataset

# Predictor columns used by the models
selected_columns = [
    'DEM_F_PARTYSUMIDEO_FINAL', 'DEM_F_BIRTHPLACE', 'DEM_F_RELIMP', 'DEM_F_YEARSINUS',
    'DEM_F_GENDER', 'DEM_F_IDEO', 'DEM_F_PARTY_FINAL', 'DEM_F_CITIZEN',
    'DEM_F_PARTYSUM_FINAL', 'DEM_F_REG', 'DEM_F_RACECMB', 'DEM_F_RACETHNMOD',
    'DEM_F_RELIG', 'DEM_F_PRAY', 'DEM_F_MARITAL', 'DEM_F_AGECAT', 'DEM_F_RELIGCAT1'
]

# Separate predictors and target variable
X = trust_score_data[selected_columns]
y = trust_score_data['Trust Score']

# Identify categorical (object dtype) and numerical (everything else) columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numerical data: fill missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode. Categories unseen during fit are
# ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Candidate models. The estimators that use randomness are seeded with the
# same value as the splits below so the reported scores are reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Split data into train+validation (80%) and test (20%) sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split train+validation into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Function to evaluate a model
def evaluate_model(model, X_train, y_train, X_eval, y_eval):
    """Fit ``model`` behind the preprocessing step and score it.

    A pipeline made of the module-level ``preprocessor`` followed by
    ``model`` is fitted on the training data and then used to predict
    the evaluation data.

    Args:
        model: Estimator placed as the final pipeline step.
        X_train, y_train: Data the pipeline is fitted on.
        X_eval, y_eval: Data the fitted pipeline is scored on.

    Returns:
        Tuple ``(rmse, r2)`` computed on the evaluation set.
    """
    fitted = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]).fit(X_train, y_train)

    predictions = fitted.predict(X_eval)

    return (
        root_mean_squared_error(y_eval, predictions),
        r2_score(y_eval, predictions),
    )

# Score every candidate twice: fitted on train and scored on validation,
# then fitted on train+validation and scored on the held-out test set.
results = []
for name, model in models.items():
    val_rmse, val_r2 = evaluate_model(model, X_train, y_train, X_val, y_val)
    test_rmse, test_r2 = evaluate_model(model, X_train_val, y_train_val, X_test, y_test)
    results.append([name, val_rmse, val_r2, test_rmse, test_r2])

# Show one table row per model
headers = ["Model", "Val RMSE", "Val R2", "Test RMSE", "Test R2"]
print("Model Performance Results:")
print(tabulate(results, headers=headers, floatfmt=".4f"))

# The winner is the row with the smallest validation RMSE (second field)
best_model_name, *_ = min(results, key=lambda row: row[1])
print(f'\nBest Model (based on validation RMSE): {best_model_name}')

Model Performance Results:
Model                Val RMSE    Val R2    Test RMSE    Test R2
-----------------  ----------  --------  -----------  ---------
Linear Regression      0.1750    0.2382       0.1701     0.2974
Decision Tree          0.2495   -0.5481       0.2388    -0.3850
Random Forest          0.1879    0.1221       0.1841     0.1768
SVR                    0.1814    0.1820       0.1791     0.2208
Gradient Boosting      0.1734    0.2523       0.1704     0.2947

Best Model (based on validation RMSE): Gradient Boosting


Trying Neural Network

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from tabulate import tabulate
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Loading the data
trust_score_data = TrustScoreDataset

# Predictor columns fed to the network
selected_columns = [
    'DEM_F_PARTYSUMIDEO_FINAL', 'DEM_F_BIRTHPLACE', 'DEM_F_RELIMP', 'DEM_F_YEARSINUS',
    'DEM_F_GENDER', 'DEM_F_IDEO', 'DEM_F_PARTY_FINAL', 'DEM_F_CITIZEN',
    'DEM_F_PARTYSUM_FINAL', 'DEM_F_REG', 'DEM_F_RACECMB', 'DEM_F_RACETHNMOD',
    'DEM_F_RELIG', 'DEM_F_PRAY', 'DEM_F_MARITAL', 'DEM_F_AGECAT', 'DEM_F_RELIGCAT1'
]

# Predictors and target
X = trust_score_data[selected_columns]
y = trust_score_data['Trust Score']

# Object-dtype columns are treated as categorical, all others as numerical
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing: standardise numerical columns, one-hot encode categorical
# ones. This rebinds the module-level `preprocessor` name.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# 60/20/20 train/validation/test split (0.25 x 0.8 = 0.2 for validation)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


def _as_dense(matrix):
    """Return ``matrix.toarray()`` when that method exists, else ``matrix`` unchanged."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Fit the preprocessor on the training split only, then apply it to all
# three splits, converting any sparse output to a dense array.
X_train_processed = _as_dense(preprocessor.fit_transform(X_train))
X_val_processed = _as_dense(preprocessor.transform(X_val))
X_test_processed = _as_dense(preprocessor.transform(X_test))

# Function to create and compile the model
def create_model(input_dim):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(16, relu) -> Dense(1). The model is compiled
    with Adam (learning rate 0.001) and mean-squared-error loss.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the deprecated `input_dim=` keyword on the first Dense layer.
        tf.keras.Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return model

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Create and train the model; the validation split is only monitored during
# training (no callbacks act on it).
input_dim = X_train_processed.shape[1]
nn_model = create_model(input_dim)
nn_model.fit(X_train_processed, y_train, epochs=100, batch_size=32, validation_data=(X_val_processed, y_val), verbose=1)

# Evaluate the model
def evaluate_nn_model(model, X, y):
    """Score a trained model on ``(X, y)``.

    Args:
        model: Object whose ``predict(X)`` result supports ``.flatten()``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values.

    Returns:
        Tuple ``(rmse, r2)``, where RMSE is the square root of the mean
        squared error between ``y`` and the flattened predictions.
    """
    predictions = model.predict(X).flatten()
    mse = mean_squared_error(y, predictions)
    return np.sqrt(mse), r2_score(y, predictions)

val_rmse, val_r2 = evaluate_nn_model(nn_model, X_val_processed, y_val)
test_rmse, test_r2 = evaluate_nn_model(nn_model, X_test_processed, y_test)

# Single-row results table for the neural network. This rebinds `results`,
# replacing any list of the same name left by an earlier cell.
results = [['Neural Network', val_rmse, val_r2, test_rmse, test_r2]]

# Display results
headers = ["Model", "Val RMSE", "Val R2", "Test RMSE", "Test R2"]
print("Neural Network Performance Results:")
print(tabulate(results, headers=headers, floatfmt=".4f"))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78