### Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import r2_score
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

### Read into df

In [2]:
try:
    df = pd.read_csv("cancer_reg-1.csv", encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv("cancer_reg-1.csv", encoding='latin-1')
    except UnicodeDecodeError:
        df = pd.read_csv("cancer_reg-1.csv", encoding='cp1252')

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Number of features: {len(list(df.columns))}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
display(df.head())

Dataset loaded successfully!
Shape: (3047, 34)
Number of features: 34
Columns: ['avgAnnCount', 'avgDeathsPerYear', 'TARGET_deathRate', 'incidenceRate', 'medIncome', 'popEst2015', 'povertyPercent', 'studyPerCap', 'binnedInc', 'MedianAge', 'MedianAgeMale', 'MedianAgeFemale', 'Geography', 'AvgHouseholdSize', 'PercentMarried', 'PctNoHS18_24', 'PctHS18_24', 'PctSomeCol18_24', 'PctBachDeg18_24', 'PctHS25_Over', 'PctBachDeg25_Over', 'PctEmployed16_Over', 'PctUnemployed16_Over', 'PctPrivateCoverage', 'PctPrivateCoverageAlone', 'PctEmpPrivCoverage', 'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack', 'PctAsian', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate']

First 5 rows:


Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,...,PctPrivateCoverageAlone,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


In [3]:
print(df.isnull().sum()[df.isnull().sum() > 0]) if df.isnull().sum().sum() > 0 else print("No missing values found!")

# Min and max
min_list = dict()
max_list = dict()
numeric_columns = df.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
    min_list[column] = df[column].min() 
    max_list[column] = df[column].max() 
print("Min values:", min_list)
print("Max values:", max_list)
print("\nUnique values in binnedInc column:")
unique_values = df["binnedInc"].unique()
print(len(unique_values))



PctSomeCol18_24            2285
PctEmployed16_Over          152
PctPrivateCoverageAlone     609
dtype: int64
Min values: {'avgAnnCount': 6.0, 'avgDeathsPerYear': 3, 'TARGET_deathRate': 59.7, 'incidenceRate': 201.3, 'medIncome': 22640, 'popEst2015': 827, 'povertyPercent': 3.2, 'studyPerCap': 0.0, 'MedianAge': 22.3, 'MedianAgeMale': 22.4, 'MedianAgeFemale': 22.3, 'AvgHouseholdSize': 0.0221, 'PercentMarried': 23.1, 'PctNoHS18_24': 0.0, 'PctHS18_24': 0.0, 'PctSomeCol18_24': 7.1, 'PctBachDeg18_24': 0.0, 'PctHS25_Over': 7.5, 'PctBachDeg25_Over': 2.5, 'PctEmployed16_Over': 17.6, 'PctUnemployed16_Over': 0.4, 'PctPrivateCoverage': 22.3, 'PctPrivateCoverageAlone': 15.7, 'PctEmpPrivCoverage': 13.5, 'PctPublicCoverage': 11.2, 'PctPublicCoverageAlone': 2.6, 'PctWhite': 10.1991551, 'PctBlack': 0.0, 'PctAsian': 0.0, 'PctOtherRace': 0.0, 'PctMarriedHouseholds': 22.99248989, 'BirthRate': 0.0}
Max values: {'avgAnnCount': 38150.0, 'avgDeathsPerYear': 14010, 'TARGET_deathRate': 362.8, 'incidenceRate': 120

In [4]:
def extract_mean_from_bin(bin_string):
    # Remove brackets and parentheses
    cleaned = bin_string.replace('[', '').replace(']', '').replace('(', '').replace(')', '')
    # Split by comma and convert to float
    values = [float(x.strip()) for x in cleaned.split(',')]
    # Return the mean
    return (values[0] + values[1]) / 2

In [5]:
# Fill missing values in PctEmployed16_Over with mean (CORRECTED)
median_employed = df['PctEmployed16_Over'].median()
df['PctEmployed16_Over'] = df['PctEmployed16_Over'].fillna(median_employed)

median_privcover = df['PctPrivateCoverageAlone'].median()
df['PctPrivateCoverageAlone'] = df['PctPrivateCoverageAlone'].fillna(median_privcover)

In [6]:
# Replace binnedInc with meanbin
df['meanbin'] = df['binnedInc'].apply(extract_mean_from_bin)

In [7]:
dropcol = ['binnedInc','PctSomeCol18_24','Geography']
df = df.drop(dropcol,axis=1)

In [8]:
y = df['TARGET_deathRate']               
X = df.drop('TARGET_deathRate', axis=1)  


# Split dataset: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      
    random_state=42,    
    stratify=None       
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
reg=linear_model.LinearRegression()
reg.fit(X_train_scaled,y_train)
y_pred = reg.predict(X_test_scaled)
r2_score(y_test,y_pred)

0.49392868531212286

In [10]:
model_1 = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(1)
])

model_2 = tf.keras.Sequential([
    tf.keras.layers.Dense(30, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

model_3 = tf.keras.Sequential([
    tf.keras.layers.Dense(30, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

model_4 = tf.keras.Sequential([
    tf.keras.layers.Dense(30, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
model_1.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train Model
model_1.fit(X_train_scaled, y_train, epochs=100, batch_size=16, verbose=0)

# Evaluate Model
loss, mae = model_1.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Test Mean Absolute Error: {mae:.2f}")

# Make Predictions
predictions = model_1.predict(X_test_scaled)
r2_score(y_test,predictions)

Test Mean Absolute Error: nan
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


ValueError: Input contains NaN.