In [1]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the cleaned nuclear-plant / cancer-rate table.
# GEOID is read as text rather than being parsed as a number.
nuc_cancer_df = pd.read_csv(
    '../cleaned_data/ML_data_nuc_cancer.csv',
    dtype={'GEOID': str},
)
nuc_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County_State,closest_plant,distance,plant_capacity,State,Rate,bladder,...,lung,melanoma,non-hodgkins_lymphoma,oral_cavity,ovary,pancreas,prostate,stomach,thyroid,uterus
0,32.53492,-86.642749,1001,"Autauga County, Alabama",Joseph M. Farley Nuclear Plant,128.0,1776.4,"Autauga County, Alabama",506.4,15.8,...,58.6,28.2,10.7,0.0,12.9,9.7,158.0,0.0,0.0,25.5
1,30.66097,-87.74984,1003,"Baldwin County, Alabama",Joseph M. Farley Nuclear Plant,161.0,1776.4,"Baldwin County, Alabama",455.7,23.1,...,55.9,28.1,14.8,8.2,11.8,10.0,91.8,8.1,3.8,17.6
2,31.869603,-85.393197,1005,"Barbour County, Alabama",Joseph M. Farley Nuclear Plant,48.0,1776.4,"Barbour County, Alabama",447.2,13.3,...,40.1,24.9,0.0,0.0,0.0,0.0,162.6,0.0,0.0,22.7
3,32.998644,-87.126439,1007,"Bibb County, Alabama",Browns Ferry Nuclear Plant,118.0,3567.5,"Bibb County, Alabama",466.1,19.8,...,61.9,0.0,0.0,0.0,0.0,0.0,112.1,0.0,0.0,25.3
4,33.980867,-86.567371,1009,"Blount County, Alabama",Browns Ferry Nuclear Plant,59.0,3567.5,"Blount County, Alabama",438.7,17.4,...,49.8,16.5,16.1,0.0,14.1,11.5,96.9,0.0,9.4,23.5


In [3]:
# Show every column label in the loaded frame, to pick features and target from.
nuc_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County_State', 'closest_plant',
       'distance', 'plant_capacity', 'State', 'Rate', 'bladder', 'brain',
       'breast', 'breast_insitu', 'cervix', 'colon', 'esophagus',
       'kidney_and_renal', 'leukemia', 'liver', 'lung', 'melanoma',
       'non-hodgkins_lymphoma', 'oral_cavity', 'ovary', 'pancreas', 'prostate',
       'stomach', 'thyroid', 'uterus'],
      dtype='object')

In [4]:
# Define the features set.
# The predictors are selected by name (allow-list) instead of dropping every
# other column: with a drop-list, any column later added to the CSV --
# including another cancer-rate column -- would silently become a feature.
FEATURE_COLUMNS = ['distance', 'plant_capacity']

# .copy() keeps X independent of nuc_cancer_df, as the original copy-then-drop did.
X = nuc_cancer_df[FEATURE_COLUMNS].copy()
X.head()

Unnamed: 0,distance,plant_capacity
0,128.0,1776.4
1,161.0,1776.4
2,48.0,1776.4
3,118.0,3567.5
4,59.0,3567.5


In [5]:
# Target: the "uterus" column of the loaded frame.
# NOTE(review): the displayed values include rows that are exactly 0.0 --
# confirm whether these are genuine zero rates or placeholders for
# suppressed/missing data before trusting the fitted scores.
y = nuc_cancer_df.loc[:, "uterus"]
y

0       25.5
1       17.6
2       22.7
3       25.3
4       23.5
        ... 
2875    16.6
2876     0.0
2877     0.0
2878     0.0
2879     0.0
Name: uterus, Length: 2880, dtype: float64

In [6]:
# Partition features and target into train and test subsets.
# No test_size is given, so train_test_split's default proportion applies;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Standardise the features. The scaler is fitted on the training partition
# only, so nothing from the test partition influences the scaling statistics.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the one fitted transform to both partitions (train first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

### Random Forest Model

In [8]:
# Random forest regressor: 1024 trees, seeded so repeated runs build the same forest.
rf_model = RandomForestRegressor(
    n_estimators=1024,
    random_state=78,
)

In [9]:
# Train the forest on the scaled training partition.
# The result is assigned back to rf_model, so the cell displays no output.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Random-forest predictions for the held-out (scaled) test features.
# NOTE(review): `predictions` is rebound by the Lasso cell further down and
# is not read by any visible cell in between.
predictions = rf_model.predict(X=X_test_scaled)

In [11]:
# Report the forest's score (R^2 for scikit-learn regressors) on the training
# partition, then on the test partition, one value per line.
# NOTE(review): the recorded output shows a large train/test gap
# (about 0.79 vs 0.12), which suggests the forest is overfitting.
print(
    rf_model.score(X_train_scaled, y_train),
    rf_model.score(X_test_scaled, y_test),
    sep="\n",
)

0.7932430279484993
0.12241640208144189


### Lasso Model

In [12]:
# Build the Lasso regressor (alpha=0.001) and fit it on the scaled training
# partition in one step; fit() hands back the fitted estimator.
lasso = Lasso(alpha=0.001).fit(X_train_scaled, y_train)

In [13]:
# Lasso predictions for the held-out (scaled) test features.
# NOTE(review): this rebinds `predictions`, replacing the random-forest
# predictions stored under the same name earlier in the notebook.
predictions = lasso.predict(X=X_test_scaled)

In [14]:
# Report the Lasso score (R^2 for scikit-learn regressors) on the training
# partition, then on the test partition, one value per line.
print(
    lasso.score(X_train_scaled, y_train),
    lasso.score(X_test_scaled, y_test),
    sep="\n",
)

0.051214612204472765
0.0582520374783243
