In [15]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the cleaned ML input table from disk. GEOID is forced to text so pandas
# does not infer a numeric dtype for it.
ff_cancer_df = pd.read_csv(
    '../cleaned_data/ML_data_ff_cancer.csv',
    dtype={'GEOID': str},
)
ff_cancer_df.head()

Unnamed: 0,latitude,longitude,GEOID,County,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,...,lung,melanoma,non-hodgkins_lymphoma,oral_cavity,ovary,pancreas,prostate,stomach,thyroid,uterus
0,32.53492,-86.642749,1001,Autauga County,939.4,50.521,3.849,762545.203,28447.358,2844.736,...,58.6,28.2,10.7,0.0,12.9,9.7,158.0,0.0,0.0,25.5
1,30.66097,-87.74984,1003,Baldwin County,50.0,450.864,4.58,167490.328,6318.013,631.801,...,55.9,28.1,14.8,8.2,11.8,10.0,91.8,8.1,3.8,17.6
2,31.869603,-85.393197,1005,Barbour County,120.5,312.818,0.59,0.0,134642.958,24655.8,...,40.1,24.9,0.0,0.0,0.0,0.0,162.6,0.0,0.0,22.7
3,32.998644,-87.126439,1007,Bibb County,13.0,16.113,2.219,0.011,12526.086,1644.049,...,61.9,0.0,0.0,0.0,0.0,0.0,112.1,0.0,0.0,25.3
4,33.980867,-86.567371,1009,Blount County,3.8,2.197,0.009,1006.565,37.975,3.79,...,49.8,16.5,16.1,0.0,14.1,11.5,96.9,0.0,9.4,23.5


In [3]:
# List every column name so the feature/target selection in the following
# cells can be checked against what the table actually contains.
ff_cancer_df.columns

Index(['latitude', 'longitude', 'GEOID', 'County', 'nameplate_capacity_MW1',
       'NOx_tons1', 'SO2_tons1', 'CO2_tons1', 'CH4_lbs1', 'N2O_lbs1',
       'PM2.5_tons1', 'dist_from_county1', 'nameplate_capacity_MW2',
       'NOx_tons2', 'SO2_tons2', 'CO2_tons2', 'CH4_lbs2', 'N2O_lbs2',
       'PM2.5_tons2', 'dist_from_county2', 'nameplate_capacity_MW3',
       'NOx_tons3', 'SO2_tons3', 'CO2_tons3', 'CH4_lbs3', 'N2O_lbs3',
       'PM2.5_tons3', 'dist_from_county3', 'nameplate_capacity_MW4',
       'NOx_tons4', 'SO2_tons4', 'CO2_tons4', 'CH4_lbs4', 'N2O_lbs4',
       'PM2.5_tons4', 'dist_from_county4', 'nameplate_capacity_MW5',
       'NOx_tons5', 'SO2_tons5', 'CO2_tons5', 'CH4_lbs5', 'N2O_lbs5',
       'PM2.5_tons5', 'dist_from_county5', 'fuel_type1_Biomass',
       'fuel_type1_Coal', 'fuel_type1_Gas', 'fuel_type1_Oil',
       'fuel_type1_Other Fossil', 'fuel_type2_Biomass', 'fuel_type2_Coal',
       'fuel_type2_Gas', 'fuel_type2_Oil', 'fuel_type2_Other Fossil',
       'fuel_type3_Biomas

In [4]:
# Build the feature matrix by removing everything that is not a predictor:
# the location/identifier fields, 'Rate', and the per-cancer columns (one of
# which, 'uterus', is used as the target in the next cell).
non_feature_columns = [
    'latitude', 'longitude', 'GEOID', 'County', 'State', 'Rate',
    'bladder', 'brain', 'breast', 'breast_insitu', 'cervix', 'colon',
    'esophagus', 'kidney_and_renal', 'leukemia', 'liver', 'lung',
    'melanoma', 'non-hodgkins_lymphoma', 'oral_cavity', 'ovary',
    'pancreas', 'prostate', 'stomach', 'thyroid', 'uterus',
]

# drop() returns a new frame, so ff_cancer_df itself is left untouched.
X = ff_cancer_df.drop(columns=non_feature_columns)
X.head()

Unnamed: 0,nameplate_capacity_MW1,NOx_tons1,SO2_tons1,CO2_tons1,CH4_lbs1,N2O_lbs1,PM2.5_tons1,dist_from_county1,nameplate_capacity_MW2,NOx_tons2,...,fuel_type4_Biomass,fuel_type4_Coal,fuel_type4_Gas,fuel_type4_Oil,fuel_type4_Other Fossil,fuel_type5_Biomass,fuel_type5_Coal,fuel_type5_Gas,fuel_type5_Oil,fuel_type5_Other Fossil
0,939.4,50.521,3.849,762545.203,28447.358,2844.736,36.930022,10.0,927.1,181.543,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50.0,450.864,4.58,167490.328,6318.013,631.801,7.857154,19.0,280.0,41.54,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,120.5,312.818,0.59,0.0,134642.958,24655.8,1.386006,30.0,13.6,18.862,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,13.0,16.113,2.219,0.011,12526.086,1644.049,0.0,31.0,939.4,50.521,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3.8,2.197,0.009,1006.565,37.975,3.79,0.154237,30.0,138.0,136.388,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
# Regression target: the 'uterus' column of the full table, selected for all rows.
y = ff_cancer_df.loc[:, "uterus"]
y

0       25.5
1       17.6
2       22.7
3       25.3
4       23.5
        ... 
2822    16.6
2823     0.0
2824     0.0
2825     0.0
2826     0.0
Name: uterus, Length: 2827, dtype: float64

In [6]:
# Hold out a test split. No test_size is passed, so scikit-learn's default
# (25% of the rows) applies; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The fitted scaler is also exposed under the name X_scaler (same object).
X_scaler = scaler

### Random Forest Model

In [8]:
# Create a random forest regressor with 1024 trees and a fixed seed.
# n_jobs=-1 lets scikit-learn build (and later query) the trees on all
# available CPU cores instead of one; the seed still determines the trees.
rf_model = RandomForestRegressor(n_estimators=1024, random_state=78, n_jobs=-1)

In [9]:
# Train the forest on the standardised training split. fit() trains the
# estimator in place, so no reassignment is needed; the trailing semicolon
# stops the notebook from echoing the estimator repr.
rf_model.fit(X_train_scaled, y_train);

In [10]:
# Predict the target for the held-out rows with the fitted random forest.
predictions = rf_model.predict(X=X_test_scaled)

In [11]:
# score() reports R^2 for a regressor: training split on the first line,
# test split on the second.
print(
    rf_model.score(X_train_scaled, y_train),
    rf_model.score(X_test_scaled, y_test),
    sep="\n",
)

0.8945266897162232
0.24230137940183805


### Lasso Model

In [12]:
# Lasso (L1-regularised linear regression) with alpha=0.02, constructed and
# fitted on the standardised training split in one step.
lasso = Lasso(alpha=0.02).fit(X_train_scaled, y_train)

In [13]:
# Predict the target for the held-out rows with the fitted Lasso model.
# This rebinds `predictions`, replacing the random-forest predictions.
predictions = lasso.predict(X=X_test_scaled)

In [14]:
# score() reports R^2 for a regressor: training split on the first line,
# test split on the second.
print(
    lasso.score(X_train_scaled, y_train),
    lasso.score(X_test_scaled, y_test),
    sep="\n",
)

0.20881397444613636
0.14926757554972092
