In [2]:
from sklearn import tree
import pandas as pd
import os

  ## Importing and cleaning the training dataset

In [14]:
data = pd.read_csv('../../Resources/bio_vars_frame.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,bio_9,...,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19,raster_frame
0,1,96.0,105.0,40.0,5390.0,250.0,-7.0,257.0,35.0,165.0,...,30.0,2184.0,350.0,25.0,61.0,1008.0,144.0,147.0,951.0,0
1,2,94.0,105.0,40.0,5385.0,249.0,-9.0,258.0,34.0,163.0,...,28.0,2196.0,354.0,24.0,62.0,1019.0,142.0,146.0,961.0,0
2,3,96.0,105.0,40.0,5407.0,251.0,-7.0,258.0,35.0,165.0,...,30.0,2162.0,348.0,24.0,62.0,1002.0,140.0,144.0,945.0,0
3,4,97.0,105.0,40.0,5437.0,251.0,-7.0,258.0,35.0,166.0,...,30.0,2144.0,345.0,24.0,62.0,993.0,139.0,143.0,937.0,0
4,5,93.0,105.0,40.0,5438.0,248.0,-11.0,259.0,32.0,163.0,...,27.0,2188.0,357.0,24.0,63.0,1023.0,138.0,143.0,966.0,0


In [None]:
data.drop(['Unnamed: 0'], axis = 1, inplace = True)
data.dropna(axis = 0, how = 'any', inplace = True)
data.rename(columns={'bio_1': 'Annual Mean Temperature', 
                     'bio_2': 'Mean Diurnal Range (Mean of monthly (max temp - min temp))',
                     'bio_3': 'Isothermality',
                     'bio_4': 'Temperature Seasonality',
                     'bio_5': 'Max Temperature of Warmest Month',
                     'bio_6': 'Min Temperature of Coldest Month',
                     'bio_7': 'Temperature Annual Range',
                     'bio_8': 'Mean Temperature of Wettest Quarter',
                     'bio_9': 'Mean Temperature of Driest Quarter',
                     'bio_10': 'Mean Temperature of Warmest Quarter',
                     'bio_11': 'Mean Temperature of Coldest Quarter',
                     'bio_12': 'Annual Precipitation',
                     'bio_13': 'Precipitation of Wettest Month',
                     'bio_14': 'Precipitation of Driest Month',
                     'bio_15': 'Precipitation Seasonality (Coefficient of Variation)',
                     'bio_16': 'Precipitation of Wettest Quarter',
                     'bio_17': 'Precipitation of Driest Quarter',
                     'bio_18': 'Precipitation of Warmest Quarter',
                     'bio_19': 'Precipitation of Coldest Quarter',
                    }, inplace=True)

data = data[['raster_frame','Precipitation of Driest Month', 'Temperature Seasonality', 'Precipitation of Driest Quarter',
             'Annual Precipitation', 'Precipitation of Warmest Quarter']]

data.head()

## Preprocessing the data for model training

In [21]:
target = data['raster_frame']
target_names = ["negative", "positive"]

In [22]:
X = data.drop(["raster_frame"], axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,Precipitation of Driest Month,Temperature Seasonality,Precipitation of Driest Quarter,Annual Precipitation,Precipitation of Warmest Quarter
0,25.0,5390.0,144.0,2184.0,147.0
1,24.0,5385.0,142.0,2196.0,146.0
2,24.0,5407.0,140.0,2162.0,144.0
3,24.0,5437.0,139.0,2144.0,143.0
4,24.0,5438.0,138.0,2188.0,143.0


In [18]:
# Splitting up the data for training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_data, target, random_state=42)

In [None]:
# Testing accuracy on training and testing data
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

## Making a prediction on test data set

In [23]:
# Making prediction based on test data
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train, y_train)
prediction = rf.predict_proba(X_test)
print(prediction)

[[1.   0.  ]
 [0.58 0.42]
 [1.   0.  ]
 ...
 [1.   0.  ]
 [1.   0.  ]
 [1.   0.  ]]


In [24]:
# Looking at important of each feature
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.40939007993657767, 'Temperature Seasonality'),
 (0.27623199779648944, 'Annual Precipitation'),
 (0.14160813538841294, 'Precipitation of Warmest Quarter'),
 (0.10417259395132726, 'Precipitation of Driest Quarter'),
 (0.06859719292719256, 'Precipitation of Driest Month')]

## Importing and cleaning new data set

In [36]:
harsh_2050 = pd.read_csv('../../Resources/2070_mild.csv')

In [37]:
harsh_2050.drop(['Unnamed: 0'], axis = 1, inplace = True)
harsh_2050.dropna(axis = 0, how = 'any', inplace = True)
harsh_2050.rename(columns={'bc85bi501': 'Annual Mean Temperature', 
                     'bc85bi5010': 'Mean Diurnal Range (Mean of monthly (max temp - min temp))',
                     'bc85bi5011': 'Isothermality',
                     'bc85bi5012': 'Temperature Seasonality',
                     'bc85bi5013': 'Max Temperature of Warmest Month',
                     'bc85bi5014': 'Min Temperature of Coldest Month',
                     'bc85bi5015': 'Temperature Annual Range',
                     'bc85bi5016': 'Mean Temperature of Wettest Quarter',
                     'bc85bi5017': 'Mean Temperature of Driest Quarter',
                     'bc85bi5018': 'Mean Temperature of Warmest Quarter',
                     'bc85bi5019': 'Mean Temperature of Coldest Quarter',
                     'bc85bi502': 'Annual Precipitation',
                     'bc85bi503': 'Precipitation of Wettest Month',
                     'bc85bi504': 'Precipitation of Driest Month',
                     'bc85bi505': 'Precipitation Seasonality (Coefficient of Variation)',
                     'bc85bi506': 'Precipitation of Wettest Quarter',
                     'bc85bi507': 'Precipitation of Driest Quarter',
                     'bc85bi508': 'Precipitation of Warmest Quarter',
                     'bc85bi509': 'Precipitation of Coldest Quarter',
                    }, inplace=True)

harsh_2050.head()

Unnamed: 0,x,y,Annual Mean Temperature,Mean Diurnal Range (Mean of monthly (max temp - min temp)),Isothermality,Temperature Seasonality,Max Temperature of Warmest Month,Min Temperature of Coldest Month,Temperature Annual Range,Mean Temperature of Wettest Quarter,...,Mean Temperature of Warmest Quarter,Mean Temperature of Coldest Quarter,Annual Precipitation,Precipitation of Wettest Month,Precipitation of Driest Month,Precipitation Seasonality (Coefficient of Variation),Precipitation of Wettest Quarter,Precipitation of Driest Quarter,Precipitation of Warmest Quarter,Precipitation of Coldest Quarter
0,-124.779167,49.3375,109,182,48,1925,361,29,67,916,...,145,839,87,35,5383,259,12,247,56,181
1,-124.770833,49.3375,106,177,47,1944,363,29,67,923,...,148,690,86,35,5286,253,10,243,54,176
2,-124.7625,49.3375,104,176,45,1950,365,30,67,924,...,150,691,86,35,5296,251,8,243,53,175
3,-124.754167,49.3375,111,184,50,1893,354,28,68,902,...,141,825,87,35,5390,262,14,248,58,184
4,-124.745833,49.3375,119,193,54,1849,346,25,69,887,...,132,812,90,35,5524,274,20,254,63,192


In [40]:
x_new = harsh_2050[['Precipitation of Driest Month', 'Temperature Seasonality', 'Precipitation of Driest Quarter',
             'Annual Precipitation', 'Precipitation of Warmest Quarter']]
x_new.head()

Unnamed: 0,Precipitation of Driest Month,Temperature Seasonality,Precipitation of Driest Quarter,Annual Precipitation,Precipitation of Warmest Quarter
0,5383,1925,247,87,56
1,5286,1944,243,86,54
2,5296,1950,243,86,53
3,5390,1893,248,87,58
4,5524,1849,254,90,63


## Making a prediction on our new data

In [41]:
harsh2050_prediction = rf.predict_proba(x_new)

## Creating final dataframe and exporting

In [42]:
harsh2050_final = harsh_2050[['x', 'y']]
harsh2050_final["Prediction"] = harsh2050_prediction[:,1]
harsh2050_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,x,y,Prediction
0,-124.779167,49.3375,0.13
1,-124.770833,49.3375,0.2
2,-124.7625,49.3375,0.2
3,-124.754167,49.3375,0.146667
4,-124.745833,49.3375,0.08


In [43]:
harsh2050_final.to_csv('2050_harsh_final.csv')