In [1]:
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd 
import sklearn

## Importing and Cleaning Training Dataset

In [2]:
# Read CSV 
data = pd.read_csv('../../../Resources/bio_vars_frame.csv')
data = data[['raster_frame', 'Precipitation of Driest Month', 'Temperature Seasonality', 'Precipitation of Driest Quarter',
             'Annual Precipitation', 'Precipitation of Warmest Quarter']]
data.head()

Unnamed: 0,raster_frame,Precipitation of Driest Month,Temperature Seasonality,Precipitation of Driest Quarter,Annual Precipitation,Precipitation of Warmest Quarter
0,0,25.0,5390.0,144.0,2184.0,147.0
1,0,24.0,5385.0,142.0,2196.0,146.0
2,0,24.0,5407.0,140.0,2162.0,144.0
3,0,24.0,5437.0,139.0,2144.0,143.0
4,0,24.0,5438.0,138.0,2188.0,143.0


## Preprocessing the data for model training-Processing

In [3]:
X = data.drop('raster_frame', axis=1)
y = data['raster_frame']
print(X.shape, y.shape)

(1684856, 5) (1684856,)


### Split Data

In [4]:
# import train_test_split and split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Visualize Data

In [5]:
# Scatter matrix to visualize relationships between input variables

# scatter_df = pd.DataFrame(X_train.sample(n=500), columns= X.columns)
# scatter_matrix = pd.plotting.scatter_matrix(scatter_df, c=y_train.sample(n=500), figsize=(50,50), marker='o')

### Instantiate Model

In [6]:
# import model
from sklearn.linear_model import LogisticRegression

# instantiate model
model = LogisticRegression() 

## Making a prediction on test data set

In [7]:
# fit model to the training data
model.fit(X_train, y_train)
# calculate scores of training and testing data
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)
# print scores 
print(f'Training Score: {training_score}')
print(f'Testing Score: {testing_score}')


Training Score: 0.9091182470984662
Testing Score: 0.9090462330311908


In [8]:
coef = model.coef_[0]
print(coef)

[-1.33581961e-01 -1.13168868e-04  1.00395415e-01 -4.58611326e-03
 -4.96049281e-02]


In [9]:
training_prob = model.predict_proba(X_train)
testing_prob = model.predict_proba(X_test)


print(f'Training Probability: {training_prob}')
print(f'Testing Probability: {testing_prob}')

Training Probability: [[0.84017739 0.15982261]
 [0.94048908 0.05951092]
 [0.97624937 0.02375063]
 ...
 [0.96859272 0.03140728]
 [0.87407945 0.12592055]
 [0.89490301 0.10509699]]
Testing Probability: [[0.93384329 0.06615671]
 [0.92020449 0.07979551]
 [0.8725379  0.1274621 ]
 ...
 [0.9982763  0.0017237 ]
 [0.80231806 0.19768194]
 [0.8459095  0.1540905 ]]


In [10]:
train_test_results = pd.DataFrame({"Prediction": testing_prob[:,1], "Actual": y_test})
train_test_results.head(10)

Unnamed: 0,Prediction,Actual
764786,0.066157,0
351495,0.079796,1
1210042,0.127462,0
1074720,0.000379,0
1587825,0.038057,0
1439783,0.127314,0
257830,0.043222,0
530628,0.049369,0
1557436,0.082014,0
1288260,0.159332,0


## Importing and cleaning new data set

In [12]:
harsh_2050 = pd.read_csv('../../../Resources/2050_harsh.csv')

In [13]:
harsh_2050.drop(['Unnamed: 0'], axis = 1, inplace = True)
harsh_2050.dropna(axis = 0, how = 'any', inplace = True)
harsh_2050.rename(columns={'bc85bi501': 'Annual Mean Temperature', 
                     'bc85bi502': 'Mean Diurnal Range (Mean of monthly (max temp - min temp))',
                     'bc85bi503': 'Isothermality',
                     'bc85bi504': 'Temperature Seasonality',
                     'bc85bi505': 'Max Temperature of Warmest Month',
                     'bc85bi506': 'Min Temperature of Coldest Month',
                     'bc85bi507': 'Temperature Annual Range',
                     'bc85bi508': 'Mean Temperature of Wettest Quarter',
                     'bc85bi509': 'Mean Temperature of Driest Quarter',
                     'bc85bi5010': 'Mean Temperature of Warmest Quarter',
                     'bc85bi5011': 'Mean Temperature of Coldest Quarter',
                     'bc85bi5012': 'Annual Precipitation',
                     'bc85bi5013': 'Precipitation of Wettest Month',
                     'bc85bi5014': 'Precipitation of Driest Month',
                     'bc85bi5015': 'Precipitation Seasonality (Coefficient of Variation)',
                     'bc85bi5016': 'Precipitation of Wettest Quarter',
                     'bc85bi5017': 'Precipitation of Driest Quarter',
                     'bc85bi5018': 'Precipitation of Warmest Quarter',
                     'bc85bi5019': 'Precipitation of Coldest Quarter',
                    }, inplace=True)


In [14]:
harsh_2050.head()

Unnamed: 0,x,y,Annual Mean Temperature,Mean Temperature of Warmest Quarter,Mean Temperature of Coldest Quarter,Annual Precipitation,Precipitation of Wettest Month,Precipitation of Driest Month,Precipitation Seasonality (Coefficient of Variation),Precipitation of Wettest Quarter,...,Precipitation of Warmest Quarter,Precipitation of Coldest Quarter,Mean Diurnal Range (Mean of monthly (max temp - min temp)),Isothermality,Temperature Seasonality,Max Temperature of Warmest Month,Min Temperature of Coldest Month,Temperature Annual Range,Mean Temperature of Wettest Quarter,Mean Temperature of Driest Quarter
0,-124.779167,49.3375,109,182,48,1925,361,29,67,916,...,145,839,87,35,5383,259,12,247,56,181
1,-124.770833,49.3375,106,177,47,1944,363,29,67,923,...,148,690,86,35,5286,253,10,243,54,176
2,-124.7625,49.3375,104,176,45,1950,365,30,67,924,...,150,691,86,35,5296,251,8,243,53,175
3,-124.754167,49.3375,111,184,50,1893,354,28,68,902,...,141,825,87,35,5390,262,14,248,58,184
4,-124.745833,49.3375,119,193,54,1849,346,25,69,887,...,132,812,90,35,5524,274,20,254,63,192


In [15]:
X_new = harsh_2050[['Precipitation of Driest Month', 'Temperature Seasonality', 'Precipitation of Driest Quarter',
             'Annual Precipitation', 'Precipitation of Warmest Quarter']]

X_new.head(5)

Unnamed: 0,Precipitation of Driest Month,Temperature Seasonality,Precipitation of Driest Quarter,Annual Precipitation,Precipitation of Warmest Quarter
0,29,5383,122,1925,145
1,29,5286,125,1944,148
2,30,5296,128,1950,150
3,28,5390,119,1893,141
4,25,5524,109,1849,132


## Making a prediction on our new data

In [16]:
harsh_2050_prediction = model.predict_proba(X_new)

## Creating final dataframe and exporting

In [17]:
harsh_LogReg_2050 = harsh_2050[['x', 'y']]
harsh_LogReg_2050["Prediction"] = harsh_2050_prediction[:,1]
harsh_LogReg_2050.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,x,y,Prediction
0,-124.779167,49.3375,0.000254
1,-124.770833,49.3375,0.000274
2,-124.7625,49.3375,0.000286
3,-124.754167,49.3375,0.000303
4,-124.745833,49.3375,0.000313


In [18]:
harsh_LogReg_2050.to_csv('harsh_LogReg_2050.csv')