# Classification of rodent data based on Support Vector Machines

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from numpy import linalg
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import os

## Loading the merged data set

The target variable is `num_sightings`, i.e., the number of rodent sightings.

In [None]:
# The CSV is looked up under <parent of the current working directory>/Data.
parent_dir = os.path.abspath('..')
file_path = os.path.join(parent_dir, 'Data', 'rodents_per_year_merged.csv')
data = pd.read_csv(file_path)

# Keep only the rows whose year lies strictly between 2017 and 2020.
in_window = (data['year'] > 2017) & (data['year'] < 2020)
data_20 = data[in_window]

# Columns removed from the feature matrix (identifiers, year, complaint /
# violation counts and the target itself).
col = ['spatial_id', 'year', 'num_dsny_complaints', 'num_violations', 'num_sightings']
X_20 = data_20.drop(columns=col)

# Target variable: number of rodent sightings.
sightings = data_20['num_sightings']
sightings.head()

### Considering Median Household income and Subway stratification

In [None]:
# Columns removed from the feature matrix (identifiers, year, complaint /
# violation counts and the target itself).
col = ['spatial_id', 'year', 'num_dsny_complaints', 'num_violations', 'num_sightings']

# Assessing for stratification in the data by filtering the data prior to analysis:
# keep rows with more than 3 subways and a positive median household income.
# The mask is built once from data_20 so that the features and the target are
# restricted to exactly the same rows, without relying on index alignment
# against an already-filtered frame.
stratum = (data_20['subway_count'] > 3) & (data_20['MedianHouseholdIncome'] > 0)

X_20 = data_20.loc[stratum].drop(columns=col)

# Dropping all the features that are trivially associated with Rodent sightings
# (columns whose names start with 'l_', 't_' or 'd_').
trivial = (
    X_20.columns.str.startswith('l_')
    | X_20.columns.str.startswith('t_')
    | X_20.columns.str.startswith('d_')
)
X_20 = X_20.loc[:, ~trivial]

# Target as a one-column DataFrame; selecting the column directly avoids an
# in-place drop on a filtered slice (which can raise SettingWithCopyWarning).
sightings = data_20.loc[stratum, ['num_sightings']]
X_20.head()

In [None]:
# Hold out 40% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_20,
    sightings,
    test_size=0.4,
    random_state=90,
)

# RBF-kernel support vector classifier with regularisation parameter C = 1000.
clf = svm.SVC(C=1e3, kernel='rbf')

# The target is a one-column DataFrame, so flatten it to 1-D before fitting.
clf.fit(X_train, Y_train.to_numpy().ravel())

In [None]:
# Score the fitted classifier on both partitions first, then report.
training_accuracy = clf.score(X_train, Y_train)  # in-sample (training set)
test_accuracy = clf.score(X_test, Y_test)        # out-of-sample (test set)

print("In-sample accuracy: {:.5f}".format(training_accuracy))
print("Out-of-sample accuracy: {:.5f}".format(test_accuracy))

### Permutation importance for Training set

In [None]:
# Permutation importance on the training partition: each feature is shuffled
# 10 times (n_repeats) with a fixed seed.
result = permutation_importance(
    clf,
    X_train,
    Y_train,
    n_repeats=10,
    random_state=0,
)

# Mean importance per feature over the repeats.
importances = result['importances_mean']

### Permutation importance for Training set (results table), followed by Test set

In [None]:
# Tabulate the most recently computed `importances` (one value per feature
# column of X_20) and print them with one feature per line.
feature_imp = pd.DataFrame(columns=X_20.columns)
feature_imp.loc[0] = importances
print(feature_imp.transpose())

In [None]:
# Permutation importance on the held-out test partition: each feature is
# shuffled 10 times (n_repeats) with a fixed seed.
result = permutation_importance(
    clf,
    X_test,
    Y_test,
    n_repeats=10,
    random_state=0,
)

# Mean importance per feature over the repeats.
importances = result['importances_mean']

# One-row table keyed by feature name, printed with one feature per line.
feature_imp = pd.DataFrame(columns=X_20.columns)
feature_imp.loc[0] = importances
print(feature_imp.transpose())

## Analysis repeated after dropping Median Household income

In [None]:
# Same exclusions as the first analysis, plus MedianHouseholdIncome.
col = ['spatial_id', 'year', 'num_dsny_complaints', 'num_violations', 'num_sightings', 'MedianHouseholdIncome']

# Keep only rows with at least one subway (subway_count > 0). The mask is built
# once from data_20 so the features and the target always cover the same rows.
has_subway = data_20['subway_count'] > 0

X_20 = data_20.loc[has_subway].drop(columns=col)

# Drop the features whose names start with 'l_', 't_' or 'd_'.
trivial = (
    X_20.columns.str.startswith('l_')
    | X_20.columns.str.startswith('t_')
    | X_20.columns.str.startswith('d_')
)
X_20 = X_20.loc[:, ~trivial]

# Target as a one-column DataFrame; selecting the column directly avoids an
# in-place drop on a filtered slice (which can raise SettingWithCopyWarning).
sightings = data_20.loc[has_subway, ['num_sightings']]
X_20.head()

In [None]:
# Hold out 40% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_20,
    sightings,
    test_size=0.4,
    random_state=90,
)

# RBF-kernel support vector classifier with regularisation parameter C = 1000.
clf = svm.SVC(C=1e3, kernel='rbf')

# The target is a one-column DataFrame, so flatten it to 1-D before fitting.
clf.fit(X_train, Y_train.to_numpy().ravel())

In [None]:
# Score the refitted classifier on both partitions first, then report.
training_accuracy = clf.score(X_train, Y_train)  # in-sample (training set)
test_accuracy = clf.score(X_test, Y_test)        # out-of-sample (test set)

print("In-sample accuracy: {:.5f}".format(training_accuracy))
print("Out-of-sample accuracy: {:.5f}".format(test_accuracy))

### Permutation importance for Training set

In [None]:
# Permutation importance on the training partition: each feature is shuffled
# 10 times (n_repeats) with a fixed seed.
result = permutation_importance(
    clf,
    X_train,
    Y_train,
    n_repeats=10,
    random_state=0,
)

# Mean importance per feature over the repeats.
importances = result['importances_mean']

# One-row table keyed by feature name, printed with one feature per line.
feature_imp = pd.DataFrame(columns=X_20.columns)
feature_imp.loc[0] = importances
print(feature_imp.transpose())

### Permutation importance for Test set

In [None]:
# Permutation importance on the held-out test partition: each feature is
# shuffled 10 times (n_repeats) with a fixed seed.
result = permutation_importance(
    clf,
    X_test,
    Y_test,
    n_repeats=10,
    random_state=0,
)

# Mean importance per feature over the repeats.
importances = result['importances_mean']

# One-row table keyed by feature name, printed with one feature per line.
feature_imp = pd.DataFrame(columns=X_20.columns)
feature_imp.loc[0] = importances
print(feature_imp.transpose())