## Wrangling Data: 
### Manual and Automation Examples
### Stolen Vehicles With Dictionary Comprehension and enumerate()

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
%matplotlib inline

### Reading Data into dataframe
#### Displaying the first five rows of the CSV to provide a visual



In [2]:
# Load the vehicle records; the relative path is resolved against the
# notebook's working directory.
vehicles = pd.read_csv('vehicles.csv')

# Preview the first five rows to see the columns and their raw values.
vehicles.head(5)

Unnamed: 0,make,model,year,color,curr_mileage,purchase_price,curr_value,stolen
0,Volkswagen,GTI,2015,black,152000,32000,9000,0
1,Ford,Focus,2008,blue,225000,14500,3500,0
2,Dodge,Durango,1998,blue,235500,19000,2000,0
3,BMW,325i,2004,silver,104200,32000,8200,0
4,Ford,Mustang,1974,yellow,325000,4200,7800,1


In [3]:
# List every distinct value in the 'make' column, in order of first appearance.
vehicles['make'].unique()

array(['Volkswagen', 'Ford', 'Dodge', 'BMW', 'Honda', 'Toyota', 'Jeep',
       'Nissan', 'Chevrolet', 'Subaru', 'Lexus'], dtype=object)

### Here, I manually created a dictionary that associates each vehicle make with an integer code, to be used as a reference for turning categorical data into integer data. Categorical data needs to be encoded as discrete numeric values in order to perform proper analysis with the classifier.



In [4]:
 map_make = {'Volkswagen':0, 'Ford':1, 'Dodge':2, 'BMW':3, 'Honda':4, 'Toyota':5, 'Jeep':6,
       'Nissan':7, 'Chevrolet':8, 'Subaru':9, 'Lexus':10}
vehicles.replace({'make': map_make}, inplace=True)
vehicles.head()

Unnamed: 0,make,model,year,color,curr_mileage,purchase_price,curr_value,stolen
0,0,GTI,2015,black,152000,32000,9000,0
1,1,Focus,2008,blue,225000,14500,3500,0
2,2,Durango,1998,blue,235500,19000,2000,0
3,3,325i,2004,silver,104200,32000,8200,0
4,1,Mustang,1974,yellow,325000,4200,7800,1


### Using dictionary comprehension and enumerate()


I used a dictionary comprehension to automate the whole process.

The values are reduced to a unique list first, so each tuple produced by enumerate() pairs a value with its index rather than a count.

In [5]:
# Distinct model names as a sorted NumPy array (np.unique sorts its result).
models = np.unique(vehicles['model'])

# Pair each model name with its position in that sorted array.
map_model = dict(zip(models, range(len(models))))

# Swap every model name in the 'model' column for its integer code.
vehicles = vehicles.replace({'model': map_model})
vehicles.head()

Unnamed: 0,make,model,year,color,curr_mileage,purchase_price,curr_value,stolen
0,Volkswagen,15,2015,black,152000,32000,9000,Not stolen
1,Ford,13,2008,blue,225000,14500,3500,Not stolen
2,Dodge,11,1998,blue,235500,19000,2000,Not stolen
3,BMW,0,2004,silver,104200,32000,8200,Not stolen
4,Honda,2,1995,silver,185800,8500,2000,Not stolen


### Repeated this process for other columns.

In [6]:
def build_label_codes(column):
    """Build an integer encoding for the distinct values of one column.

    Parameters
    ----------
    column : array-like
        Values of a single column (e.g. ``vehicles['make']``).

    Returns
    -------
    labels : numpy.ndarray
        The distinct values, sorted (as returned by ``np.unique``).
    mapping : dict
        Maps each distinct value to its index in ``labels``.
    """
    labels = np.unique(column)
    mapping = {label: code for code, label in enumerate(labels)}
    return labels, mapping


# One (labels, mapping) pair per categorical column. The names are kept
# because later cells index `stolens` to turn a predicted code back into
# its label.
makes, map_make = build_label_codes(vehicles['make'])
colors, map_color = build_label_codes(vehicles['color'])
stolens, map_stolens = build_label_codes(vehicles['stolen'])

# Nested-dict replace(): each outer key names the column whose values are
# looked up in the corresponding inner mapping.
vehicles = vehicles.replace({
    'make': map_make,
    'color': map_color,
    'stolen': map_stolens,
})
vehicles.head()

Unnamed: 0,make,model,year,color,curr_mileage,purchase_price,curr_value,stolen
0,10,15,2015,0,152000,32000,9000,0
1,3,13,2008,1,225000,14500,3500,0
2,2,11,1998,1,235500,19000,2000,0
3,0,0,2004,3,104200,32000,8200,0
4,4,2,1995,3,185800,8500,2000,0


### Created X and y datasets, split datasets, and fit the KNN, to obtain score

In [7]:
# Target is the encoded 'stolen' column; every remaining column is a feature.
y = vehicles['stolen']
X = vehicles.drop(['stolen'], axis=1)

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split - and therefore the printed score - repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 3-nearest-neighbours classifier trained on the training portion only.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Built-in round() works for both NumPy scalars and plain Python floats;
# calling .round() on the result fails if score() returns a plain float
# (NOTE(review): newer scikit-learn releases appear to do so - confirm).
print("Score: ", round(knn.score(X_test, y_test), 3))

Score:  0.714


### Created sample data to make predictions with the KNN model



    # make | model | year | color | mileage | purchase_price | current_value
    [3,3,2016,0,185000,22450,8250]
    [0,0,2022,0,6000,74000,64800]
    
    Using these predictors to determine if a vehicle will be stolen. 
    Stolen or Not Stolen = Target Data

In [8]:
# re-fit model with values only to avoid warning
# Fit again on the bare NumPy arrays instead of the pandas objects so that
# predicting on a plain Python list does not raise a warning (presumably the
# feature-name mismatch warning - the model was first fit on a DataFrame).
knn.fit(X_train.values, y_train.values)

# One row: make, model, year, color, mileage, purchase_price, current_value.
first_sample = [[3, 3, 2016, 0, 185000, 22450, 8250]]
prediction = knn.predict(first_sample)

# `stolens` holds the labels in code order, so indexing it with the predicted
# code recovers the label.
stolens[prediction][0]

'Not stolen'

In [9]:
# One row: make, model, year, color, mileage, purchase_price, current_value.
second_sample = [[0, 0, 2022, 0, 6000, 74000, 64800]]
prediction = knn.predict(second_sample)

# Translate the predicted integer code back into its label.
stolens[prediction][0]

'Stolen'