In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler, Normalizer, Imputer
import time
from scipy.stats import zscore
from datetime import datetime
from IPython.display import display, HTML

# Import Shudhi Modules

In [2]:
from shudhi_describe import shudhi_describe
from shudhi_transform import shudhi_transform

## Import Datasets: we have a small and a big dataset

In [3]:
# Load the small dataset: a records-oriented JSON file read into a DataFrame.
df_four = pd.read_json('foursquare_test.json', orient='records')

In [4]:
# Show (rows, columns) for the frame loaded from foursquare_test.json.
df_four.shape

(400, 11)

In [5]:
# Load the big dataset from CSV into a DataFrame.
df_complaint = pd.read_csv('Top_5_complaints.csv')

In [6]:
# Show (rows, columns) for the frame loaded from Top_5_complaints.csv.
df_complaint.shape

(376062, 36)

## Run Shudhi Describe: Baseline

In [7]:
# Baseline summary of df_four before any transformation.
# NOTE(review): plot=False presumably skips plotting (only the summary table
# appears in the output) — confirm against shudhi_describe.
shudhi_describe(df_four, plot=False)


                                       [4mSUMMARY STATISTICS[0m


Unnamed: 0,Feature,Feature Type,count,# Unique,# Missing,# Outliers,mean,median,min,max
0,country,String/Object,400,1,0,,,,,
1,id,String/Object,400,400,0,,,,,
2,latitude,Real Value,400,399,0,1.0,40.75,40.74,40.6605,42.3531
3,locality,String/Object,400,5,0,,,,,
4,longitude,Real Value,400,399,0,1.0,-73.97,-73.99,-74.0159,-71.0541
5,name,String/Object,400,386,0,,,,,
6,phone,String/Object,400,199,181,,,,,
7,postal_code,Real Value saved as string,400,53,0,,,,,
8,region,String/Object,400,1,0,,,,,
9,street_address,String/Object,400,310,0,,,,,


--------------------------------------------------------------------------------------------------------------------


In [8]:
# Baseline summary of df_complaint before any transformation.
# NOTE(review): plot=False presumably skips plotting (only the summary table
# appears in the output) — confirm against shudhi_describe.
shudhi_describe(df_complaint, plot=False)


                                       [4mSUMMARY STATISTICS[0m


Unnamed: 0,Feature,Feature Type,count,# Unique,# Missing,# Outliers,mean,median,min,max
0,Unique Key,Integer,376062,376062,0,0.0,33405600.0,33750400.0,19511100.0,38948300.0
1,Created Date,String/Object,376062,334662,0,,,,,
2,Closed Date,String/Object,376062,273775,2239,,,,,
3,Agency,String/Object,376062,7,0,,,,,
4,Agency Name,String/Object,376062,39,0,,,,,
5,Complaint Type,String/Object,376062,5,0,,,,,
6,Descriptor,String/Object,376062,62,0,,,,,
7,Location Type,String/Object,376062,20,31986,,,,,
8,Incident Zip,Real Value,376062,209,2370,0.0,10795.9,11203.0,0.0,11697.0
9,Incident Address,String/Object,376062,151287,34479,,,,,


--------------------------------------------------------------------------------------------------------------------


# Run Shudhi Transform

In [9]:
# The continuous columns of df_four have no missing values, so no missing-value
# strategy is passed here. Latitude and longitude are scaled with the 'std'
# strategy, and locality is one-hot encoded.
df_new = shudhi_transform(
    df_train=df_four,
    cols=['latitude', 'longitude', 'locality'],
    scale_strategy='std',
    one_hot=True,
)



In [10]:
# The warning above is raised because `cols` contains both categorical and continuous features.

In [11]:
# Impute missing values with the mean, then scale the same three columns
# using the 'max_abs' strategy.
df_c_new = shudhi_transform(
    df_train=df_complaint,
    cols=['Latitude', 'Longitude', 'Incident Zip'],
    missing_strategy='mean',
    scale_strategy='max_abs',
)

['Latitude', 'Longitude', 'Incident Zip']


## Check with describe whether this has worked: of course it has!

In [13]:
# Re-run the summary on the transformed Foursquare frame so the scaled
# latitude/longitude statistics can be compared with the baseline.
shudhi_describe(df_new, plot=False)


                                       [4mSUMMARY STATISTICS[0m


Unnamed: 0,Feature,Feature Type,count,# Unique,# Missing,# Outliers,mean,median,min,max
0,country,String/Object,400,1,0,,,,,
1,id,String/Object,400,400,0,,,,,
2,latitude,Real Value,400,399,0,0.0,-0.0,-0.1,-1.07619,18.5324
3,longitude,Real Value,400,399,0,0.0,0.0,-0.09,-0.288801,19.6258
4,name,String/Object,400,386,0,,,,,
5,phone,String/Object,400,199,181,,,,,
6,postal_code,Real Value saved as string,400,53,0,,,,,
7,region,String/Object,400,1,0,,,,,
8,street_address,String/Object,400,310,0,,,,,
9,website,String/Object,400,69,0,,,,,


--------------------------------------------------------------------------------------------------------------------


In [16]:
# Summarise the transformed complaints frame, passing only the three columns
# that were given to shudhi_transform.
# NOTE(review): `cols` appears to restrict the report to the listed features
# (the output has exactly these three rows) — confirm against shudhi_describe.
shudhi_describe(df_c_new, cols= ['Latitude', 'Longitude', 'Incident Zip'], plot=False)


                                       [4mSUMMARY STATISTICS[0m


Unnamed: 0,Feature,Feature Type,count,# Unique,# Missing,# Outliers,mean,median,min,max
0,Latitude,Real Value,376062,164760,0,0,1.0,1.0,0.989887,1.0
1,Longitude,Real Value,376062,164664,0,0,-1.0,-1.0,-1.0,-0.992544
2,Incident Zip,Real Value,376062,210,0,0,0.92,0.96,0.0,1.0


--------------------------------------------------------------------------------------------------------------------
