<a href="https://colab.research.google.com/github/Watirboi/data_science/blob/main/SA_Traffic_Stop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyzing Traffic Data in San Antonio, TX

## Importing the Libraries

In [None]:
import pandas as pd
# Suppress scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

## Importing the Datasets

In [None]:
# Source CSV and the subset of columns this analysis keeps.
filename = 'tx_san_antonio_2020_04_01.csv'
names = [
    'date', 'time', 'location', 'substation',
    'subject_age', 'subject_sex', 'violation', 'subject_race',
]

# Read only the listed columns, then put them in the order given by `names`
# (usecols selects columns but does not dictate their order).
sa_traffic = (
    pd.read_csv(filename, usecols=names, header=0)
    .reindex(columns=names)
)
sa_traffic.head()

In [None]:
# (row count, column count) of the loaded frame
sa_traffic.shape

## Taking Care of Missing Data

In [None]:
# Missing-value count per column, before any imputation.
# A single print() is enough; wrapping it in a second print() would also emit "None".
print(sa_traffic.isnull().sum())

In [None]:
# Column dtypes, used to decide which imputation strategy fits each column
print(sa_traffic.dtypes)

In [None]:
from sklearn.impute import SimpleImputer

# One imputer per distinct fill rule.
# String columns get a constant placeholder; the numeric age column gets the mean.
s_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='No Data')
s_time_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='00:00:00')
s_subject_age_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # numeric only

# Columns are addressed by name rather than by position, so the plan keeps
# targeting the right columns even if the column order of the frame changes.
imputation_plan = [
    (s_imputer, ['date']),                                        # string date
    (s_time_imputer, ['time']),                                   # string time
    (s_imputer, ['location', 'substation']),                      # string columns
    (s_subject_age_imputer, ['subject_age']),                     # numeric column
    (s_imputer, ['subject_sex', 'violation', 'subject_race']),    # string columns
]

# Each step refits its imputer on the listed columns and writes the filled
# values back into the same columns of sa_traffic.
for imputer, columns in imputation_plan:
    sa_traffic.loc[:, columns] = imputer.fit_transform(sa_traffic.loc[:, columns])

In [None]:
# Preview date + time parsed as timestamps (displayed only, not stored).
# Rows whose date was imputed as 'No Data' cannot be parsed; errors='coerce'
# turns them into NaT instead of aborting the whole conversion.
pd.to_datetime(sa_traffic['date'] + ' ' + sa_traffic['time'], errors='coerce')

In [None]:
# Verify no nulls remain: every per-column count should now be 0.
# A single print() is enough; wrapping it in a second print() would also emit "None".
print(sa_traffic.isnull().sum())

## Dataset Evaluation

In [None]:
# First rows of the frame after imputation
sa_traffic.head()

In [None]:
# Column dtypes after imputation
sa_traffic.dtypes

In [None]:
# Take the two characters that sit 8..6 positions from the end of each time
# string and cast them to int (assumes the strings end in "HH:MM:SS", as the
# '00:00:00' fill value suggests).
# Slicing strings by hand like this is tedious compared with a datetime dtype.
sa_traffic['time'].str[-8:-6].astype(int).head()

In [None]:
# Convert the `time` column to datetime format.
# A bare time-of-day string carries no calendar date, so parsing it on its own
# gives every row the same placeholder date and makes date-based operations
# (weekday, day of year, comparison against a date, max - min) meaningless.
# Combining it with the `date` column yields the full stop timestamp.
# errors='coerce' maps unparseable rows (e.g. a 'No Data' date) to NaT.
# The dtype guard keeps the cell safe to re-run once `time` is already converted.
if not pd.api.types.is_datetime64_any_dtype(sa_traffic['time']):
    sa_traffic['time'] = pd.to_datetime(
        sa_traffic['date'] + ' ' + sa_traffic['time'], errors='coerce'
    )
sa_traffic.head()

In [None]:
# Confirm that `time` now has a datetime dtype
sa_traffic.dtypes

In [None]:
# Hour of day (0-23) taken from the datetime `time` column
sa_traffic['time'].dt.hour.head()

In [None]:
# Day of week of each timestamp in `time` (Monday=0 ... Sunday=6)
sa_traffic['time'].dt.weekday.head()

In [None]:
# Ordinal day of the year of each timestamp in `time`
sa_traffic['time'].dt.dayofyear.head()

## Timestamps

In [None]:
# Reference timestamp used to filter rows in the next cell
ts = pd.Timestamp('1/1/2012')

In [None]:
# Rows whose `time` value is on or after the reference timestamp `ts`
on_or_after_ts = sa_traffic['time'] >= ts
sa_traffic[on_or_after_ts].head()

In [None]:
# Latest value in the `time` column
sa_traffic['time'].max()

In [None]:
# Span between the earliest and latest value in `time`
sa_traffic['time'].max() - sa_traffic['time'].min()

In [None]:
# Difference between the latest and earliest value in `time`
latest, earliest = sa_traffic['time'].max(), sa_traffic['time'].min()
latest - earliest

## Plotting

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Quick look at the frame that the charts below are built from
sa_traffic.head()

In [None]:
# Number of recorded stops per substation (pandas Series)
subStation_vals = sa_traffic['substation'].value_counts()

# Bar chart of each substation's share of all rows in the frame
substation_share = subStation_vals / len(sa_traffic)
ax = substation_share.plot.bar()
ax.set_title("Which substation recorded the most traffic stops");

In [None]:
# Number of recorded stops per location (pandas Series)
location_vals = sa_traffic['location'].value_counts()

# Bar chart of the ten most frequent locations, as a share of all rows
top_location_share = location_vals.head(10) / len(sa_traffic)
ax = top_location_share.plot.bar()
ax.set_title("Which location recorded the most traffic stops");

In [None]:
# Histogram of subject ages (counts per bin, default binning)
ax = sa_traffic['subject_age'].hist()
ax.set_title("Which age was recorded the most during traffic stops");

In [None]:
# Most frequent age value(s). NOTE(review): missing ages were filled with the
# column mean earlier, so that fill value can show up here if many ages were missing.
sa_traffic['subject_age'].mode()

In [None]:
# Median subject age (computed on the mean-imputed column)
sa_traffic['subject_age'].median()

In [None]:
# Mean subject age (computed on the mean-imputed column)
sa_traffic['subject_age'].mean()

In [None]:
# Number of recorded stops per subject race (pandas Series)
race_vals = sa_traffic['subject_race'].value_counts()

# Bar chart of the (up to) ten most frequent races, as a share of all rows
top_race_share = race_vals.head(10) / len(sa_traffic)
ax = top_race_share.plot.bar()
ax.set_title("Which race was recorded the most during traffic stops");

In [None]:
# Per-column non-null counts for rows where race is "hispanic" and sex is "male"
is_hispanic = sa_traffic["subject_race"] == "hispanic"
is_male = sa_traffic["subject_sex"] == "male"
sa_traffic[is_hispanic & is_male].count()

In [None]:
# Number of recorded stops per subject sex (pandas Series)
gender_vals = sa_traffic['subject_sex'].value_counts()

# Bar chart of each value's share of all rows (at most the first ten values)
gender_share = gender_vals.head(10) / len(sa_traffic)
ax = gender_share.plot.bar()
ax.set_title("Which sex was recorded the most during traffic stops");

In [None]:
# Number of recorded stops per subject sex (pandas Series)
gender_vals = sa_traffic['subject_sex'].value_counts()

# Proportion of each value among rows that actually have a subject_sex.
# The null count must be subtracted from the denominator (hence the explicit
# intermediate); without grouping, division binds tighter than subtraction
# and the null count would be subtracted from the proportions themselves.
rows_with_sex = sa_traffic.shape[0] - sa_traffic['subject_sex'].isnull().sum()
gender_vals / rows_with_sex

In [None]:
# Number of recorded stops per violation (pandas Series)
violation_vals = sa_traffic['violation'].value_counts()

# Bar chart of the ten most frequent violations, as a share of all rows
top_violation_share = violation_vals.head(10) / len(sa_traffic)
ax = top_violation_share.plot.bar()
ax.set_title("Which violation was recorded the most during traffic stops");

In [None]:
# Summary statistics for the continuous variables.
# By default describe() leaves out object (string) columns of a mixed-dtype frame.
sa_traffic.describe()

In [None]:
# Histogram grid for the frame's non-string columns; the trailing ';' hides the repr
sa_traffic.hist();

In [None]:
# Correlation heatmap of the numeric columns.
# Numeric dtypes are selected explicitly: newer pandas releases no longer drop
# string columns silently inside DataFrame.corr() and raise on them instead.
numeric_corr = sa_traffic.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, fmt='.2f');

In [None]:
# Column labels of the working frame
sa_traffic.columns

In [None]:
# Last rows of the frame
sa_traffic.tail()

In [None]:
#labelencoder_X = LabelEncoder()
#labelencoder_X.fit_transform(sa_traffic['subject_sex'])


In [None]:
# Last rows of the frame (unchanged here, since the label-encoding cell is commented out)
sa_traffic.tail()

In [None]:
# Number of rows per subject_race value, most frequent first.
# Alternative giving the same counts, ordered by group key instead:
#sa_traffic.groupby("subject_race").size()
sa_traffic["subject_race"].value_counts()

In [None]:
# Smallest value in `date`. NOTE(review): `date` appears to still hold strings
# at this point, so this is a lexicographic minimum, not a datetime one.
sa_traffic['date'].min()