In [2]:
import numpy as np
from pandas import DataFrame,Series
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

### Dataset Input

In [3]:
file=pd.read_csv("Traffic_Violations-API.csv")

In [4]:
pd.set_option("display.max_columns",100)

In [5]:
file.shape

(1581743, 35)

In [6]:
file.head()

Unnamed: 0,Date Of Stop,Time Of Stop,Agency,SubAgency,Description,Location,Latitude,Longitude,Accident,Belts,Personal Injury,Property Damage,Fatal,Commercial License,HAZMAT,Commercial Vehicle,Alcohol,Work Zone,State,VehicleType,Year,Make,Model,Color,Violation Type,Charge,Article,Contributed To Accident,Race,Gender,Driver City,Driver State,DL State,Arrest Type,Geolocation
0,09/17/2019,08:48:00,MCP,"6th District, Gaithersburg / Montgomery Village",STOP LIGHTS (*),N FREDERICK AVE / WATKINS MILL RD,39.158662,-77.218887,No,No,No,No,No,No,No,No,No,No,MD,02 - Automobile,2001.0,MERZ,4S,SILVER,ESERO,64*,,False,BLACK,F,GERMANTOWN,MD,MD,A - Marked Patrol,"(39.1586616666667, -77.2188866666667)"
1,09/17/2019,19:44:00,MCP,"1st District, Rockville",DRIVING VEHICLE ON HIGHWAY WITHOUT CURRENT REG...,INDIANOLA DR / CRABBS BRANCH WAY,39.114683,-77.156545,No,No,No,No,No,No,No,No,No,No,MD,02 - Automobile,2017.0,HONDA,PILOT,BLACK,Warning,13-411(d),Transportation Article,False,OTHER,F,DERWOOD,MD,MD,A - Marked Patrol,"(39.1146833333333, -77.156545)"
2,09/17/2019,11:27:00,MCP,"2nd District, Bethesda",EXCEEDING THE POSTED SPEED LIMIT OF 35 MPH,RIVER RD / BRAEBURN PKWY,38.978577,-77.130873,No,No,No,No,No,No,No,No,No,No,MD,02 - Automobile,2009.0,TOYOTA,PRIUS,GRAY,Warning,21-801.1,Transportation Article,False,WHITE,M,ROCKVILLE,MD,MD,A - Marked Patrol,"(38.9785766666667, -77.1308733333333)"
3,09/17/2019,20:10:00,MCP,"2nd District, Bethesda",DRIVING VEH. W/O ADEQUATE REAR REG. PLATE ILLU...,PARKLAWN DR / RANDOLPH RD,39.05362,-77.107472,No,No,No,No,No,No,No,No,No,No,MD,02 - Automobile,2013.0,HYUN,ELANTRA,SILVER,Warning,22-204(f),Transportation Article,False,BLACK,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.05362, -77.1074716666667)"
4,09/17/2019,16:44:00,MCP,"2nd District, Bethesda",DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC...,POOKS HILL RD / LINDEN LA,39.013237,-77.106553,No,No,No,No,No,No,No,No,No,No,MD,02 - Automobile,2011.0,HONDA,4H,RED,Warning,21-201(a1),Transportation Article,False,WHITE,F,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0132366666667, -77.1065533333333)"


In [7]:
file.dtypes

Date Of Stop                object
Time Of Stop                object
Agency                      object
SubAgency                   object
Description                 object
Location                    object
Latitude                   float64
Longitude                  float64
Accident                    object
Belts                       object
Personal Injury             object
Property Damage             object
Fatal                       object
Commercial License          object
HAZMAT                      object
Commercial Vehicle          object
Alcohol                     object
Work Zone                   object
State                       object
VehicleType                 object
Year                       float64
Make                        object
Model                       object
Color                       object
Violation Type              object
Charge                      object
Article                     object
Contributed To Accident       bool
Race                

In [8]:
round((file.isnull().sum()/file.shape[0])*100,2)

Date Of Stop               0.00
Time Of Stop               0.00
Agency                     0.00
SubAgency                  0.00
Description                0.00
Location                   0.00
Latitude                   0.00
Longitude                  0.00
Accident                   0.00
Belts                      0.00
Personal Injury            0.00
Property Damage            0.00
Fatal                      0.00
Commercial License         0.00
HAZMAT                     0.00
Commercial Vehicle         0.00
Alcohol                    0.00
Work Zone                  0.00
State                      0.00
VehicleType                0.00
Year                       0.61
Make                       0.00
Model                      0.01
Color                      1.18
Violation Type             0.00
Charge                     0.00
Article                    4.87
Contributed To Accident    0.00
Race                       0.00
Gender                     0.00
Driver City                0.02
Driver S

#### Null values are less than 1 percent

In [9]:
file['Date Of Stop_format'] = file['Date Of Stop'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))

In [10]:
file.drop("Date Of Stop", axis=1, inplace=True)

In [11]:
years=file['Date Of Stop_format'].apply(lambda x: datetime.strftime(x, "%Y"))

In [12]:
years.unique()

array(['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012'],
      dtype=object)

### We have data from 2012 to till date

### Spliting data based on the year

In [13]:
df12=file[(file['Date Of Stop_format']> "2011-12-31") & (file['Date Of Stop_format']<"2013-01-01")].copy()
df13=file[(file['Date Of Stop_format']> "2012-12-31") & (file['Date Of Stop_format']<"2014-01-01")].copy()
df14=file[(file['Date Of Stop_format']> "2013-12-31") & (file['Date Of Stop_format']<"2015-01-01")].copy()
df15=file[(file['Date Of Stop_format']> "2014-12-31") & (file['Date Of Stop_format']<"2016-01-01")].copy()
df16=file[(file['Date Of Stop_format']> "2015-12-31") & (file['Date Of Stop_format']<"2017-01-01")].copy()
df17=file[(file['Date Of Stop_format']> "2016-12-31") & (file['Date Of Stop_format']<"2018-01-01")].copy()
df18=file[(file['Date Of Stop_format']> "2017-12-31") & (file['Date Of Stop_format']<"2019-01-01")].copy()
df19=file[(file['Date Of Stop_format']> "2018-12-31") & (file['Date Of Stop_format']<"2020-01-01")].copy()

df12.to_csv('Traffic_violations_12.csv',index=False)
df13.to_csv('Traffic_violations_13.csv',index=False)
df14.to_csv('Traffic_violations_14.csv',index=False)
df15.to_csv('Traffic_violations_15.csv',index=False)
df16.to_csv('Traffic_violations_16.csv',index=False)
df17.to_csv('Traffic_violations_17.csv',index=False)
df18.to_csv('Traffic_violations_18.csv',index=False)
df19.to_csv('Traffic_violations_19.csv',index=False)

### Sampling

In [14]:
sample_of_2012     = df12.sample(frac=0.1,random_state=100,axis=0)
sample_of_2013     = df13.sample(frac=0.1,random_state=100,axis=0)
sample_of_2014     = df14.sample(frac=0.1,random_state=100,axis=0)
sample_of_2015     = df15.sample(frac=0.1,random_state=100,axis=0)
sample_of_2016     = df16.sample(frac=0.1,random_state=100,axis=0)
sample_of_2017     = df17.sample(frac=0.1,random_state=100,axis=0)
sample_of_2018     = df18.sample(frac=0.1,random_state=100,axis=0)
sample_of_2019     = df19.sample(frac=0.1,random_state=100,axis=0)

In [15]:
sample = pd.concat([sample_of_2012,sample_of_2013,sample_of_2014,
sample_of_2015,sample_of_2016,sample_of_2017,sample_of_2018,sample_of_2019],axis=0,ignore_index=True)

In [16]:
sample.shape

(158175, 35)

In [17]:
import scipy.stats as stats

#### Accident - Test of proportion

In [18]:
x1 = file[file['Accident']=='Yes']['Accident'].value_counts()[0]/len(file)
x2 = sample[sample['Accident']=='Yes']['Accident'].value_counts()[0]/len(sample)
# implementation ztest_proportion_two_classes
def ztest_proportion_two_classes(x1, n1, x2, n2, one_sided=False):
    p1 = x1/n1
    p2 = x2/n2    

    p = (x1*n1+x2*n2)/(n1+n2)    #Pooled sample proportion
    se = p*(1-p)*(1/n1+1/n2)
    se = np.sqrt(se)
    
    z = (p1-p2)/se   #Test statistic
    
    LCI, UCI = stats.norm.interval(0.95) 
    print(LCI,UCI)
    
    p = 1-stats.norm.cdf(abs(z))
    p *= 2-one_sided # if not one_sided: p *= 2
    return z, p

z,p = ztest_proportion_two_classes(x1, len(file), x2, len(sample), one_sided=False)
print(' z-stat = {z} \n p-value = {p}'.format(z=z,p=p))

print('Since the P-value is greater than the significance level (0.05), we are fail to reject the null hypothesis.')

-1.959963984540054 1.959963984540054
 z-stat = -0.0003333279115861409 
 p-value = 0.9997340428105856
Since the P-value is greater than the significance level (0.05), we are fail to reject the null hypothesis.


#### Property Damage - Test of proportion

In [19]:
x1 = file[file['Property Damage']=='Yes']['Property Damage'].value_counts()[0]/len(file)
x2 = sample[sample['Property Damage']=='Yes']['Property Damage'].value_counts()[0]/len(sample)
# implementation ztest_proportion_two_classes
def ztest_proportion_two_classes(x1, n1, x2, n2, one_sided=False):
    p1 = x1/n1
    p2 = x2/n2    

    p = (x1*n1+x2*n2)/(n1+n2)    #Pooled sample proportion
    se = p*(1-p)*(1/n1+1/n2)
    se = np.sqrt(se)
    
    z = (p1-p2)/se   #Test statistic
    
    LCI, UCI = stats.norm.interval(0.95) 
    print(LCI,UCI)
    
    p = 1-stats.norm.cdf(abs(z))
    p *= 2-one_sided # if not one_sided: p *= 2
    return z, p

z,p = ztest_proportion_two_classes(x1, len(file), x2, len(sample), one_sided=False)
print(' z-stat = {z} \n p-value = {p}'.format(z=z,p=p))

print('Since the P-value is greater than the significance level (0.05), we are fail to reject the null hypothesis.')

-1.959963984540054 1.959963984540054
 z-stat = -0.0002939929646562065 
 p-value = 0.9997654275558951
Since the P-value is greater than the significance level (0.05), we are fail to reject the null hypothesis.


####   Violation Type - Test of proportion

In [20]:
x1 = file[file['Violation Type']=='Warning']['Violation Type'].value_counts()[0]/len(file)
x2 = sample[sample['Violation Type']=='Warning']['Violation Type'].value_counts()[0]/len(sample)
# implementation ztest_proportion_two_classes
def ztest_proportion_two_classes(x1, n1, x2, n2, one_sided=False):
    p1 = x1/n1
    p2 = x2/n2    

    p = (x1*n1+x2*n2)/(n1+n2)    #Pooled sample proportion
    se = p*(1-p)*(1/n1+1/n2)
    se = np.sqrt(se)
    
    z = (p1-p2)/se   #Test statistic
    
    LCI, UCI = stats.norm.interval(0.95) 
    print(LCI,UCI)
    
    p = 1-stats.norm.cdf(abs(z))
    p *= 2-one_sided # if not one_sided: p *= 2
    return z, p

z,p = ztest_proportion_two_classes(x1, len(file), x2, len(sample), one_sided=False)
print(' z-stat = {z} \n p-value = {p}'.format(z=z,p=p))

print('Since the P-value is greater than the significance level (0.05), we are fail to reject the null hypothesis.')

-1.959963984540054 1.959963984540054
 z-stat = -0.002163107456595316 
 p-value = 0.998274091302956
Since the P-value is greater than the significance level (0.05), we are fail to reject the null hypothesis.


In [37]:
sample.to_csv('Traffic_violations_sample.csv',index=False)