# In [None]:
import numpy as np
import pandas as pd
import scipy.stats

# Cleaned traffic-stop tables for the two states compared below (MT and VT),
# read from CSV files in the working directory.
df_mt, df_vt = (pd.read_csv(csv_name) for csv_name in ("MT-clean.csv", "VT-clean.csv"))

## 1. Proportion of male drivers
# value_counts() ignores missing genders, while the denominator is every row of
# the Montana table, so rows without a gender still count towards the total.
gender_counts = df_mt["driver_gender"].value_counts()
probortion_male = gender_counts["M"] / len(df_mt)
print("Proportion of male driver: {0:12.10f}.".format(probortion_male))

## 2. How many times more likely an arrest is for drivers with out-of-state plates
# Group by the column name itself (a scalar key, not a one-element list) so that
# get_group(True) / get_group(False) accept plain scalar keys.
group_os = df_mt.groupby("out_of_state")
os_t = group_os.get_group(True)["is_arrested"].value_counts()
os_f = group_os.get_group(False)["is_arrested"].value_counts()

# The answer is reported as an odds ratio: the arrest odds (arrested / not
# arrested) with out-of-state plates divided by the arrest odds without them.
odds_out_of_state = os_t[True] / os_t[False]
odds_in_state = os_f[True] / os_f[False]
print("{0:12.10f} times more likely.".format(odds_out_of_state / odds_in_state))

## 3. Chi-squared test of two populations
## The question does not say which two populations are meant (out-of-state vs.
## in-state plates, or male vs. female drivers). Following the previous question,
## the populations are taken to be stops with and without out-of-state plates.
# Build the 2x2 table by label. value_counts() orders its result by frequency,
# so the raw .values of the two groups are not guaranteed to list the
# True/False counts in the same order.
obs = np.array([
    [os_t[True], os_t[False]],
    [os_f[True], os_f[False]],
])
# Element 0 of the result is the test statistic (SciPy default settings are used).
value = scipy.stats.chi2_contingency(obs)[0]
print("The value of the test statistic: {0:10.7f}.".format(value))

## 4. Proportion of traffic stops that involve a speeding violation
# A stop counts as speeding when its violation text contains "Speeding"; the
# denominator is every row of the Montana table.
speeding_flags = df_mt["violation"].str.contains("Speeding")
speeding_counts = speeding_flags.value_counts()
probortion_speed = speeding_counts[True] / len(df_mt)
print("Proportion of speeding violations: {0:12.10f}".format(probortion_speed))

## 5. How much more likely a stop results in a DUI in Montana than in Vermont
def _dui_share(stops):
    """Return the fraction of all rows in `stops` whose violation text contains "DUI"."""
    dui_counts = stops.violation.str.contains("DUI").value_counts()
    return dui_counts[True] / len(stops)


prob_dui_mt = _dui_share(df_mt)
prob_dui_vt = _dui_share(df_vt)
# Ratio of the two per-state DUI shares.
print("{0:12.10f} more likely.".format(prob_dui_mt / prob_dui_vt))

## 6, 7. Extrapolated average manufacture year of vehicles stopped in Montana
## in 2020, and the p-value of the underlying linear regression
df_mt["year"] = pd.to_datetime(df_mt.stop_date).dt.year

# Keep only rows with a usable manufacture year: not missing and not one of the
# placeholder strings. The explicit copy makes year_clean an independent frame,
# so the integer conversion below is not a chained assignment on a slice of df_mt.
has_vehicle_year = df_mt.vehicle_year.notna() & ~df_mt.vehicle_year.isin(["NON-", "UNK"])
year_clean = df_mt[has_vehicle_year].copy()
year_clean["vehicle_year"] = year_clean.vehicle_year.astype(int)

# One point per stop year: the mean manufacture year of the vehicles stopped then.
year_avg = year_clean.groupby("year")["vehicle_year"].mean()
x = year_avg.index.values
y = year_avg.values
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)

# Evaluate the fitted line at stop year 2020.
print("The average manufacture year of vehicles in 2020: {0:10.6f}.".format(slope*2020+intercept))
print("p-value: {0:12.10f}.".format(p_value))

## 8. Difference in the number of stops between the busiest and the quietest
## hour of the day, counting Montana and Vermont together
df_mt["hour"] = pd.to_datetime(df_mt.stop_time, format='%H:%M').dt.hour
df_vt["hour"] = pd.to_datetime(df_vt.stop_time, format='%H:%M').dt.hour

# Stops per hour of day in each state (rows without an id are not counted).
hour_total_mt = df_mt.groupby("hour")["id"].count()
hour_total_vt = df_vt.groupby("hour")["id"].count()

# add(..., fill_value=0) keeps an hour that appears in only one state; a plain
# `+` would turn it into NaN, and max()/min() would then silently skip it.
hour_total = hour_total_mt.add(hour_total_vt, fill_value=0)
diff_total = hour_total.max() - hour_total.min()
print("The difference of two hours: {0:10.5f}".format(diff_total))

## 9. Area, in square kilometers, of the largest county
## Each county is approximated by an ellipse whose semi-axes are the standard
## deviations of the latitude and longitude of the stops recorded in it.
import math

# Copy the three columns so the longitude rewrite below works on an independent
# frame instead of assigning into a slice of df_mt.
df_ll = df_mt[["county_name", "lat", "lon"]].copy()
df_ll["lon"] = df_ll["lon"].abs()  # longitudes are compared by magnitude
# Keep coordinates with 40 < lat < 50 and |lon| > 100, then drop rows that
# still have a missing field (e.g. no county name).
df_ll = df_ll[(df_ll["lat"] > 40) & (df_ll["lat"] < 50) & (df_ll["lon"] > 100)]
df_ll = df_ll.dropna()

# Scalar group key, so iterating the groups yields plain county names.
gp_cn = df_ll.groupby("county_name")
keys = gp_cn.groups.keys()

# Kilometers per degree: a fixed factor for latitude, and a factor scaled by
# cos(latitude) for longitude, evaluated at the mean latitude of all kept stops.
conv_lat = 110.574
conv_lon = 111.320*math.cos(math.radians(np.mean(df_ll.lat)))

# county -> [std of lat (deg), std of lon (deg), ellipse area (km2)]
semi_axis = dict()
for key, gp in gp_cn:
    lat_std = np.std(gp["lat"])
    lon_std = np.std(gp["lon"])
    semi_axis[key] = [lat_std, lon_std, math.pi*lat_std*conv_lat*lon_std*conv_lon]

# One row per county; column 2 holds the area.
result = pd.DataFrame(semi_axis).T
# NOTE: the original author observed that this estimate seems far from the
# real county areas; treat it as a rough proxy only.
print("The largest area is {0:10.6f} km2.".format(result[2].max()))