In [None]:
# Load the Stats19 accidents CSV into a pandas DataFrame
import pandas

from config_secret import DATA_DIR

# Location of the accidents extract relative to DATA_DIR. The two parts are
# joined by plain string concatenation, so the leading slash here supplies the
# separator (assumes DATA_DIR has no trailing slash -- TODO confirm).
accidents_data_file = '/Stats19-Data1979-2004/Accidents7904.csv'
accidents_file = DATA_DIR + accidents_data_file

# Read the first 10,000 data rows only (nrows), taking column names from the
# first line and parsing 'Date' as day-first dates.
#
# The former tupleize_cols / error_bad_lines / warn_bad_lines keywords are
# intentionally not passed: they were all set to their default values and have
# since been removed from pandas.read_csv, where they now raise TypeError.
# Malformed rows still raise an error, which is read_csv's default behaviour.
accidents = pandas.read_csv(
    accidents_file,
    sep=',',
    header=0,
    index_col=False,
    parse_dates=['Date'],
    dayfirst=True,
    skip_blank_lines=True,
    nrows=10000,
)

In [None]:
# Create report columns: one single-column DataFrame per metric, each indexed
# by the column names of `accidents`, ready to be joined side by side.


def _column_extremes(frame, reducer, label):
    """Collect one extreme value (min or max) per column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose columns are summarised.
    reducer : str
        Name of the Series method to call on each column ('min' or 'max').
    label : str
        Name of the single column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` for which the reducer succeeded,
        indexed by column name. Columns whose values cannot be ordered are
        left out, so they show up as missing once the report is joined.
    """
    extremes = {}
    for column_name in frame.columns:
        try:
            extremes[column_name] = getattr(frame[column_name], reducer)()
        except TypeError:
            # Values of mixed, non-comparable types (e.g. str and float)
            # have no min/max; skip the column instead of failing the report.
            continue
    # dtype=object keeps each value as-is (dates, numbers, strings together).
    return pandas.DataFrame({label: pandas.Series(extremes, dtype=object)})


data_types = pandas.DataFrame(
    accidents.dtypes,
    columns=['Data Type']
)

missing_data_counts = pandas.DataFrame(
    accidents.isnull().sum(),
    columns=['Missing Values']
)

present_data_counts = pandas.DataFrame(
    accidents.count(),
    columns=['Present Values']
)

# Distinct non-null values per column, computed in one vectorised call.
unique_value_counts = accidents.nunique().to_frame('Unique Values')

minimum_values = _column_extremes(accidents, 'min', 'Minimum Value')

maximum_values = _column_extremes(accidents, 'max', 'Maximum Value')



In [None]:
# Assemble the report: start from the dtype column and join each remaining
# metric frame onto it, one at a time, in display order.

data_quality_report = data_types
for metric_frame in (
        present_data_counts,
        missing_data_counts,
        unique_value_counts,
        minimum_values,
        maximum_values):
    data_quality_report = data_quality_report.join(metric_frame)

print('\nDATA QUALITY REPORT')
print('Total records: {}'.format(accidents.shape[0]))
data_quality_report

In [None]:
# Summary statistics with one row per described column.
accidents.describe().T

In [None]:
# Summary of the object-dtype columns only, one row per column.
accidents.describe(include=['object']).T

In [None]:
# Most frequent value(s) of every column, shown with one row per column
accidents.mode().T

In [None]:
# Sum a single column per date: total casualties for each accident date
import numpy

# The aggregation is given by name ('sum') rather than as numpy.sum: pandas
# deprecates passing NumPy callables to agg and recommends the string form,
# which yields the same per-date totals.
casualty_count = accidents.groupby('Date').agg(
    {'Number_of_Casualties': 'sum'})
casualty_count

In [None]:
# Total number of vehicles involved for each accident date
# The aggregation is given by name ('sum') rather than as numpy.sum: pandas
# deprecates passing NumPy callables to agg and recommends the string form,
# which yields the same per-date totals.
vehicle_count = accidents.groupby('Date').agg(
    {'Number_of_Vehicles': 'sum'})
vehicle_count

In [None]:
# Put the two per-date totals side by side, matching rows on the Date index
# of both frames (inner join, i.e. only dates present in both are kept).
casualties_and_vehicles = pandas.merge(
    casualty_count,
    vehicle_count,
    how='inner',
    left_index=True,
    right_index=True,
)
casualties_and_vehicles

In [None]:
# Create histogram
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Distribution of the per-date casualty totals, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(casualty_count['Number_of_Casualties'], bins=30)
ax.set_title('Number of Casualties Histogram')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Plot two sets of values on the same axes, each normalised so that the area
# under its histogram is 1, which makes the two distributions comparable.
# `density=True` is used because the older `normed=True` keyword has been
# removed from matplotlib's hist and raises an error on current versions.
plt.hist(casualty_count['Number_of_Casualties'], bins=20,
         histtype='stepfilled', density=True, color='b',
         label='Casualties')
plt.hist(vehicle_count['Number_of_Vehicles'], bins=20,
         histtype='stepfilled', density=True, color='r',
         label='Vehicles')
plt.title('Casualties/Vehicles Histogram')
plt.xlabel('Value')
plt.ylabel('Probability')
plt.legend()
plt.show()

