-
Notifications
You must be signed in to change notification settings - Fork 3
/
02_EDA.py
45 lines (36 loc) · 1.38 KB
/
02_EDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python
# 02_EDA.py
# Jes Simkin and Alex Hope Nov, 2018
#
# This script produces histograms for each decision tree feature.
#
# Usage
# Inputs: tidy csv path, output path with filename prefix, player name string
# Outputs: png histograms for each feature in the data set
# Example: python src/02_EDA.py data/tidy_data_lebron_james.csv results/figs/EDA "lebron_james"
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Command-line interface: three required positional arguments, in this order.
# The help strings make `-h` output self-explanatory; names and order are
# unchanged, so existing invocations keep working.
parser = argparse.ArgumentParser(
    description='Produce made-vs-missed histograms for each feature in a tidy csv.'
)
parser.add_argument('input_file', help='path to the tidy csv')
parser.add_argument('output_file', help='output path with filename prefix')
parser.add_argument('player', help='player name string, used in the output file names')
# Parsed at module level; main() reads the result through the global `args`.
args = parser.parse_args()
def main():
    """Save one made-vs-missed histogram image per column of the input csv.

    Reads the csv at ``args.input_file``, drops the ``Unnamed: 0`` column
    when present, and for every remaining column (``SHOT_RESULT`` included)
    plots the values of the rows whose ``SHOT_RESULT`` is ``"missed"`` next
    to those whose ``SHOT_RESULT`` is ``"made"``. Each figure is written to
    ``<args.output_file>_<column>_<player>``, where ``<player>`` is
    ``args.player`` with spaces replaced by underscores. No file extension
    is appended, so the image format is whatever matplotlib defaults to.

    Relies on the module-level ``args`` namespace; takes no parameters and
    returns nothing.
    """
    # Read in data and set up the feature list for iterating.
    # errors='ignore' lets a csv without the 'Unnamed: 0' column through
    # instead of raising KeyError.
    data = pd.read_csv(args.input_file).drop(
        ['Unnamed: 0'], axis=1, errors='ignore'
    )
    features = list(data)

    # Replace spaces with underscores for the file slug.
    name_underscored = args.player.replace(" ", "_")

    # The row split does not depend on the feature, so compute it once.
    missed_rows = data[data.SHOT_RESULT == "missed"]
    made_rows = data[data.SHOT_RESULT == "made"]

    # Create a histogram for each feature and save it.
    for feature in features:
        fig, ax = plt.subplots()
        ax.hist(
            [missed_rows[feature], made_rows[feature]],
            color=['darkorange', 'royalblue'],
            label=['Missed', 'Made'],
            bins=15,
        )
        ax.legend(title="Shot Result")
        ax.set_xlabel("Value")
        ax.set_ylabel("Frequency")
        fig.savefig("_".join((args.output_file, feature, name_underscored)))
        # pyplot keeps every figure alive until it is closed; without this,
        # one figure per feature would accumulate in memory.
        plt.close(fig)
# Run the plotting only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()