In [1]:
# import libraries

import pandas as pd
pd.set_option('display.max_rows', 1000)

import sqlite3

In [2]:
# read in file and create dataframe
df = pd.read_csv('Assignment Log.csv')

### Part 1: Analysis 
#### Analyze and extract any interesting insights you can derive from the data set attached (each row represents the assignment of a job in our research queue, including some data about the analyst who received the assignment and the current state of the research queue). What can you infer? What do you think this means for us from a business perspective? 

### Part 2: Data modeling 
#### Assuming we are starting with the data set from Part 1 as our raw data table, how would you model this data in a data warehouse for analytical purposes? What tables would you create? What kinds of questions do you imagine business users would want to ask of this data, and how would they express them in your data model? Please use whatever tools you are comfortable with to answer this question and whatever flavor of SQL you are most familiar with. A github repo or gist is preferred. 

### Exploratory Data Analysis

In [3]:
df.head(10)

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
0,06/22/2017 19:59:06,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Accepted Job,594bec5c95e2ce005840c23a,06/22/2017 12:12:12,review,1,review,0,13,14,4,6,2,0,1,1
1,06/22/2017 19:59:02,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Assigned Job,594bec5c95e2ce005840c23a,06/22/2017 12:12:12,review,1,review,1,13,15,5,6,2,0,1,1
2,06/22/2017 19:51:30,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Declined Job,594bec83fd2cf400280aa965,06/22/2017 12:12:51,writing,9,"sourcing, writing",1,12,12,5,5,1,0,0,1
3,06/22/2017 19:51:01,0e9802516f8a79dd0d45211dd4ee74af,4.5,4.5,Accepted Job,594c1f5cd7e68f0028c9062c,06/22/2017 15:49:48,sourcing,1,"sourcing, writing",1,11,12,5,5,1,0,0,1
4,06/22/2017 19:50:58,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594bec83fd2cf400280aa965,06/22/2017 12:12:51,writing,8,"sourcing, writing",2,11,14,5,5,2,0,1,1
5,06/22/2017 19:50:58,0e9802516f8a79dd0d45211dd4ee74af,4.5,4.5,Assigned Job,594c1f5cd7e68f0028c9062c,06/22/2017 15:49:48,sourcing,1,"sourcing, writing",2,11,14,5,5,2,0,1,1
6,06/22/2017 19:50:58,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Declined Job,594bec83fd2cf400280aa965,06/22/2017 12:12:51,writing,8,"sourcing, writing",2,11,14,5,5,2,0,1,1
7,06/22/2017 19:50:23,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594bec83fd2cf400280aa965,06/22/2017 12:12:51,writing,8,"sourcing, writing",1,11,14,5,5,2,0,1,1
8,06/22/2017 19:47:03,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Accepted Job,594c1e983b593b00281250ba,06/22/2017 15:46:32,sourcing,4,"sourcing, writing",1,11,11,5,5,1,0,0,0
9,06/22/2017 19:47:03,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Accepted Job,594c1e983b593b00281250ba,06/22/2017 15:46:32,sourcing,4,"sourcing, writing",1,11,11,5,5,1,0,0,0


In [4]:
df.shape

(791, 19)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 791 entries, 0 to 790
Data columns (total 19 columns):
Event occurred at           791 non-null object
Analyst                     791 non-null object
Quality score (sourcing)    791 non-null float64
Quality score (writing)     791 non-null float64
Action                      791 non-null object
Request                     791 non-null object
Request created at          791 non-null object
Job                         791 non-null object
Wait time (min)             791 non-null int64
Waiting for                 785 non-null object
Analysts available          791 non-null int64
Analysts occupied           791 non-null int64
Total jobs available        791 non-null int64
Review jobs available       791 non-null int64
Vetting jobs available      791 non-null int64
Planning jobs available     791 non-null int64
Editing jobs available      791 non-null int64
Sourcing jobs available     791 non-null int64
Writing jobs available      791 non-nu

In [6]:
# convert Event occurred at and Request created at to datetime
df['Event occurred at'] = pd.to_datetime(df['Event occurred at'])
df['Request created at'] = pd.to_datetime(df['Request created at'])

In [7]:
# Min and Max times for creating requests

print('Min Date: ' + str(min(df['Request created at'])))
print('Max Date: ' + str(max(df['Request created at'])))

Min Date: 2017-06-19 13:43:51
Max Date: 2017-06-22 18:16:11


In [8]:
# Min and Max times for events

print('Min Date: ' + str(min(df['Event occurred at'])))
print('Max Date: ' + str(max(df['Event occurred at'])))

Min Date: 2017-06-21 20:15:42
Max Date: 2017-06-22 19:59:06


#### Handling Nulls

In [9]:
# check for any null values
df.isnull().sum()

Event occurred at           0
Analyst                     0
Quality score (sourcing)    0
Quality score (writing)     0
Action                      0
Request                     0
Request created at          0
Job                         0
Wait time (min)             0
Waiting for                 6
Analysts available          0
Analysts occupied           0
Total jobs available        0
Review jobs available       0
Vetting jobs available      0
Planning jobs available     0
Editing jobs available      0
Sourcing jobs available     0
Writing jobs available      0
dtype: int64

In [10]:
# find nulls
def find_nulls(df):
    null_columns=df.columns[df.isnull().any()]
    return df[df['Waiting for'].isnull()][null_columns]

In [11]:
find_nulls(df)

Unnamed: 0,Waiting for
195,
340,
392,
454,
595,
723,


In [12]:
df.loc[194:196]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
194,2017-06-22 14:31:08,94959cf2d0b592d0fa1e5e9cf760a1c7,5.0,5.0,Assigned Job,594bec81fd2cf400280aa95c,2017-06-22 12:12:49,planning,1,planning,5,16,8,0,3,5,0,0,0
195,2017-06-22 14:30:59,9fa24ddce8fd9d1526d9d7451304fc74,4.89,4.89,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,26,,5,15,8,0,3,5,0,0,0
196,2017-06-22 14:30:54,9fa24ddce8fd9d1526d9d7451304fc74,4.89,4.89,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,26,sourcing,5,15,8,0,3,5,0,0,0


In [13]:
df.loc[[195], 'Waiting for'] = 'sourcing'

In [14]:
df.loc[339:341]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
339,2017-06-22 08:03:23,98de7e62209a07eee6ec8dc984911042,0.0,0.0,Assigned Job,594a9ce6c864e200288e766b,2017-06-21 12:20:54,sourcing,1,sourcing,4,10,4,0,0,1,0,1,2
340,2017-06-22 08:01:25,a29fe6d26c1a49ff4a3c876eaee0a1af,0.0,0.0,Accepted Job,594a9ce6c864e200288e766b,2017-06-21 12:20:54,sourcing,1,,3,10,3,0,0,1,0,0,2
341,2017-06-22 08:01:22,a29fe6d26c1a49ff4a3c876eaee0a1af,0.0,0.0,Accepted Job,594a9ce6c864e200288e766b,2017-06-21 12:20:54,sourcing,1,sourcing,3,10,3,0,0,1,0,0,2


In [15]:
df.loc[[340], 'Waiting for'] = 'sourcing'

In [16]:
df.loc[391:393]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
391,2017-06-22 06:17:54,7e22ad15724c44543d1d4bcafd10c812,5.0,5.0,Assigned Job,594ae6fb4f06e20035d92345,2017-06-21 17:36:59,review,1,review,1,10,6,3,1,0,0,0,2
392,2017-06-22 06:13:40,766d3435eda76c4de9f034b8f97a0602,3.5,3.5,Accepted Job,594a8fd422b3c50028e71976,2017-06-21 11:25:08,writing,1,,0,9,6,3,1,0,0,0,2
393,2017-06-22 06:13:34,766d3435eda76c4de9f034b8f97a0602,3.5,3.5,Assigned Job,594a8fd422b3c50028e71976,2017-06-21 11:25:08,writing,1,"sourcing, writing",1,9,7,3,1,0,0,0,3


In [17]:
df.loc[[392], 'Waiting for'] = 'sourcing, writing'

In [18]:
df.loc[453:455]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
453,2017-06-22 04:45:27,7e22ad15724c44543d1d4bcafd10c812,5.0,5.0,Assigned Job,594b00e23f82d10028f03be2,2017-06-21 19:27:30,review,1,review,1,14,7,3,1,0,0,0,3
454,2017-06-22 04:40:07,7e22ad15724c44543d1d4bcafd10c812,5.0,5.0,Accepted Job,594af3c6f3b9f600287eabe0,2017-06-21 18:31:34,review,1,,0,15,7,3,1,0,0,0,3
455,2017-06-22 04:40:03,7e22ad15724c44543d1d4bcafd10c812,5.0,5.0,Assigned Job,594af3c6f3b9f600287eabe0,2017-06-21 18:31:34,review,1,review,1,15,8,4,1,0,0,0,3


In [19]:
df.loc[[454], 'Waiting for'] = 'review'

In [20]:
df.loc[594:596]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
594,2017-06-21 23:26:17,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Assigned Job,594b20ed3f82d10028f03c6c,2017-06-21 21:44:13,sourcing,1,sourcing,1,24,7,1,3,1,0,1,1
595,2017-06-21 23:23:02,f7f7591403c6c431053920223069550a,5.0,5.0,Accepted Job,594b20ee3f82d10028f03c74,2017-06-21 21:44:14,planning,1,,0,23,7,1,3,1,0,1,1
596,2017-06-21 23:22:37,36ee9fc3bade4a4f71c2a6e5c2bd8862,0.0,0.0,Accepted Job,594b20eb3f82d10028f03c5c,2017-06-21 21:44:11,sourcing,1,sourcing,1,22,8,1,3,2,0,1,1


In [21]:
df.loc[[595], 'Waiting for'] = 'planning'

In [22]:
df.loc[722:724]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
722,2017-06-21 21:25:06,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Assigned Job,5949462dd9ae5200633f9640,2017-06-20 11:58:37,review,1,review,2,21,8,6,1,0,0,0,1
723,2017-06-21 21:24:44,206de922289a1f9f5ee250fc71308628,3.17,3.17,Accepted Job,594ae6fb4f06e20035d92345,2017-06-21 17:36:59,writing,1,,1,20,8,6,1,0,0,0,1
724,2017-06-21 21:24:38,206de922289a1f9f5ee250fc71308628,3.17,3.17,Accepted Job,594ae6fb4f06e20035d92345,2017-06-21 17:36:59,writing,1,writing,1,20,8,6,1,0,0,0,1


In [23]:
df.loc[[723], 'Waiting for'] = 'writing'

In [24]:
# check for any null values
df.isnull().sum()

Event occurred at           0
Analyst                     0
Quality score (sourcing)    0
Quality score (writing)     0
Action                      0
Request                     0
Request created at          0
Job                         0
Wait time (min)             0
Waiting for                 0
Analysts available          0
Analysts occupied           0
Total jobs available        0
Review jobs available       0
Vetting jobs available      0
Planning jobs available     0
Editing jobs available      0
Sourcing jobs available     0
Writing jobs available      0
dtype: int64

#### Handling Duplicates

In [25]:
# find duplicates
# select duplicate rows except first occurrence based on all columns
def duplicates(df):
    duplicate_rows_df = df[df.duplicated()]
    return duplicate_rows_df

In [26]:
duplicates(df)

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
9,2017-06-22 19:47:03,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Accepted Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,sourcing,4,"sourcing, writing",1,11,11,5,5,1,0,0,0
55,2017-06-22 18:26:42,e2333b2dc03032f12c8526e45243f0c1,0.0,0.0,Accepted Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,22,sourcing,2,14,9,2,7,0,0,0,0


In [27]:
df.drop_duplicates(inplace = True)

In [28]:
# confirm duplicates removed
duplicates(df)

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available


In [29]:
# inspecting the unique values found in columns
df.nunique()

Event occurred at           669
Analyst                      71
Quality score (sourcing)     37
Quality score (writing)      37
Action                        3
Request                      74
Request created at           72
Job                           7
Wait time (min)              39
Waiting for                  27
Analysts available            9
Analysts occupied            19
Total jobs available         17
Review jobs available        12
Vetting jobs available        8
Planning jobs available      13
Editing jobs available        3
Sourcing jobs available       5
Writing jobs available        7
dtype: int64

In [30]:
def values(column):
    print(df[column].value_counts())

In [31]:
print(values('Action'))
print(values('Job'))

Assigned Job    393
Accepted Job    301
Declined Job     95
Name: Action, dtype: int64
None
sourcing         209
writing          160
review           143
vetting           94
planning          94
source review     58
editing           31
Name: Job, dtype: int64
None


In [32]:
df[df['Request'] == '594c1e983b593b00281250ba']

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
8,2017-06-22 19:47:03,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Accepted Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,sourcing,4,"sourcing, writing",1,11,11,5,5,1,0,0,0
10,2017-06-22 19:46:56,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,sourcing,4,"sourcing, writing",2,12,13,5,5,2,0,1,0
104,2017-06-22 16:02:15,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,vetting,1,"vetting, planning, editing, sourcing, writing",0,19,3,2,1,0,0,0,0
105,2017-06-22 16:02:11,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,vetting,1,"vetting, planning, editing, sourcing, writing",1,19,4,2,2,0,0,0,0


In [33]:
df[df['Request'] == '594bec7ffd2cf400280aa953']

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
188,2017-06-22 14:33:36,d5aa695af013ee5baaeeccfb7ca50c80,4.7,4.7,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,1,"editing, sourcing, writing",5,14,7,0,3,4,0,0,0
189,2017-06-22 14:33:35,d5aa695af013ee5baaeeccfb7ca50c80,4.7,4.7,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,1,"editing, sourcing, writing",5,14,7,0,3,4,0,0,0
190,2017-06-22 14:33:29,d5aa695af013ee5baaeeccfb7ca50c80,4.7,4.7,Assigned Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,1,"editing, sourcing, writing",6,14,8,0,3,4,0,1,0
195,2017-06-22 14:30:59,9fa24ddce8fd9d1526d9d7451304fc74,4.89,4.89,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,26,sourcing,5,15,8,0,3,5,0,0,0
196,2017-06-22 14:30:54,9fa24ddce8fd9d1526d9d7451304fc74,4.89,4.89,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,26,sourcing,5,15,8,0,3,5,0,0,0
197,2017-06-22 14:30:33,9fa24ddce8fd9d1526d9d7451304fc74,4.89,4.89,Assigned Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,sourcing,26,sourcing,5,15,9,0,3,5,0,1,0
214,2017-06-22 13:32:46,94959cf2d0b592d0fa1e5e9cf760a1c7,5.0,5.0,Accepted Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,planning,1,planning,4,12,9,0,1,8,0,0,0
215,2017-06-22 13:32:40,94959cf2d0b592d0fa1e5e9cf760a1c7,5.0,5.0,Assigned Job,594bec7ffd2cf400280aa953,2017-06-22 12:12:47,planning,1,planning,5,12,10,0,1,9,0,0,0


In [34]:
df.describe()

Unnamed: 0,Quality score (sourcing),Quality score (writing),Wait time (min),Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
count,789.0,789.0,789.0,789.0,789.0,789.0,789.0,789.0,789.0,789.0,789.0,789.0
mean,3.939759,3.939759,5.079848,2.676806,15.2218,8.04943,2.835234,1.617237,1.740177,0.087452,0.386565,1.382763
std,1.867008,1.867008,12.154044,1.871705,4.871728,3.647033,2.80513,1.61387,2.844475,0.295838,0.620195,1.548048
min,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.43,4.43,1.0,1.0,11.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,4.83,4.83,1.0,2.0,14.0,8.0,2.0,1.0,0.0,0.0,0.0,1.0
75%,5.0,5.0,2.0,4.0,20.0,11.0,5.0,2.0,2.0,0.0,1.0,2.0
max,5.0,5.0,107.0,8.0,25.0,16.0,11.0,7.0,12.0,2.0,4.0,6.0


In [35]:
# Min and Max times for creating requests

print('Min quality score (sourcing) : ' + str(min(df['Quality score (sourcing)'])))
print('Max quality score (sourcing) : ' + str(max(df['Quality score (sourcing)'])))

Min quality score (sourcing) : 0.0
Max quality score (sourcing) : 5.0


In [36]:
df[df['Quality score (sourcing)'] == 0]

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
24,2017-06-22 19:15:06,8258591d197a3936de0f7ee021ab9e0e,0.0,0.0,Accepted Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,1,sourcing,1,15,9,3,5,1,0,0,0
27,2017-06-22 19:14:53,8258591d197a3936de0f7ee021ab9e0e,0.0,0.0,Assigned Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,1,sourcing,3,14,12,3,6,2,0,1,0
28,2017-06-22 19:09:03,00360b3f177375b01b795a4be7b4686c,0.0,0.0,Declined Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,107,sourcing,2,14,11,2,6,2,0,1,0
29,2017-06-22 19:08:28,00360b3f177375b01b795a4be7b4686c,0.0,0.0,Assigned Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,106,sourcing,2,14,11,2,6,2,0,1,0
30,2017-06-22 19:08:28,00360b3f177375b01b795a4be7b4686c,0.0,0.0,Declined Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,106,sourcing,2,14,11,2,6,2,0,1,0
33,2017-06-22 19:07:53,00360b3f177375b01b795a4be7b4686c,0.0,0.0,Assigned Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,106,sourcing,3,13,12,3,6,2,0,1,0
34,2017-06-22 19:06:14,6b19235b9269df81b6da879771bf40bc,0.0,0.0,Accepted Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,sourcing,6,sourcing,2,13,10,3,6,1,0,0,0
35,2017-06-22 19:06:08,6b19235b9269df81b6da879771bf40bc,0.0,0.0,Assigned Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,sourcing,6,sourcing,3,13,12,3,6,2,0,1,0
40,2017-06-22 18:59:12,6b19235b9269df81b6da879771bf40bc,0.0,0.0,Accepted Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,5,sourcing,2,12,10,3,7,0,0,0,0
41,2017-06-22 18:59:07,6b19235b9269df81b6da879771bf40bc,0.0,0.0,Assigned Job,594c0b883b593b002812506e,2017-06-22 14:25:12,sourcing,4,sourcing,4,12,12,3,7,1,0,1,0


In [37]:
df[df['Action'] == 'Declined Job'].describe()

Unnamed: 0,Quality score (sourcing),Quality score (writing),Wait time (min),Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
count,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0
mean,3.294211,3.294211,12.294737,3.494737,16.431579,7.842105,2.8,2.031579,1.463158,0.031579,0.536842,0.978947
std,2.205641,2.205641,20.63661,1.934423,5.037419,3.41248,2.611757,1.932106,2.220842,0.175804,0.649231,1.263051
min,0.0,0.0,1.0,1.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,2.0,13.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,4.5,4.5,3.0,3.0,17.0,8.0,2.0,1.0,1.0,0.0,0.0,0.0
75%,4.93,4.93,14.5,5.0,20.0,10.0,5.0,3.0,2.0,0.0,1.0,2.0
max,5.0,5.0,107.0,8.0,25.0,14.0,9.0,7.0,12.0,1.0,3.0,5.0


In [38]:
df.groupby('Analyst')['Request'].count()

Analyst
00360b3f177375b01b795a4be7b4686c     8
008fa95f5c94985e2d44047aeac31655     7
052066814364850e1a17a90d576dd904     3
0c2680433387fb4cf51a3546296f8422    18
0e9802516f8a79dd0d45211dd4ee74af    24
1f8d8b08a994331e419d24293ce9e7b0     4
206de922289a1f9f5ee250fc71308628     7
2d0f64a208f69a70ec415cfccb59cc24     4
2df8ce7d317c7d89dfa95be7695d2de0     8
30ea4ac9222bfa53b3849df49d6f26f0     2
32553d485988e2ddcf0f701c60754f2d     4
36ee9fc3bade4a4f71c2a6e5c2bd8862     4
39012c98c8fb80752d2bbcc3dc285230     8
39b71201151caf98fd10afce50cfc83e    12
3a29165b73c047e745934c2cbbfaac1b     4
3a8a0b0323b02a707535fe7d65821de3     2
482e9b14d8d59b748320f89074362746     8
4bb6e496f0c92286dee7eed7038ddd9d     8
5078cc506436a8668c0b6de594842fcf     1
5233d783ae5bf0483cf49549cbc594f8     2
5238b0785b5576f5ffe665136df9244f     4
57ef46a92ecee8e187f4d450e92651ea     2
608b40f66ef114c6d9f5c95021290f69     4
62060850630c7afe54fd59151413d237    46
632a6492e9ff20cc4a442245836424e5     6
642782c690c8d963c

In [39]:
df.groupby(['Request', 'Analyst'])['Wait time (min)'].sum()

Request                   Analyst                         
59480d57e759070028da6467  a09c8906073b4c0b75e3100b857b982a      2
                          d8e25a290ea51352bf9100a99c475f6d      7
59485c262e71030033104e3c  642782c690c8d963c487300a4751e220      2
                          7e22ad15724c44543d1d4bcafd10c812      2
                          a09c8906073b4c0b75e3100b857b982a      2
                          b599bfb42906772db81ac90137fc1916      2
                          e817dd4305458b293cbeb3015da99565      2
59486e7af5874900429ce273  9fcbc63ff4c8bea5cea4efad782c87cf      2
5949462dd9ae5200633f9640  39012c98c8fb80752d2bbcc3dc285230      2
                          9fa24ddce8fd9d1526d9d7451304fc74     10
                          9fcbc63ff4c8bea5cea4efad782c87cf      2
59494c13d9ae5200633f9695  a09c8906073b4c0b75e3100b857b982a      2
5949a81c4d1319005556396c  39012c98c8fb80752d2bbcc3dc285230      4
                          632a6492e9ff20cc4a442245836424e5      2
                 

In [40]:
df[df['Request'] == '594c151273dd9c002873cd98']

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
17,2017-06-22 19:31:18,c033808b89b50e6f550caf266470f090,4.9,4.9,Accepted Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,sourcing,1,"editing, sourcing, writing",2,10,11,5,5,1,0,0,0
18,2017-06-22 19:31:15,c033808b89b50e6f550caf266470f090,4.9,4.9,Assigned Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,sourcing,1,"editing, sourcing, writing",2,10,13,5,5,2,0,1,0
34,2017-06-22 19:06:14,6b19235b9269df81b6da879771bf40bc,0.0,0.0,Accepted Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,sourcing,6,sourcing,2,13,10,3,6,1,0,0,0
35,2017-06-22 19:06:08,6b19235b9269df81b6da879771bf40bc,0.0,0.0,Assigned Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,sourcing,6,sourcing,3,13,12,3,6,2,0,1,0
126,2017-06-22 15:34:23,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,vetting,1,"vetting, planning, editing, sourcing, writing",5,19,4,2,0,2,0,0,0
127,2017-06-22 15:34:21,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,vetting,1,"vetting, planning, editing, sourcing, writing",6,19,5,2,1,2,0,0,0
128,2017-06-22 15:33:11,62060850630c7afe54fd59151413d237,4.82,4.82,Declined Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,vetting,2,"vetting, planning, editing, sourcing, writing",6,19,5,2,1,2,0,0,0
129,2017-06-22 15:32:36,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,vetting,2,"vetting, planning, editing, sourcing, writing",5,19,5,2,1,2,0,0,0
130,2017-06-22 15:32:36,62060850630c7afe54fd59151413d237,4.82,4.82,Declined Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,vetting,2,"vetting, planning, editing, sourcing, writing",5,19,5,2,1,2,0,0,0
131,2017-06-22 15:32:01,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c151273dd9c002873cd98,2017-06-22 15:05:54,vetting,1,"vetting, planning, editing, sourcing, writing",5,19,5,2,1,2,0,0,0


In [41]:
df[df['Analyst'] == '62060850630c7afe54fd59151413d237']

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
96,2017-06-22 16:29:48,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c28953b593b00281250e1,2017-06-22 16:29:09,vetting,4,"vetting, planning, editing, sourcing, writing",4,15,6,6,0,0,0,0,0
97,2017-06-22 16:29:33,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c28953b593b00281250e1,2017-06-22 16:29:09,vetting,3,"vetting, planning, editing, sourcing, writing",5,15,7,6,1,0,0,0,0
100,2017-06-22 16:09:53,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c15a573dd9c002873cda1,2017-06-22 15:08:21,sourcing,1,"vetting, planning, editing, sourcing, writing",0,17,4,4,0,0,0,0,0
101,2017-06-22 16:09:47,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c15a573dd9c002873cda1,2017-06-22 15:08:21,sourcing,1,"vetting, planning, editing, sourcing, writing",1,17,5,4,0,0,0,1,0
102,2017-06-22 16:05:31,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c20b773dd9c002873cdda,2017-06-22 15:55:35,vetting,1,"vetting, planning, editing, sourcing, writing",0,18,3,2,0,0,0,1,0
103,2017-06-22 16:05:27,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c20b773dd9c002873cdda,2017-06-22 15:55:35,vetting,1,"vetting, planning, editing, sourcing, writing",1,18,4,2,1,0,0,1,0
104,2017-06-22 16:02:15,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,vetting,1,"vetting, planning, editing, sourcing, writing",0,19,3,2,1,0,0,0,0
105,2017-06-22 16:02:11,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,vetting,1,"vetting, planning, editing, sourcing, writing",1,19,4,2,2,0,0,0,0
111,2017-06-22 15:57:02,62060850630c7afe54fd59151413d237,4.82,4.82,Accepted Job,594c1f5cd7e68f0028c9062c,2017-06-22 15:49:48,vetting,1,"vetting, planning, editing, sourcing, writing",2,19,5,2,2,0,0,1,0
112,2017-06-22 15:56:33,62060850630c7afe54fd59151413d237,4.82,4.82,Assigned Job,594c1f5cd7e68f0028c9062c,2017-06-22 15:49:48,vetting,1,"vetting, planning, editing, sourcing, writing",3,19,6,2,3,0,0,1,0


In [42]:
export_csv = df.to_csv (r'clean_assignment_log.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path

In [43]:
clean_df = pd.read_csv('clean_assignment_log.csv')

In [44]:
clean_df.head()

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
0,2017-06-22 19:59:06,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Accepted Job,594bec5c95e2ce005840c23a,2017-06-22 12:12:12,review,1,review,0,13,14,4,6,2,0,1,1
1,2017-06-22 19:59:02,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Assigned Job,594bec5c95e2ce005840c23a,2017-06-22 12:12:12,review,1,review,1,13,15,5,6,2,0,1,1
2,2017-06-22 19:51:30,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Declined Job,594bec83fd2cf400280aa965,2017-06-22 12:12:51,writing,9,"sourcing, writing",1,12,12,5,5,1,0,0,1
3,2017-06-22 19:51:01,0e9802516f8a79dd0d45211dd4ee74af,4.5,4.5,Accepted Job,594c1f5cd7e68f0028c9062c,2017-06-22 15:49:48,sourcing,1,"sourcing, writing",1,11,12,5,5,1,0,0,1
4,2017-06-22 19:50:58,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594bec83fd2cf400280aa965,2017-06-22 12:12:51,writing,8,"sourcing, writing",2,11,14,5,5,2,0,1,1


In [45]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 789 entries, 0 to 788
Data columns (total 19 columns):
Event occurred at           789 non-null object
Analyst                     789 non-null object
Quality score (sourcing)    789 non-null float64
Quality score (writing)     789 non-null float64
Action                      789 non-null object
Request                     789 non-null object
Request created at          789 non-null object
Job                         789 non-null object
Wait time (min)             789 non-null int64
Waiting for                 789 non-null object
Analysts available          789 non-null int64
Analysts occupied           789 non-null int64
Total jobs available        789 non-null int64
Review jobs available       789 non-null int64
Vetting jobs available      789 non-null int64
Planning jobs available     789 non-null int64
Editing jobs available      789 non-null int64
Sourcing jobs available     789 non-null int64
Writing jobs available      789 non-nu

In [46]:
# connect to database
conn = sqlite3.connect(':memory:')

# create cursor object
cur = conn.cursor()

In [49]:
df = pd.read_csv('clean_assignment_log.csv')
df.to_sql('assignment_log', con = conn, if_exists='append', index=False)

In [48]:
cur.execute('''SELECT * FROM assignment_log;''')       # sql query
assign_df = pd.DataFrame(cur.fetchall())               # create dataframe from sql query
assign_df.columns = [x[0] for x in cur.description]    # labels dataframe columns
assign_df                                              # view dataframe

Unnamed: 0,Event occurred at,Analyst,Quality score (sourcing),Quality score (writing),Action,Request,Request created at,Job,Wait time (min),Waiting for,Analysts available,Analysts occupied,Total jobs available,Review jobs available,Vetting jobs available,Planning jobs available,Editing jobs available,Sourcing jobs available,Writing jobs available
0,2017-06-22 19:59:06,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Accepted Job,594bec5c95e2ce005840c23a,2017-06-22 12:12:12,review,1,review,0,13,14,4,6,2,0,1,1
1,2017-06-22 19:59:02,9fcbc63ff4c8bea5cea4efad782c87cf,5.0,5.0,Assigned Job,594bec5c95e2ce005840c23a,2017-06-22 12:12:12,review,1,review,1,13,15,5,6,2,0,1,1
2,2017-06-22 19:51:30,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Declined Job,594bec83fd2cf400280aa965,2017-06-22 12:12:51,writing,9,"sourcing, writing",1,12,12,5,5,1,0,0,1
3,2017-06-22 19:51:01,0e9802516f8a79dd0d45211dd4ee74af,4.5,4.5,Accepted Job,594c1f5cd7e68f0028c9062c,2017-06-22 15:49:48,sourcing,1,"sourcing, writing",1,11,12,5,5,1,0,0,1
4,2017-06-22 19:50:58,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594bec83fd2cf400280aa965,2017-06-22 12:12:51,writing,8,"sourcing, writing",2,11,14,5,5,2,0,1,1
5,2017-06-22 19:50:58,0e9802516f8a79dd0d45211dd4ee74af,4.5,4.5,Assigned Job,594c1f5cd7e68f0028c9062c,2017-06-22 15:49:48,sourcing,1,"sourcing, writing",2,11,14,5,5,2,0,1,1
6,2017-06-22 19:50:58,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Declined Job,594bec83fd2cf400280aa965,2017-06-22 12:12:51,writing,8,"sourcing, writing",2,11,14,5,5,2,0,1,1
7,2017-06-22 19:50:23,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594bec83fd2cf400280aa965,2017-06-22 12:12:51,writing,8,"sourcing, writing",1,11,14,5,5,2,0,1,1
8,2017-06-22 19:47:03,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Accepted Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,sourcing,4,"sourcing, writing",1,11,11,5,5,1,0,0,0
9,2017-06-22 19:46:56,85c7b78e76b5232cd38014ea4cdc8f56,4.35,4.35,Assigned Job,594c1e983b593b00281250ba,2017-06-22 15:46:32,sourcing,4,"sourcing, writing",2,12,13,5,5,2,0,1,0


In [55]:
cur.execute('''SELECT "Event occurred at" FROM assignment_log;''')       # sql query
time_df = pd.DataFrame(cur.fetchall())               # create dataframe from sql query
time_df.columns = [x[0] for x in cur.description]    # labels dataframe columns
time_df                                              # view dataframe

Unnamed: 0,Event occurred at
0,2017-06-22 19:59:06
1,2017-06-22 19:59:02
2,2017-06-22 19:51:30
3,2017-06-22 19:51:01
4,2017-06-22 19:50:58
...,...
1573,2017-06-21 20:21:32
1574,2017-06-21 20:21:03
1575,2017-06-21 20:20:57
1576,2017-06-21 20:15:45
