In [1]:
# Import Library Dependencies
from sqlalchemy import create_engine, engine, inspect
from config import user, password

import datetime as dt
import numpy as np
import pandas as pd
import os

In [2]:
# Establish CSV Data Path Reference
csvStudentRecords = '2019-06-22-StudentMobility.csv'
csvDistrictSchools = '2019-06-26-DistrictSchools.csv'

csvUploads = {
    'records':os.path.join('Datasets', csvStudentRecords),
    'schools':os.path.join('Datasets', csvDistrictSchools)
}

In [3]:
# Define & Rearrange Desired Column Headers
import_headers = [
    'NID',  # Student Id
    'GR',   # Grade`
    'SC',   # School Code
    'PR',   # School Program
    'ED',   # Entry Date
    'LD',   # Leave Date
    'ER'    # Exit Reason**
]

In [4]:
# Read CSV Data Into DataFrame
master_df = pd.read_csv(csvUploads['records'], low_memory=False)[import_headers]
master_df.head(10)

Unnamed: 0,NID,GR,SC,PR,ED,LD,ER
0,300265.0,12.0,27.0,,8/9/2017,5/24/2018,230.0
1,300456.0,12.0,28.0,,8/9/2017,5/24/2018,230.0
2,301428.0,12.0,23.0,,8/9/2017,5/24/2018,230.0
3,301661.0,12.0,20.0,,8/9/2017,9/6/2017,160.0
4,301759.0,12.0,61.0,I,8/9/2017,12/22/2017,167.0
5,303104.0,12.0,68.0,,8/9/2017,3/9/2018,165.0
6,303301.0,12.0,22.0,,8/9/2017,5/24/2018,167.0
7,303962.0,12.0,27.0,,8/9/2017,5/24/2018,230.0
8,304281.0,12.0,28.0,,8/9/2017,5/24/2018,230.0
9,304396.0,12.0,22.0,,8/9/2017,11/13/2017,440.0


In [5]:
# Rename Column Headers to Match Database Field Names
update_headers = {
    'NID': 'student_id',
    'GR': 'grade_level',
    'SC': 'school_id',
    'PR': 'program_id',
    'ED': 'entry_date',
    'LD': 'leave_date',
    'ER': 'exit_reason'
}

master_df = master_df.rename(columns=update_headers)
master_df.head()

Unnamed: 0,student_id,grade_level,school_id,program_id,entry_date,leave_date,exit_reason
0,300265.0,12.0,27.0,,8/9/2017,5/24/2018,230.0
1,300456.0,12.0,28.0,,8/9/2017,5/24/2018,230.0
2,301428.0,12.0,23.0,,8/9/2017,5/24/2018,230.0
3,301661.0,12.0,20.0,,8/9/2017,9/6/2017,160.0
4,301759.0,12.0,61.0,I,8/9/2017,12/22/2017,167.0


In [6]:
# Preview Dataset -- Check for Missing Data (Data Value Should Be Equal)
master_df.count()

student_id     62774
grade_level    62774
school_id      62774
program_id      7340
entry_date     62774
leave_date     21789
exit_reason    21787
dtype: int64

In [7]:
# Reduce/Truncate Dataset
# Truncate Dataset to Records Containing an Exit Reason
reduced_df = master_df.loc[master_df['exit_reason'].isnull()==False]

# Fill Remaining Missing Data In Program_Id Column With String Values For Later Data Conversion
reduced_df['program_id'] = reduced_df['program_id'].fillna('Gen')
reduced_df.count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


student_id     21787
grade_level    21787
school_id      21787
program_id     21787
entry_date     21787
leave_date     21787
exit_reason    21787
dtype: int64

In [8]:
# Specify Required Column Data Types
fmt_dates = [
    'entry_date', 
    'leave_date'
]

dtype = {
    'student_id':int,
    'grade_level':int,
    'school_id':int,
    'program_id':str,
    'exit_reason':int
}

# Preview Current Column Data Types
reduced_df.dtypes

student_id     float64
grade_level    float64
school_id      float64
program_id      object
entry_date      object
leave_date      object
exit_reason    float64
dtype: object

In [9]:
# Data Conversion on DataFrame Colums to Match Those Required By MySQL Database
converted_df = reduced_df.astype(dtype, errors='ignore')
converted_df[fmt_dates] = converted_df[fmt_dates].apply(pd.to_datetime, errors='ignore')
converted_df.dtypes

student_id              int32
grade_level             int32
school_id               int32
program_id             object
entry_date     datetime64[ns]
leave_date     datetime64[ns]
exit_reason             int32
dtype: object

In [10]:
# Create DataFrames For Initial MySQL Database Upload
# <Create> Students DataFrame Upload: 
students_df = pd.DataFrame()
students_df['id'] = converted_df['student_id'].unique()
students_df['fname'] = 'Protected'
students_df['lname'] = 'Protected'
students_df['updated_on'] = dt.datetime.today()
students_df['updated_by'] = 'Admin_User1'

students_df.head()

Unnamed: 0,id,fname,lname,updated_on,updated_by
0,300265,Protected,Protected,2019-06-29 08:41:35.707971,Admin_User1
1,300456,Protected,Protected,2019-06-29 08:41:35.707971,Admin_User1
2,301428,Protected,Protected,2019-06-29 08:41:35.707971,Admin_User1
3,301661,Protected,Protected,2019-06-29 08:41:35.707971,Admin_User1
4,301759,Protected,Protected,2019-06-29 08:41:35.707971,Admin_User1


In [11]:
# <Create> Schools DataFrame Upload:
school_headers = [
    'id',
    'name',
    'address',
    'city',
    'state',
    'zipcode',
    'lat',
    'lon'
]
schools_df = pd.read_csv(csvUploads['schools'])[school_headers]
schools_df.head()

Unnamed: 0,id,name,address,city,state,zipcode,lat,lon
0,20,Central Middle,1302 Queen Emma St,Honolulu,HI,96813,21.31166,-157.85664
1,21,Chiefess Kamakahelei Middle,"4431 Nuhou St, Lihue, HI 96766",Lihue,HI,96766,21.96699,-159.38714
2,22,Ewa Makai Middle,91-6291 Kapolei Parkway,Ewa Beach,HI,96706,21.31601,-158.02153
3,23,Governor Samuel Wilder King Intermediate,46-155 Kamehameha Hwy,Kaneohe,HI,96744,21.428209,-157.805603
4,24,Governor Sanford B. Dole Middle,1803 Kamehameha IV Road,Honolulu,HI,96819,21.3431,-157.871475


In [12]:
# <Create> Student Records DataFrame Upload
record_headers = [
    'student_id',
    'school_id',
    'program_id',
    'grade_level',
    'entry_date',
    'leave_date',
    'exit_reason'
]

records_df = converted_df[record_headers]
records_df.head()

Unnamed: 0,student_id,school_id,program_id,grade_level,entry_date,leave_date,exit_reason
0,300265,27,Gen,12,2017-08-09,2018-05-24,230
1,300456,28,Gen,12,2017-08-09,2018-05-24,230
2,301428,23,Gen,12,2017-08-09,2018-05-24,230
3,301661,20,Gen,12,2017-08-09,2017-09-06,160
4,301759,61,I,12,2017-08-09,2017-12-22,167


In [13]:
# <Create> School Programs DataFrame Upload
programs_df = pd.DataFrame()
programs_df['id'] = converted_df['program_id'].unique()
programs_df['name'] = 'Protected'
programs_df

Unnamed: 0,id,name
0,Gen,Protected
1,I,Protected
2,S,Protected
3,C,Protected
4,H,Protected
5,N,Protected


In [14]:
# Upload Student Profile Data to 'students' Table in student_mobility database
MySQL_DB_Connection = f'{user}:{password}@localhost/student_mobility'
    
engine = create_engine(f'mysql://{MySQL_DB_Connection}', echo=True)
engine.table_names()

2019-06-29 08:41:35,801 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'sql_mode'
2019-06-29 08:41:35,802 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:35,805 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2019-06-29 08:41:35,806 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:35,808 INFO sqlalchemy.engine.base.Engine SELECT DATABASE()
2019-06-29 08:41:35,809 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:35,810 INFO sqlalchemy.engine.base.Engine show collation where `Charset` = 'utf8mb4' and `Collation` = 'utf8mb4_bin'
2019-06-29 08:41:35,811 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:35,813 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS CHAR(60)) AS anon_1
2019-06-29 08:41:35,813 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:35,815 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS CHAR(60)) AS anon_1
2019-06-29 08:41:35,816 INFO sqlalchemy.engine.base.E

['programs', 'records', 'schools', 'students']

In [15]:
# [Upload #1] Upload Student Table Data
students_df.to_sql(
    name='students', 
    con=engine, 
    if_exists='append',
    index=False
)

2019-06-29 08:41:35,837 INFO sqlalchemy.engine.base.Engine DESCRIBE `students`
2019-06-29 08:41:35,838 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:35,844 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2019-06-29 08:41:35,899 INFO sqlalchemy.engine.base.Engine INSERT INTO students (id, fname, lname, updated_on, updated_by) VALUES (%s, %s, %s, %s, %s)
2019-06-29 08:41:35,900 INFO sqlalchemy.engine.base.Engine ((300265, 'Protected', 'Protected', datetime.datetime(2019, 6, 29, 8, 41, 35, 707971), 'Admin_User1'), (300456, 'Protected', 'Protected', datetime.datetime(2019, 6, 29, 8, 41, 35, 707971), 'Admin_User1'), (301428, 'Protected', 'Protected', datetime.datetime(2019, 6, 29, 8, 41, 35, 707971), 'Admin_User1'), (301661, 'Protected', 'Protected', datetime.datetime(2019, 6, 29, 8, 41, 35, 707971), 'Admin_User1'), (301759, 'Protected', 'Protected', datetime.datetime(2019, 6, 29, 8, 41, 35, 707971), 'Admin_User1'), (303104, 'Protected', 'Protected', datetime.datetime(2019, 6, 

In [16]:
# [Upload #2] Upload School Table Data
schools_df.to_sql(
    name='schools',
    con=engine,
    if_exists='append',
    index=False
)

2019-06-29 08:41:36,429 INFO sqlalchemy.engine.base.Engine DESCRIBE `schools`
2019-06-29 08:41:36,430 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:36,433 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2019-06-29 08:41:36,434 INFO sqlalchemy.engine.base.Engine INSERT INTO schools (id, name, address, city, state, zipcode, lat, lon) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
2019-06-29 08:41:36,435 INFO sqlalchemy.engine.base.Engine ((20, 'Central Middle', '1302 Queen Emma St', 'Honolulu', 'HI', 96813, 21.31166, -157.85664), (21, 'Chiefess Kamakahelei Middle', '4431 Nuhou St, Lihue, HI 96766', 'Lihue', 'HI', 96766, 21.96699, -159.38714), (22, 'Ewa Makai Middle', '91-6291 Kapolei Parkway', 'Ewa Beach', 'HI', 96706, 21.316010000000002, -158.02153), (23, 'Governor Samuel Wilder King Intermediate', '46-155 Kamehameha Hwy', 'Kaneohe', 'HI', 96744, 21.428209, -157.805603), (24, 'Governor Sanford B. Dole Middle', '1803 Kamehameha IV Road', 'Honolulu', 'HI', 96819, 21.3431, -157.87147

In [17]:
# [Upload #3] Upload School Program Data 
programs_df.to_sql(
    name='programs',
    con=engine,
    if_exists='append',
    index=False
)

2019-06-29 08:41:36,447 INFO sqlalchemy.engine.base.Engine DESCRIBE `programs`
2019-06-29 08:41:36,448 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:36,451 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2019-06-29 08:41:36,453 INFO sqlalchemy.engine.base.Engine INSERT INTO programs (id, name) VALUES (%s, %s)
2019-06-29 08:41:36,454 INFO sqlalchemy.engine.base.Engine (('Gen', 'Protected'), ('I', 'Protected'), ('S', 'Protected'), ('C', 'Protected'), ('H', 'Protected'), ('N', 'Protected'))
2019-06-29 08:41:36,455 INFO sqlalchemy.engine.base.Engine COMMIT


In [18]:
# [Upload #4] Upload Student Record Data
records_df.to_sql(
    name='records',
    con=engine,
    if_exists='append',
    index=False
)

2019-06-29 08:41:36,468 INFO sqlalchemy.engine.base.Engine DESCRIBE `records`
2019-06-29 08:41:36,469 INFO sqlalchemy.engine.base.Engine ()
2019-06-29 08:41:36,479 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2019-06-29 08:41:36,589 INFO sqlalchemy.engine.base.Engine INSERT INTO records (student_id, school_id, program_id, grade_level, entry_date, leave_date, exit_reason) VALUES (%s, %s, %s, %s, %s, %s, %s)
2019-06-29 08:41:36,590 INFO sqlalchemy.engine.base.Engine ((300265, 27, 'Gen', 12, datetime.datetime(2017, 8, 9, 0, 0), datetime.datetime(2018, 5, 24, 0, 0), 230), (300456, 28, 'Gen', 12, datetime.datetime(2017, 8, 9, 0, 0), datetime.datetime(2018, 5, 24, 0, 0), 230), (301428, 23, 'Gen', 12, datetime.datetime(2017, 8, 9, 0, 0), datetime.datetime(2018, 5, 24, 0, 0), 230), (301661, 20, 'Gen', 12, datetime.datetime(2017, 8, 9, 0, 0), datetime.datetime(2017, 9, 6, 0, 0), 160), (301759, 61, 'I', 12, datetime.datetime(2017, 8, 9, 0, 0), datetime.datetime(2017, 12, 22, 0, 0), 167), 

In [19]:
print('Initial MySQL Data Upload: <Complete>')

Initial MySQL Data Upload: <Complete>
