# This program prepares selected data from the 2006 8th Grade Cohort Longitudinal Study for mapping

### Begin by downloading the Cohort Workbook from [the THECB website](http://www.txhighereddata.org/index.cfm?objectId=F2CBE4A0-C90B-11E5-8D610050560100A9). 

The selected data focuses on target populations from the Texas Higher Education Strategic Plan. The target populations examined here are African American students, particularly African American male students, and Hispanic students.

### Target populations are examined by TEA Region, which is how the THECB cohort workbook presents the data


In [1]:
import pandas as pd
import requests
import zipfile
import arcpy
import io
import os
# Show at most 10 rows when a DataFrame is printed.
pd.options.display.max_rows = 10
# Work from the project folder so the relative 'Data\...' paths resolve.
# Raw string: the previous literal mixed '\\' with the invalid escapes '\J', '\D', '\M' and '\E';
# the resulting path is unchanged.
os.chdir(r'C:\Users\John\Dropbox\MapDev\Eighth Grade Cohort Map')
os.getcwd()

'C:\\Users\\John\\Dropbox\\MapDev\\Eighth Grade Cohort Map'

### Start by downloading the 2006 cohort workbook from the THECB website

The cohort workbooks are available at: http://www.txhighereddata.org/index.cfm?objectId=F2CBE4A0-C90B-11E5-8D610050560100A9

Save the workbook in 'Data\CohortWorkbook2006.xlsx'

### First get the Gender by Ethnicity Data

This next part gets us:

African American males (a 60x30 target population) by TEA region.


In [2]:
# Read the gender-by-ethnicity sheet with no header row (columns are addressed by position),
# skipping the first 6 rows of the sheet.
# `sheet_name` replaces the removed `sheetname` keyword; the raw string avoids the invalid '\C' escape.
xl = pd.read_excel(r'Data\CohortWorkbook2006.xlsx', sheet_name='TEA by Gender by Ethnicity',
                   header=None, index_col=None, skiprows=6)

#Keep the columns I need
xl2=xl[[0,1,2,3,4,17,18,21,22]]

#Drop the rows I don't need (only the first 160 rows are kept) and name the kept columns
GenEth=xl2[:160].copy() #copy so the rename acts on an independent frame, not a slice of xl2
GenEth.columns=['TEAReg','RegName','Gender','Eth', 'CohoN', 'nEnr', 'pEnr', 'nComp', 'pComp']

#Make Dataset just of African American Males (60x30TX target population)
AAmales=GenEth.loc[(GenEth['Eth']=='African American') & (GenEth['Gender']=='Male')].copy() #copy to avoid chained indexing
AAmales=AAmales.drop(['Gender','Eth'], axis=1) #Keep the columns I need
AAmales.columns=['TEAReg','RegName','AAmCoho', 'AAmnEnr', 'AAmpEnr', 'AAmnComp', 'AAmpComp']
#Scale the two rate columns by 100 (presumably stored as fractions in the workbook - verify)
AAmales['AAmpEnr']=100*AAmales['AAmpEnr']
AAmales['AAmpComp']=100*AAmales['AAmpComp']

print(AAmales)
#AAmales.to_csv('AAmales.csv', index=False)

    TEAReg         RegName  AAmCoho  AAmnEnr    AAmpEnr  AAmnComp   AAmpComp
4        1        Edinburg     24.0     13.0  54.166667       4.0  16.666667
12       2  Corpus Christi    148.0     63.0  42.567568      15.0  10.135135
20       3        Victoria    226.0    106.0  46.902655      19.0   8.407080
28       4         Houston   8803.0   4277.0  48.585709     833.0   9.462683
36       5        Beaumont    946.0    446.0  47.145877      83.0   8.773784
..     ...             ...      ...      ...        ...       ...        ...
124     16        Amarillo    152.0     75.0  49.342105      12.0   7.894737
132     17         Lubbock    257.0     88.0  34.241245      12.0   4.669261
140     18         Midland    156.0     61.0  39.102564      10.0   6.410256
148     19         El Paso    197.0     84.0  42.639594      17.0   8.629442
156     20     San Antonio   1054.0    500.0  47.438330     120.0  11.385199

[20 rows x 7 columns]


This next part gets us:

African American and Hispanic totals by region. For this, we'll collapse on ethnicity to remove gender.


In [3]:
# Per-region totals for African American and Hispanic students.
# Gender and the two rate columns (positions 2, 6 and 8 of GenEth) are removed so the
# counts can be summed across gender and the rates recomputed from the summed counts.
EthCounts = GenEth.drop(GenEth.columns[[2, 6, 8]], axis=1)


def _collapse_ethnicity(counts, ethnicity, out_columns):
    """Sum the count columns per region for one ethnicity and recompute its rates.

    Returns a tuple ``(subset, totals)``: ``subset`` is the copy of the rows of
    ``counts`` whose 'Eth' equals ``ethnicity``; ``totals`` holds one row per
    (TEAReg, RegName) with the summed counts followed by the enrollment and
    completion percentages, labelled positionally with ``out_columns``.
    """
    subset = counts.loc[counts['Eth'] == ethnicity].copy()  # copy to avoid chained indexing
    totals = subset.groupby(["TEAReg", "RegName", "Eth"], as_index=False).sum()
    # percent = 100 * count / cohort size
    totals['pEnr'] = totals['nEnr'].mul(100).div(totals['CohoN'])
    totals['pComp'] = totals['nComp'].mul(100).div(totals['CohoN'])
    totals = totals.drop('Eth', axis=1)
    totals.columns = out_columns
    return subset, totals


# African American group
AAtemp, AA = _collapse_ethnicity(
    EthCounts, 'African American',
    ['TEAReg', 'RegName', 'AACoho', 'AAnEnr', 'AAnComp', 'AApEnr', 'AApComp'])

# Hispanic group
Hisptemp, Hisp = _collapse_ethnicity(
    EthCounts, 'Hispanic',
    ['TEAReg', 'RegName', 'HisCoho', 'HisnEnr', 'HisnComp', 'HispEnr', 'HispComp'])

print(AA)
print(Hisp)
#AA.to_csv('AA.csv')
#Hisp.to_csv('Hisp.csv')

    TEAReg         RegName   AACoho  AAnEnr  AAnComp     AApEnr    AApComp
0        1        Edinburg     56.0    34.0     10.0  60.714286  17.857143
1        2  Corpus Christi    277.0   140.0     32.0  50.541516  11.552347
2        3        Victoria    434.0   224.0     54.0  51.612903  12.442396
3        4         Houston  17245.0  9436.0   2278.0  54.717309  13.209626
4        5        Beaumont   1870.0  1004.0    234.0  53.689840  12.513369
..     ...             ...      ...     ...      ...        ...        ...
15      16        Amarillo    318.0   176.0     38.0  55.345912  11.949686
16      17         Lubbock    494.0   200.0     38.0  40.485830   7.692308
17      18         Midland    323.0   135.0     30.0  41.795666   9.287926
18      19         El Paso    366.0   171.0     47.0  46.721311  12.841530
19      20     San Antonio   2034.0  1023.0    279.0  50.294985  13.716814

[20 rows x 7 columns]
    TEAReg         RegName  HisCoho  HisnEnr  HisnComp    HispEnr   HispComp


This next part gets us:

All Males by TEA Region for comparison

In [4]:
#Get total male counts by region, collapse on gender, counts only.
#Drop ethnicity and the two rate columns (positions 3, 6 and 8 of GenEth).
GenCounts=GenEth.drop(GenEth.columns[[3,6,8]], axis=1) #axis=0 for rows, axis=1 for columns
Allmalestemp=GenCounts.loc[GenCounts['Gender']=='Male'].copy() #copy to avoid chained indexing

#Sum only the count columns. The string 'Gender' column must not reach sum(): newer pandas
#no longer drops non-numeric columns silently, and an extra column would break the
#positional 7-name rename below.
Allmales=Allmalestemp.groupby(["TEAReg", "RegName"], as_index=False)[['CohoN','nEnr','nComp']].sum()

#Recompute rates from the summed counts (percent = 100 * count / cohort size)
Allmales['AllmpEnr']=100*Allmales['nEnr']/Allmales['CohoN']
Allmales['AllmpComp']=100*Allmales['nComp']/Allmales['CohoN']
Allmales.columns=['TEAReg', 'RegName','TotmCoho', 'TotmnEnr','TotmnComp','TotmpEnr','TotmpComp']


print(Allmales)
#Allmales.to_csv('Allmales.csv')

    TEAReg         RegName  TotmCoho  TotmnEnr  TotmnComp   TotmpEnr  \
0        1        Edinburg   13027.0    6924.0     2142.0  53.151148   
1        2  Corpus Christi    4177.0    1981.0      563.0  47.426383   
2        3        Victoria    2161.0    1024.0      401.0  47.385470   
3        4         Houston   38121.0   19462.0     6838.0  51.053225   
4        5        Beaumont    3130.0    1516.0      524.0  48.434505   
..     ...             ...       ...       ...        ...        ...   
15      16        Amarillo    3026.0    1556.0      600.0  51.421018   
16      17         Lubbock    2979.0    1403.0      488.0  47.096341   
17      18         Midland    2987.0    1313.0      407.0  43.957148   
18      19         El Paso    6592.0    3569.0      943.0  54.141383   
19      20     San Antonio   13961.0    6735.0     2205.0  48.241530   

    TotmpComp  
0   16.442773  
1   13.478573  
2   18.556224  
3   17.937620  
4   16.741214  
..        ...  
15  19.828156  
16  16.

In [5]:
# Read the economic-status sheet with no header row, skipping the first 6 rows of the sheet.
# `sheet_name` replaces the removed `sheetname` keyword; the raw string avoids the invalid '\C' escape.
xlEcon = pd.read_excel(r'Data\CohortWorkbook2006.xlsx', sheet_name='TEA Region by Eco',
                       header=None, index_col=None, skiprows=6)

#Keep the columns I need
xlEcon2=xlEcon[[0,1,2,3,16,17,20,21]]
#Keep only the rows labelled 'Economically Disadvantaged' in column 2
EconTemp=xlEcon2.loc[xlEcon2[2]=='Economically Disadvantaged'].copy()

#The label column is constant after the filter, so drop it
EconTemp2=EconTemp.drop([2], axis=1).copy()

#Get Region Totals and drop the rows I don't need (only the first 20 rows are kept)
Econ=EconTemp2[:20].copy()
Econ.columns=['TEAReg','RegName','EcoCoho', 'EconEnr', 'EcopEnr', 'EconComp', 'EcopComp']

#Scale the two rate columns by 100 (presumably stored as fractions in the workbook - verify)
Econ['EcopEnr']=100*Econ['EcopEnr']
Econ['EcopComp']=100*Econ['EcopComp']
print(Econ)

   TEAReg         RegName  EcoCoho  EconEnr    EcopEnr  EconComp   EcopComp
1       1        Edinburg  22128.0  11836.0  53.488792    4021.0  18.171547
3       2  Corpus Christi   4755.0   1973.0  41.493165     433.0   9.106204
5       3        Victoria   2209.0    821.0  37.166139     231.0  10.457220
7       4         Houston  37986.0  16011.0  42.149739    4282.0  11.272574
9       5        Beaumont   3099.0   1356.0  43.756050     362.0  11.681187
..    ...             ...      ...      ...        ...       ...        ...
31     16        Amarillo   3040.0   1357.0  44.638158     374.0  12.302632
33     17         Lubbock   3233.0   1188.0  36.746056     284.0   8.784411
35     18         Midland   3013.0   1152.0  38.234318     301.0   9.990043
37     19         El Paso   9690.0   5214.0  53.808050    1502.0  15.500516
39     20     San Antonio  16670.0   7282.0  43.683263    1947.0  11.679664

[20 rows x 7 columns]


# Here we get overall totals by region for comparison

 1. All students for comparison to Ethnicity breakouts


In [6]:
# Read the Summary sheet with no header row, skipping its first 16 rows.
# `sheet_name` replaces the removed `sheetname` keyword; the raw string avoids the invalid '\C' escape.
xl = pd.read_excel(r'Data\CohortWorkbook2006.xlsx', sheet_name='Summary',
                   header=None, index_col=None, skiprows=16)

#Keep the columns I need
xl2=xl[[0,1,2,15,16,19,20]]

#Get Region Totals and drop the rows I don't need (only the first 20 rows are kept)
RegTotals=xl2[:20].copy()
RegTotals.columns=['TEAReg','RegName','TotCoho', 'TotnEnr', 'TotpEnr', 'TotnComp', 'TotpComp']

#Scale the two rate columns by 100 (presumably stored as fractions in the workbook - verify)
RegTotals['TotpEnr']=100*RegTotals['TotpEnr']
RegTotals['TotpComp']=100*RegTotals['TotpComp']

print(RegTotals)
#RegTotals.to_csv('RegTotals.csv', index=False)

   TEAReg         RegName  TotCoho  TotnEnr    TotpEnr  TotnComp   TotpComp
0       1        Edinburg  25852.0  14627.0  56.579762    5322.0  20.586415
1       2  Corpus Christi   8181.0   4254.0  51.998533    1411.0  17.247280
2       3        Victoria   4068.0   2136.0  52.507375     929.0  22.836775
3       4         Houston  74103.0  40647.0  54.852030   16107.0  21.735962
4       5        Beaumont   6140.0   3332.0  54.267101    1274.0  20.749186
..    ...             ...      ...      ...        ...       ...        ...
15     16        Amarillo   5944.0   3384.0  56.931359    1405.0  23.637281
16     17         Lubbock   5776.0   2936.0  50.831025    1159.0  20.065789
17     18         Midland   5856.0   2894.0  49.419399    1031.0  17.605874
18     19         El Paso  12980.0   7454.0  57.426810    2405.0  18.528505
19     20     San Antonio  27156.0  14189.0  52.249963    5241.0  19.299602

[20 rows x 7 columns]


### Now get statewide Cohort totals for Hispanics and African Americans

In [7]:
# Read the Summary sheet with no header row, skipping its first 38 rows.
# `sheet_name` replaces the removed `sheetname` keyword; the raw string avoids the invalid '\C' escape.
xl = pd.read_excel(r'Data\CohortWorkbook2006.xlsx', sheet_name='Summary',
                   header=None, index_col=None, skiprows=38)

#Keep the columns I need
xl2=xl[[0,1,2]]

#Keep the first 8 rows and name the columns
StatewideCohortTotals=xl2[:8].copy() #copy so the rename acts on an independent frame, not a slice of xl2
StatewideCohortTotals.columns=['Gender','Eth','Cohort']

#Get statewide cohort totals per ethnicity (summed across gender).
#Only 'Cohort' is summed: the string 'Gender' column is not a quantity, and newer pandas
#no longer drops non-numeric columns silently.
StatewideCohortTotals=StatewideCohortTotals.groupby(["Eth"])[['Cohort']].sum()

print(StatewideCohortTotals)
#StatewideCohortTotals.to_csv('StatewideCohortTotals.csv', index=False)

                    Cohort
Eth                       
African American   50671.0
Hispanic          144200.0
Others             11111.0
White             129726.0


### Now get statewide Cohort totals for Econ Disadvantage

In [8]:
# Read the Summary sheet with no header row, skipping its first 52 rows.
# `sheet_name` replaces the removed `sheetname` keyword; the raw string avoids the invalid '\C' escape.
xl = pd.read_excel(r'Data\CohortWorkbook2006.xlsx', sheet_name='Summary',
                   header=None, index_col=None, skiprows=52)

#Keep the columns I need
xl2=xl[[0,1,2]]

#Keep the first 4 rows and name the columns
#NOTE(review): the middle column is labelled 'Eth' as in the previous cell; its real meaning on
#this part of the sheet is not visible here - confirm against the workbook.
StatewideCohortEcon=xl2[:4].copy() #copy so the rename acts on an independent frame, not a slice of xl2
StatewideCohortEcon.columns=['Eco','Eth','Cohort']

#Get the statewide cohort total per economic-status label.
#Only 'Cohort' is summed so non-numeric columns never reach sum().
StatewideCohortEcon=StatewideCohortEcon.groupby(["Eco"])[['Cohort']].sum()

print(StatewideCohortEcon)

                            Cohort
Eco                               
Economically Disadvantaged  177058


### And now merge the tables

In [9]:
#Merge the per-group tables into one row per TEA region.
#pd.merge defaults to an inner join, so only regions present in every table are kept.
merge_keys=['TEAReg', 'RegName']
All=pd.merge(AA, AAmales, on=merge_keys)
for table in (Hisp, Allmales, RegTotals, Econ):
    All=pd.merge(All, table, on=merge_keys)


#Calculate Hisp, AA, and Econ % of statewide cohort for each TEA Region
All['AATXCoho']=StatewideCohortTotals.loc['African American','Cohort']
All['HisTXCoho']=StatewideCohortTotals.loc['Hispanic','Cohort']
All['EcoTXCoho']=StatewideCohortEcon.loc['Economically Disadvantaged','Cohort']
All['AApTXCoho']=100*All['AACoho']/All['AATXCoho']
All['HispTXCoho']=100*All['HisCoho']/All['HisTXCoho']
All['EcopTXCoho']=100*All['EcoCoho']/All['EcoTXCoho']


#Calculate % point differences for AA/Hisp/AAmales/Eco enrollment and completion rates from total
#(African American males are compared with all males, the other groups with all students)
All['AAEnrpDi']=All['AApEnr']-All['TotpEnr']
All['HisEnrpDi']=All['HispEnr']-All['TotpEnr']
All['AAmEnrpDi']=All['AAmpEnr']-All['TotmpEnr']
All['EcoEnrpDi']=All['EcopEnr']-All['TotpEnr']
All['AAComppDi']=All['AApComp']-All['TotpComp']
All['HisComppDi']=All['HispComp']-All['TotpComp']
All['AAmComppDi']=All['AAmpComp']-All['TotmpComp']
All['EcoComppDi']=All['EcopComp']-All['TotpComp']

#Drop the statewide totals, which were only needed for the shares above
Final=All.drop(['HisTXCoho','AATXCoho', 'EcoTXCoho'], axis=1).copy()


#Make perc of total for AA, Hisp, Eco, and AA_males
Final['AApCoho']=100*All['AACoho']/All['TotCoho']
Final['HispCoho']=100*All['HisCoho']/All['TotCoho']
Final['AAmpCoho']=100*All['AAmCoho']/All['TotmCoho']
Final['EcopCoho']=100*All['EcoCoho']/All['TotCoho']

#Make copies that will be rounded to zero decimals to use as symbol layers.
#A list of pairs (not a dict) keeps the column order of the CSV fixed on every Python version.
symbol_columns=[('TotpEnr_','TotpEnr'), ('TotpComp_','TotpComp'),
                ('AApCoho_','AApCoho'), ('AAmpCoho_','AAmpCoho'),
                ('HispCoho_','HispCoho'), ('EcopCoho_','EcopCoho'),
                ('AAComppD_','AAComppDi'), ('AAmComppD_','AAmComppDi'),
                ('HisComppD_','HisComppDi'), ('EcoComppD_','EcoComppDi'),
                ('AAEnrpD_','AAEnrpDi'), ('AAmEnrpD_','AAmEnrpDi'),
                ('HisEnrpD_','HisEnrpDi'), ('EcoEnrpD_','EcoEnrpDi')]
for symbol_col, source_col in symbol_columns:
    Final[symbol_col]=Final[source_col]


#Set percentages to have just one decimal place and the symbol-layer copies to have none
one_decimal=['AApEnr', 'AApComp', 'AAmpEnr', 'AAmpComp',
             'HispEnr', 'HispComp', 'TotmpEnr', 'TotmpComp',
             'TotpEnr', 'TotpComp', 'AApTXCoho', 'AAEnrpDi',
             'HisEnrpDi', 'AAmEnrpDi', 'AAComppDi', 'HisComppDi',
             'AAmComppDi', 'AApCoho', 'HispCoho', 'AAmpCoho',
             'EcopEnr', 'EcopComp', 'EcoEnrpDi', 'EcoComppDi',
             'EcopTXCoho', 'HispTXCoho', 'EcopCoho']
round_spec={col: 1 for col in one_decimal}
round_spec.update({symbol_col: 0 for symbol_col, _ in symbol_columns})
Processed=Final.round(round_spec)

#Write the table once (the export and print were previously duplicated)
Processed.to_csv('ProcessedData.csv', index=False)
print(Processed)

   TEAReg         RegName   AACoho  AAnEnr  AAnComp  AApEnr  AApComp  AAmCoho  \
0       1        Edinburg     56.0    34.0     10.0    60.7     17.9     24.0   
1       2  Corpus Christi    277.0   140.0     32.0    50.5     11.6    148.0   
2       3        Victoria    434.0   224.0     54.0    51.6     12.4    226.0   
3       4         Houston  17245.0  9436.0   2278.0    54.7     13.2   8803.0   
4       5        Beaumont   1870.0  1004.0    234.0    53.7     12.5    946.0   
..    ...             ...      ...     ...      ...     ...      ...      ...   
15     16        Amarillo    318.0   176.0     38.0    55.3     11.9    152.0   
16     17         Lubbock    494.0   200.0     38.0    40.5      7.7    257.0   
17     18         Midland    323.0   135.0     30.0    41.8      9.3    156.0   
18     19         El Paso    366.0   171.0     47.0    46.7     12.8    197.0   
19     20     San Antonio   2034.0  1023.0    279.0    50.3     13.7   1054.0   

    AAmnEnr  AAmpEnr    ...

# The rest of the code prepares the shapefiles for mapping.

### We'll need:
    
* Polygons for TEA Regions [available from TEA](http://schoolsdata2-tea-texas.opendata.arcgis.com)
* Centroids (points) for TEA Regions
    
    

In [10]:
#get TEARegion file and unzip
response=requests.get('https://opendata.arcgis.com/datasets/12142ff8beec4a1797334c9c41ba7b18_0.zip',
                      timeout=60) #don't hang forever if the server never answers
response.raise_for_status() #stop on an HTTP error instead of trying to unzip an error page
#The archive is held in memory; the context manager closes it after extraction
with zipfile.ZipFile(io.BytesIO(response.content)) as zippedRegions:
    zippedRegions.extractall('Data/rawESC_Regions')

#Delete unnecessary fields
arcpy.DeleteField_management("Data/rawESC_Regions/ESC_Regions.shp", 
                             ["FID_1", "OBJECTID", "CITY", 'REGION', 'ORG_E_ID', 'WEBSITE', 'SHAPE_Leng'])

<Result 'Data/rawESC_Regions/ESC_Regions.shp'>

In [11]:
# Create a File Geodatabase and copy shapefile data.
# The geodatabase only needs to be created the first time the code is run, so the
# creation is skipped when it already exists (no manual commenting/uncommenting needed).
if not arcpy.Exists('Data/Cohort.gdb'):
    arcpy.CreateFileGDB_management('Data',"Cohort.gdb")

arcpy.FeatureClassToGeodatabase_conversion('Data/rawESC_Regions/ESC_Regions.shp', 'Data/Cohort.gdb')

#List fields in dataset
fields = arcpy.ListFields('Data/Cohort.gdb/ESC_Regions')

for field in fields:
    print("{0} is a type of {1} with a length of {2}"
          .format(field.name, field.type, field.length))

OBJECTID is a type of OID with a length of 4
Shape is a type of Geometry with a length of 0
Shape_Length is a type of Double with a length of 8
Shape_Area is a type of Double with a length of 8


In [12]:
# Load the processed CSV into the geodatabase as the table 'CohortData'
cohort_gdb = 'Data/Cohort.gdb'
arcpy.TableToTable_conversion('ProcessedData.csv', cohort_gdb, 'CohortData')

# Attach the cohort columns to the TEA Region polygons, matching polygon OBJECTID to TEAReg.
# NOTE(review): this assumes each polygon's OBJECTID equals its TEA region number - confirm.
arcpy.JoinField_management(cohort_gdb + '/ESC_Regions', 'OBJECTID',
                           cohort_gdb + '/CohortData', 'TEAReg')

<Result 'Data/Cohort.gdb/ESC_Regions'>

In [13]:
#Create the output folder; exist_ok lets the cell be re-run without a FileExistsError
os.makedirs('Data/FinalShapefiles', exist_ok=True)
#Export merged TEARegions with Cohort data to shapefile
arcpy.FeatureClassToShapefile_conversion ('Data/Cohort.gdb/ESC_Regions', 'Data/FinalShapefiles')

<Result 'Data\\FinalShapefiles'>

### Now make the centroids for the TEA Regions

(Requires the advanced license)

In [14]:
#  Set local variables: input polygons and the point feature class to create
inFeatures = "Data/Cohort.gdb/ESC_Regions"
outFeatureClass = "Data/Cohort.gdb/ESC_Points"

# Use the FeatureToPoint function to create a point for each TEA Region polygon.
# NOTE(review): no point-location option is passed, so the tool's default placement is
# used - confirm whether each point is required to fall inside its polygon.
arcpy.FeatureToPoint_management(inFeatures, outFeatureClass)

<Result 'Data\\Cohort.gdb\\ESC_Points'>

In [15]:
#Export the TEA Region points (ESC_Points) from the geodatabase to a shapefile in Data/FinalShapefiles
arcpy.FeatureClassToShapefile_conversion ('Data/Cohort.gdb/ESC_Points', 'Data/FinalShapefiles')

<Result 'Data\\FinalShapefiles'>

### Now, go to Linux and use GDAL to convert the shapefiles to GeoJSON. Then use the Tippecanoe tool to make the .mbtiles file

I used the following commands:

* ogr2ogr -f GeoJSON Cohort2006TEARegionPolys.json Data/FinalShapefiles/ESC_Regions.shp -progress
* ogr2ogr -f GeoJSON Cohort2006TEARegionPoints.json Data/FinalShapefiles/ESC_Points.shp -progress
* tippecanoe --output=Cohort2006TEARegionData.mbtiles Cohort2006TEARegionPoints.json Cohort2006TEARegionPolys.json -r1 --drop-fraction-as-needed  --simplification=9 --maximum-zoom=15 --minimum-zoom=3 --exclude=OBJECTID_1 --detect-shared-borders