## PASSNYC
PASSNYC is a not-for-profit organization that facilitates a collective impact effort dedicated to broadening educational opportunities for New York City's talented and underserved students. New York City is home to some of the most impressive educational institutions in the world, yet in recent years, the City’s specialized high schools - institutions with historically transformative impact on student outcomes - have seen a shift toward more homogeneous student body demographics.

PASSNYC uses public data to identify students within New York City’s under-performing school districts and, through consulting and collaboration with partners, aims to increase the diversity of students taking the Specialized High School Admissions Test (SHSAT). By focusing efforts in under-performing areas that are historically underrepresented in SHSAT registration, we will help pave the path to specialized high schools for a more diverse group of students.

In [None]:
#importing libraries
import numpy as np
import matplotlib.pyplot as plt
from PIL import  Image
%matplotlib inline
# Load the banner image and draw it without axes.
# The context manager closes the underlying file once the pixel data has
# been copied into the array.
with Image.open(r"../input/image-nyc/download.png") as banner:
    img = np.array(banner)
fig = plt.figure(figsize=(15,15))
plt.imshow(img,interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:

import pandas as pd
import seaborn as sns
import itertools
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud,STOPWORDS
import io
import base64
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.basemap import Basemap
import folium
import folium.plugins
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
#print(os.listdir("../input"))


## Schools - Data
Consolidated school characteristics such as student demographics and standardized test performance from public data sources for 1270 NYC public elementary and middle schools

In [None]:
# Load the 2016 School Explorer table into `schools` and preview the first three rows.
schools  = pd.read_csv(r"../input/data-science-for-good/2016 School Explorer.csv")
schools.head(3)



## SHSAT (Specialized High School Test) data :
District 5 (Central Harlem) SHSAT (Specialized High School Test) data from the NYC Department of Education: Time series enrollment, SHSAT registrations & participation for that community

In [None]:
# Load the District 5 SHSAT registrations/testers table into `shsat` and preview the first three rows.
shsat = pd.read_csv(r"../input/data-science-for-good/D5 SHSAT Registrations and Testers.csv")
shsat.head(3)

# Data Dimensions

In [None]:
# Report the (rows, columns) shape of both data frames.
school_rows, school_cols = schools.shape
shsat_rows, shsat_cols = shsat.shape
divider = "==============="

print ("Schools Data :")
print (divider)
print ("Number of rows : ",school_rows,"; Number of columns : ",school_cols)
print ("     ")
print ("SHSAT test Data : ")
print (divider)
print ("Number of rows : ",shsat_rows," ; Number of columns : ",shsat_cols)



# Missing values percentage - Schools data

In [None]:
sns.set_style("darkgrid")

# Null count per column; reset_index() yields the columns "index" (the
# school column name) and 0 (the number of nulls).
miss_school = schools.isnull().sum().reset_index()
miss_school = miss_school[miss_school[0] != 0]
miss_school["percentage"] = (miss_school[0]*100)/schools.shape[0]
miss_school = miss_school.sort_values(by = "percentage",ascending =False)

plt.figure(figsize=(8,10))
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally. A single "k" edge colour replaces the "k"*n string,
# which relied on matplotlib reading a string as a sequence of colours.
ax = sns.barplot(x="percentage",y="index",data=miss_school,
                 linewidth=1 ,edgecolor="k")
plt.ylabel("columns")
# Annotate every bar with its percentage, rounded to one decimal place.
for i,j in enumerate(np.around(miss_school["percentage"],1).astype(str) + " %"):
    ax.text(.7,i,j ,weight = "bold")
plt.title("Percentage of missing values in Schools data")
plt.grid(True)
plt.show()

In [None]:
# Drop every column whose share of missing values exceeds 40%.
# The threshold is applied to the computed percentages instead of assuming
# that exactly the first three rows of miss_school qualify.
m_cols = miss_school.loc[miss_school["percentage"] > 40,"index"].tolist()
# errors="ignore" keeps the cell safe to re-run once the columns are gone.
schools = schools.drop(columns=m_cols,errors="ignore")

# Percentage of community schools
* About 6% of the schools are community schools.

In [None]:
# Donut chart of the "Community School?" value counts.
community_counts = schools["Community School?"].value_counts()
lab = community_counts.keys()
val = community_counts.values

pie_marker = dict(line = dict(width = 1))

trace = go.Pie(name  = "Community Schools ",
               labels = lab,
               values = val,
               hole = .5,
               domain = dict(x = [0,0.48]),
               hoverinfo = "label+percent+name",
               marker = pie_marker)

data = [trace]
layout = go.Layout(title = "Percentage of Community Schools")

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

## Location map for community schools  & non community schools

In [None]:
# Row counts on each side of the "Community School?" flag.
community_flag = schools["Community School?"]
print ("Count of community schools     : ", len(schools[community_flag == "Yes"]))
print ("Count of non-community schools : ", len(schools[community_flag == "No"]))

In [None]:
# Split the schools by the community flag and pull out coordinates and names.
community = schools[schools["Community School?"] == "Yes"]
non_community = schools[schools["Community School?"] == "No"]

lat = community["Latitude"].tolist()
lon = community["Longitude"].tolist()
pop = community["School Name"].tolist()

lat1 = non_community["Latitude"].tolist()
lon1 = non_community["Longitude"].tolist()
pop1 = non_community["School Name"].tolist()

m1 = folium.Map(location=[40.70,-73.93],tiles="openstreetmap",max_zoom=15)

# Non-community schools: blue markers without a popup.
for school_lat, school_lon in zip(lat1,lon1):
    blue_marker = folium.CircleMarker(location=[school_lat,school_lon],
                                      radius=1,
                                      fill =True,
                                      fill_color = "blue",
                                      color = "royalblue")
    blue_marker.add_to(m1)

# Community schools: crimson markers with the school name as popup.
for school_lat, school_lon, school_name in zip(lat,lon,pop):
    red_marker = folium.CircleMarker(location=[school_lat,school_lon],
                                     radius=1,
                                     fill =True,
                                     fill_color = "crimson",
                                     color = "crimson",
                                     popup=school_name)
    red_marker.add_to(m1)

m1

# Schools by district
* Districts 9 and 10 in the Bronx and District 31 in Richmond have the highest number of schools.

In [None]:
# Number of schools per district, ordered by district number.
dist = schools["District"].value_counts().reset_index()
dist.columns = ["district","count"]
dist = dist.sort_values(by = "district" ,ascending = True).reset_index(drop=True)

sch_dist = pd.read_csv(r"../input/nyc-school-district-breakdowns/school-district-breakdowns.csv")

# NOTE(review): this join is positional (row i of dist with row i of
# sch_dist). It presumably relies on sch_dist listing the districts in the
# same ascending order as dist - confirm against the CSV.
dist = dist.merge(sch_dist,left_index=True,right_index=True,how="left")
dist = dist[["district","count","JURISDICTION NAME"]]

plt.figure(figsize=(12,7))
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally; a single "k" edge colour replaces the "k"*n string.
sns.barplot(x="JURISDICTION NAME",y="count",
            data=dist,
            palette = "rainbow",
            linewidth = 1,
            edgecolor = "k")

plt.xticks(rotation = 90)
plt.grid(True)
plt.title("total number of Schools by district")
plt.show()

# Schools by NYC districts
* Choropleth map of the number of schools in each district.
* New York City has 32 school districts. Bronx districts 9 and 10 and Richmond district 31 have the highest number of schools.

In [None]:
import os

# Path to the GeoJSON file with the NYC school-district outlines.
dist_geo = os.path.join(r"../input/nyc-districts-json/ny_school_districts-simplify2.json")

m2 = folium.Map(location=[40.70,-73.93],tiles="stamen terrain",max_zoom=14)

# Shade each district polygon by its school count; dist["district"] is
# matched against the SchoolDist property of every GeoJSON feature.
m2.choropleth(name="choropleth",
              geo_data=dist_geo,
              data=dist,
              columns=["district","count"],
              key_on="feature.properties.SchoolDist",
              fill_color="YlOrRd",
              fill_opacity=.6,
              line_color="black",
              line_weight=1,
              line_opacity=.7,
              legend_name="Number of Schools",
              highlight=True)

folium.LayerControl().add_to(m2)

m2

# Number of schools by cities
* Most of the schools are in the cities of Brooklyn, Bronx, New York and Staten Island.

In [None]:
# Count of schools per city, bars ordered from most to fewest schools.
plt.figure(figsize=(13,7))
# The Series is passed as x= because recent seaborn releases treat the first
# positional argument as `data`; a single "k" edge colour replaces the
# "k"*n string.
ax = sns.countplot(x=schools["City"],
                   order=schools["City"].value_counts().index,
                   palette = "husl",
                   linewidth = 1,
                   edgecolor = "k")
plt.xticks(rotation = 90)
plt.grid(True)
plt.title("Number of schools by cities")
plt.show()

# Distribution of Economic Need Index
## Economic Need Index :
A school’s Economic Need Index estimates the percentage of students at the school
facing economic hardship. The metric is calculated as follows:
*  If the student is eligible for public assistance from the NYC Human
Resources Administration (HRA) or lived in temporary housing in the past
four years, the student’s Economic Need Value is 1.0.
* Otherwise, the student’s Economic Need Value is based on the percentage
of families (with school-age children) in the student’s Census tract whose
income is below the poverty level, as estimated by the American Community
Survey 5-Year Estimate. The student’s Economic Need Value equals this
percentage divided by 100.
* The school’s Economic Need Index is the average of its students’ Economic
Need Values.
* The school’s Economic Need Index is used as part of the matching process to create
Comparison Groups.

In [None]:
# Histogram/KDE with rug marks of the non-null Economic Need Index values.
eni_values = schools["Economic Need Index"].dropna()

plt.figure(figsize=(13,7))
sns.distplot(eni_values,color="b",rug =True)
plt.title("Distribution of Economic Need Index")
plt.show()

# Distribution of Economic Need Index by community and non community schools
* Community Schools have higher average Economic Need Index compared to Non community type schools

In [None]:
plt.figure(figsize=(13,8))

# One shaded density curve plus a dashed mean line per school type.
for flag, tag, colour in (("Yes","YES","b"),("No","NO","r")):
    eni_subset = schools[schools["Community School?"] == flag]["Economic Need Index"]
    sns.kdeplot(eni_subset,
                label = "Community School - " + tag,shade=True,color=colour)
    plt.axvline(eni_subset.mean(),
                color=colour,linestyle="dashed",
                label = "Community School " + tag + " - Mean")

plt.legend(loc="best",prop = {"size" : 15})

plt.title("Distribution of Economic Need Index by community and non community schools")
plt.show()

# Economic Need Index by school locations
* Economic need index of individual school by their latitude and longitude location

In [None]:

# Every school at (longitude, latitude), coloured by its Economic Need Index.
eni = schools["Economic Need Index"]
hover_text = schools["School Name"] +" ,Economic index : "+ eni.astype(str)

eni_colorbar = dict(title = "Economic Need Index",
                    titleside = "right")
eni_marker = dict(size=10,
                  color = eni,
                  colorscale = "Rainbow",
                  showscale = True,
                  line = dict(width=1),
                  colorbar = eni_colorbar)

trace1 = go.Scatter(x=schools["Longitude"],
                    y=schools["Latitude"],
                    mode="markers",
                    marker = eni_marker,
                    text = hover_text)

data = [trace1]

layout = go.Layout(title="Economic Need index by school location",
                   xaxis = dict(title="Longitude"),
                   yaxis = dict(title="Latitude"),
                   autosize=False,
                   height = 600,
                   width  = 750)

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

# Average Economic Need Index by District

* choropleth map for average ENI for school districts.
* District 2 & District 26 have lowest Average Economic Need Index among all the districts.
* On average Districts located in Bronx have higher average ENI

In [None]:
# Mean Economic Need Index per district, attached to the running `dist` table.
dist_ec_mean = schools.groupby("District")["Economic Need Index"].mean().reset_index()
dist = dist.merge(dist_ec_mean,left_on="district",right_on="District",how="left")
# The merge leaves both key columns in place; drop the capitalised duplicate.
dist = dist.drop(columns={"District"},axis=1)
# Remove the "CSD " prefix from the district labels.
dist["JURISDICTION NAME"] = dist["JURISDICTION NAME"].str.replace("CSD ","")
m3 = folium.Map(location=[40.70,-73.93],tiles="openstreetmap",max_zoom=14)

# Shade each district polygon by its mean Economic Need Index;
# dist["district"] is matched against the SchoolDist property of the GeoJSON features.
m3.choropleth(name="choropleth",
              fill_opacity=.7,
              line_color="black",
              line_weight=1,
              highlight=True,
              legend_name="Average Economic Need Index by District",
              data=dist,
              columns=["district","Economic Need Index"],
              geo_data=dist_geo,
              fill_color="Spectral",
              key_on="feature.properties.SchoolDist")
m3

In [None]:
# Bar per district of the mean Economic Need Index, coloured by the same value.
district_eni = dist["Economic Need Index"]
bar_marker = dict(color=district_eni,
                  colorscale="Viridis",
                  opacity = 1,
                  line=dict(width = 1,color="black"))

# trace1 is a one-element list and is handed to go.Figure as the data.
trace1 = [go.Bar(x=dist["JURISDICTION NAME"],
                 y=district_eni,
                 text = "Average Economic Need Index",
                 marker = bar_marker)]

layout = go.Layout(title="Average Economic Need Index by District",
                   yaxis    = dict(title = "avg ENI"),
                   xaxis    = dict(showgrid = True),
                   autosize = False,
                   height   = 500,
                   width    = 780)

fig = go.Figure(data=trace1,layout=layout)

py.iplot(fig)
plt.show()

# Schools with highest Economic Need Index
Schools with the highest Economic Need Index are mostly located in the cities of Brooklyn and Bronx.

In [None]:
# The 20 schools with the highest Economic Need Index.
high_eni = schools.sort_values(by = "Economic Need Index",ascending=False)[["School Name","Economic Need Index","City"]][:20]

plt.figure(figsize = (6,10))
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally; a single "k" edge colour replaces the "k"*n string.
ax = sns.barplot(x="Economic Need Index",y="School Name",
                 data = high_eni,
                 linewidth = 1,
                 edgecolor = "k")
# Write the rounded index and the city inside each bar.
for i,j in enumerate(("ENI : " + np.around(high_eni["Economic Need Index"],3).astype(str) +
                      " , City : " +high_eni["City"]+".")):
    ax.text(.01,i,j,fontsize =12)
plt.grid(True)
plt.title("Schools with highest Economic Need Index")
plt.show()



# Schools with Least Economic Need Index
Schools with the lowest Economic Need Index are mostly located in the city of New York.

In [None]:
# The 20 schools with the lowest Economic Need Index.
lw_eni = schools.sort_values(by= "Economic Need Index",ascending=True)[["School Name","Economic Need Index","City"]][:20]

plt.figure(figsize=(7,10))
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally; a single "k" edge colour replaces the "k"*n string.
ax = sns.barplot(x="Economic Need Index",y="School Name",
                 data = lw_eni,
                 linewidth = 1,
                 edgecolor = "k")

# Write the rounded index and the city inside each bar.
for i,j in enumerate(("ENI : " + np.around(lw_eni["Economic Need Index"],3).astype(str) +
                      " , City : " + lw_eni["City"])):
    ax.text(.001,i,j,fontsize =12)
plt.grid(True)
plt.title("Schools with Least Economic Need Index")
plt.show()

# Average Economic Need Index by top cities
Average economic need index for top 10 cities with highest number of schools

In [None]:
# Mean Economic Need Index for the ten cities with the most schools.
top_cities = schools["City"].value_counts().head(10)
cty_lst = top_cities.index.tolist()

cty_eni = schools.groupby("City")["Economic Need Index"].mean().reset_index()
in_top_cities = cty_eni["City"].isin(cty_lst)
cty_eni = cty_eni[in_top_cities].sort_values(by = "Economic Need Index",ascending=True)

city_marker = dict(color = cty_eni["Economic Need Index"],
                   colorscale = "BlackBody",
                   line = dict(width = 1))

trace = go.Bar(x = cty_eni["City"],
               y = cty_eni["Economic Need Index"],
               marker = city_marker)

layout = go.Layout(title = "Average Economic Need Index by top cities" ,
                   autosize = False,
                   height = 500,
                   width = 780,
                   xaxis = dict(showgrid =True),
                   yaxis = dict(title = "Economic Need Index"))

data = [trace]

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

# Distribution School Income Estimate

In [None]:
# Strip the "$" and "," characters from the income strings and convert to float.
# regex=True is explicit because newer pandas treats the pattern as a literal
# string by default, which would leave the values unparseable.
# The dtype guard lets the cell be re-run after the column is already numeric.
if schools["School Income Estimate"].dtype == object:
    schools["School Income Estimate"] = (schools["School Income Estimate"]
                                         .str.replace("[$,]","",regex=True)
                                         .astype(float))

# Histogram of the non-null income estimates.
trace = go.Histogram(x = schools[schools["School Income Estimate"].notnull()]["School Income Estimate"],
                    opacity = .8,
                    marker = dict(color = "olive",
                                  line = dict(width = 1)
                                 )
                     ,)
layout = go.Layout(dict(title = "Distribution School Income Estimate"),
                   xaxis = dict(title = "Income estimate",
                                showgrid =True,
                               )
                  )

data = [trace]

fig  = go.Figure(data=data,layout = layout)
py.iplot(fig)
plt.show()

# Distribution of School income estimate for school types
* income estimate distribution for community and non community school types.


In [None]:
# Keep only the schools that have an income estimate.
schools1 = schools.dropna(subset=["School Income Estimate"])

# One histogram trace per school type, stacked in the layout below.
histogram_specs = (("Yes","Community school","blue"),
                   ("No","Non Community school","Orange"))

trace1, trace2 = [
    go.Histogram(x = schools1[schools1["Community School?"] == flag]["School Income Estimate"],
                 name = trace_name,
                 opacity = .75,
                 marker = dict(color = colour,
                               line = dict(width = 1)))
    for flag, trace_name, colour in histogram_specs
]

layout = go.Layout(title = "Distribution of School income estimate for school types",
                   xaxis = dict(title = "School Income Estimate",
                                showgrid = True),
                   barmode = "stack")

data = [trace1,trace2]

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

# School Income Estimate for school locations 
* School Income Estimate of individual school by their latitude and longitude location

In [None]:
# Schools with an income estimate at (longitude, latitude), coloured by that estimate.
income = schools1["School Income Estimate"]
hover_text = schools1["School Name"] + ", Income estimate : " + income.astype(str)

income_colorbar = dict(title = "School Income Estimate",
                       titleside = "right")
income_marker = dict(size  = 10,
                     color = income,
                     colorscale = "Picnic",
                     showscale = True,
                     line = dict(width = 1),
                     colorbar = income_colorbar)

trace = go.Scatter(x = schools1["Longitude"],
                   y = schools1["Latitude"],
                   mode="markers",
                   marker = income_marker,
                   text = hover_text)

layout = go.Layout(title = "School Income Estimate for school locations ",
                   xaxis = dict(title = "Longitude"),
                   yaxis = dict(title = "Latitude"),
                   autosize = False,
                   height = 600,
                   width  = 780)

data = [trace]

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

# Average School Income Estimate by districts
* choropleth map for average School Income Estimate for school districts.


In [None]:
# Mean School Income Estimate per district, attached to the running `dist` table.
dis_sie = schools1.groupby("District")["School Income Estimate"].mean().reset_index()
dist = dist.merge(dis_sie,left_on="district",right_on="District",how="left")
# The merge leaves both key columns in place; drop the capitalised duplicate.
dist = dist.drop(columns="District",axis=1)
dist
m4 = folium.Map(location=[40.70,-73.93],tiles="stamen terrain",max_zoom=14)

# Shade each district polygon by its mean income estimate;
# dist["district"] is matched against the SchoolDist property of the GeoJSON features.
m4.choropleth(name = "choropleth",
              line_color="black",
              line_weight=1,
              fill_opacity=.8,
              highlight=True,
              legend_name="Average School Income Estimate by districts",
              fill_color="Spectral",
              key_on="feature.properties.SchoolDist",
              data=dist,
              columns=["district","School Income Estimate"],
              geo_data=dist_geo)

m4

# School Income Estimate by districts
* Box plot for School income estimates across districts.
* Districts 2, 26 and 31 have higher average school income estimates.
* Districts 7, 8, 9, 10 and 12 in the Bronx have lower average school income estimates.

In [None]:
# Lookup table from district number to its jurisdiction label.
dist_X = dist.copy()
dist_X = dist_X[["district","JURISDICTION NAME"]]
# Attach the jurisdiction label to both school tables.
# NOTE(review): these merges rebind `schools` and `schools1`; re-running the
# cell would merge the same columns in again - confirm before re-executing.
schools = schools.merge(dist_X,left_on="District",right_on="district",how="left") 
schools1 = schools1.merge(dist_X,left_on="District",right_on="district",how="left")
# One box of income estimates per jurisdiction.
trace = go.Box(x = schools1["JURISDICTION NAME"] , y = schools1["School Income Estimate"],
              jitter = .5,
              marker = dict(color = 'blue',size =5,
                            line = dict(width=1)))

layout = go.Layout(dict(title = "School Income Estimate for districts",
                        yaxis = dict(title = "Income estimate"),
                        xaxis = dict(showgrid = True),
                        autosize = False,
                        height = 600,
                        width = 800
                       ),
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

# Schools with highest Income Estimates
The schools with the highest income estimates are mostly located in the city of New York.

In [None]:
# The 20 schools with the highest School Income Estimate.
high_sie = schools.sort_values(by = "School Income Estimate",ascending=False)[["School Name","School Income Estimate","City"]][:20]

plt.figure(figsize = (7,10))
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally; a single "k" edge colour replaces the "k"*n string.
ax = sns.barplot(x="School Income Estimate",y="School Name",
                 data = high_sie,
                 linewidth = 1,
                 edgecolor = "k")
# Write the rounded estimate and the city inside each bar.
for i,j in enumerate((" IncEst : " + np.around(high_sie["School Income Estimate"],0).astype(str) +
                      " , City : " +high_sie["City"]+".")):
    ax.text(.1,i,j,fontsize =12)
plt.grid(True)
plt.title("Schools with highest Income Estimates")
plt.show()


# Schools with least Income Estimates

In [None]:
# The 20 schools with the lowest School Income Estimate.
lw_sie = schools.sort_values(by = "School Income Estimate",ascending=True)[["School Name","School Income Estimate","City"]][:20]

plt.figure(figsize = (6,10))
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally; a single "k" edge colour replaces the "k"*n string.
ax = sns.barplot(x="School Income Estimate",y="School Name",
                 data = lw_sie,
                 linewidth = 1,
                 edgecolor = "k")
# Write the rounded estimate and the city inside each bar.
for i,j in enumerate((" IncEst : " + np.around(lw_sie["School Income Estimate"],0).astype(str) +
                      " , City : " +lw_sie["City"]+".")):
    ax.text(.1,i,j,fontsize =12)
plt.grid(True)
plt.title("Schools with least Income Estimates")
plt.show()


# Average School Income Estimate by top cities
* Average income estimates for top 10 cities with highest number of schools.

In [None]:
# Mean School Income Estimate for the ten cities with the most schools.
top_cities = schools["City"].value_counts().head(10)
cty_lst = top_cities.index.tolist()

cty_sie = schools.groupby("City")["School Income Estimate"].mean().reset_index()
in_top_cities = cty_sie["City"].isin(cty_lst)
cty_sie = cty_sie[in_top_cities].sort_values(by = "School Income Estimate",ascending=True)

city_marker = dict(color = cty_sie["School Income Estimate"],
                   colorscale = "BlackBody",
                   line = dict(width = 1))

trace = go.Bar(x = cty_sie["City"],
               y = cty_sie["School Income Estimate"],
               marker = city_marker)

layout = go.Layout(title = "Average School Income Estimate by top cities" ,
                   autosize = False,
                   height = 500,
                   width = 780,
                   xaxis = dict(title = "City",showgrid = True),
                   yaxis = dict(title = "School Income Estimate"))

data = [trace]

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

# Relation between Economic need index and School income estimate
* Scatter plot between economic need index and school income estimate

In [None]:
# Income estimate (x) against Economic Need Index (y), coloured by district number.
district_no = schools1["District"]
hover_text = schools1["School Name"] + ", District : " + district_no.astype(str)

district_colorbar = dict(title = "District ",
                         titleside = "right")
point_marker = dict(size = 10 ,
                    color = district_no,
                    colorscale = "Picnic",
                    showscale = True ,
                    colorbar = district_colorbar,
                    line = dict(width = 1,
                                color = "black"))

trace = go.Scatter(x = schools1["School Income Estimate"],
                   y = schools1["Economic Need Index"],
                   mode = "markers",
                   marker  = point_marker,
                   text = hover_text)

data = [trace]

layout = go.Layout(title = "Relation between Economic need index and School income estimate",
                   xaxis = dict(title = "School income Estimate"),
                   yaxis = dict(title = "Economic need Index"),
                   autosize = False,
                   height  = 600,
                   width   = 780)

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

In [None]:
# Columns stored as strings containing "%" that are converted to floats below.
cols = [
    "Percent ELL",
    "Percent Asian",
    "Percent Black",
    'Percent Hispanic',
    'Percent Black / Hispanic',
    'Percent White',
    'Student Attendance Rate',
    'Percent of Students Chronically Absent',
    'Rigorous Instruction %',
    'Collaborative Teachers %',
    'Supportive Environment %',
    'Strong Family-Community Ties %',
    'Trust %',
    "Effective School Leadership %",
]

# Remove the percent sign from each column, then cast it to float.
for percent_col in cols:
    without_sign = schools[percent_col].str.replace("%","")
    schools[percent_col] = without_sign.astype(float)

# Distribution of ethnicity types
* Kernel density estimate of each ethnicity's percentage across schools.

In [None]:
# Ethnicity columns to compare, with one palette colour per column.
gl = ["Percent Asian","Percent Black",'Percent Hispanic',
       'Percent White']
cs = sns.color_palette("gist_ncar",4)

length = len(gl)

plt.figure(figsize=(12,7))
# One shaded density curve and one dashed mean line per ethnicity column.
for i,k in zip(gl,cs):
    sns.kdeplot(schools[i],shade=True,linewidth =2,color = k,label = i)
    plt.axvline(schools[i].mean(),
                linestyle = "dashed",color = k,label = i + " mean")

# Title and legend are set once, after every curve and mean line exists.
# Calling plt.legend() inside the loop ahead of axvline left the final mean
# line out of the legend.
plt.title("Distribution of ethnicity types")
plt.legend(loc = "best" ,prop = {"size" : 14})
plt.show()

# Percent  English language learner by location
*  English-language learners, or ELLs, are students who are unable to communicate fluently or learn effectively in English, who often come from non-English-speaking homes and backgrounds, and who typically require specialized or modified instruction in both the English language and in their academic courses.
* percentage of ELL by location of individual school.

In [None]:
# Every school at (longitude, latitude), coloured by its "Percent ELL" value.
ell = schools["Percent ELL"]
hover_text = schools["School Name"] + ", Percent ELL : " +ell.astype(str)

ell_colorbar = dict(title = "Percent ELL",
                    titleside = "right")
ell_marker = dict(size = 10 ,
                  color = ell,
                  colorscale = "Earth",
                  line = dict(width=1,color = "black"),
                  showscale = True,
                  colorbar = ell_colorbar)

trace = go.Scatter(x = schools["Longitude"],
                   y = schools["Latitude"],
                   mode = "markers",
                   marker = ell_marker,
                   text = hover_text,
                   opacity = .8)

layout = go.Layout(title = "Percent  English language learner by location",
                   autosize = False,
                   height = 600,
                   width  = 780,
                   xaxis  = dict(title = "Longitude"),
                   yaxis  = dict(title = "Latitude"))

data = [trace]

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
plt.show()

#  Ethnicity Percentage  by School location
* Asian ,Black ,Hispanic, White students percentage by location.

In [None]:
def location(column,palette):
    """Plot every school at its longitude/latitude, coloured by ``column``.

    column  -- name of the column of the module-level ``schools`` frame used
               for the marker colour, the colour-bar title, the hover text
               and the figure title.
    palette -- value passed to plotly as the marker ``colorscale``.

    The figure is rendered with ``py.iplot``; nothing is returned.
    """
    values = schools[column]
    hover_text = schools["School Name"] + ", percent : " +values.astype(str)

    colour_bar = dict(title = column,
                      titleside = "right")
    point_style = dict(size = 10 ,
                       color = values,
                       colorscale = palette,
                       line = dict(width=1,color = "black"),
                       showscale = True,
                       colorbar = colour_bar)

    points = go.Scatter(x = schools["Longitude"],
                        y = schools["Latitude"],
                        mode = "markers",
                        marker = point_style,
                        text = hover_text,
                        opacity = .8)

    page = go.Layout(title =  column + " by location",
                     autosize = False,
                     height = 600,
                     width  = 780,
                     xaxis  = dict(title = "Longitude"),
                     yaxis  = dict(title = "Latitude"))

    py.iplot(go.Figure(data=[points],layout=page))
    plt.show()

In [None]:
# Scatter of school locations coloured by "Percent Asian" (Rainbow colour scale).
location("Percent Asian","Rainbow")

In [None]:
# Scatter of school locations coloured by "Percent Black" (Rainbow colour scale).
location("Percent Black","Rainbow")

In [None]:
# Scatter of school locations coloured by "Percent Hispanic" (Rainbow colour scale).
location("Percent Hispanic","Rainbow")

In [None]:
# Scatter of school locations coloured by "Percent Black / Hispanic" (Rainbow colour scale).
location("Percent Black / Hispanic","Rainbow")

In [None]:
# Scatter of school locations coloured by "Percent White" (Rainbow colour scale).
location("Percent White","Rainbow")

# Mean Ethnicity Percent by District
* Average percentage of each ethnicity group(Asian,Black,Hispanic,White) for all 32 school districts.

In [None]:
# Percentage columns averaged per district below.
cols = ['Percent ELL', 'Percent Asian', 'Percent Black', 'Percent Hispanic',
        'Percent Black / Hispanic', 'Percent White',
        'Student Attendance Rate',
        'Percent of Students Chronically Absent',
        'Rigorous Instruction %', 'Collaborative Teachers %',
        'Supportive Environment %', 'Strong Family-Community Ties %',
        'Effective School Leadership %', 'Trust %']

# Mean of every percentage column per (jurisdiction name, district) pair.
district_groups = schools.groupby(['JURISDICTION NAME',"District"])
percent_dist = district_groups[cols].mean().reset_index()

# One bar trace per ethnicity; the layout below stacks them per district.
stacked_columns = ("Percent Asian","Percent Black","Percent Hispanic","Percent White")
data = [go.Bar(x = percent_dist["JURISDICTION NAME"],
               y = percent_dist[ethnicity],
               name = ethnicity,
               marker = dict(line = dict(width = .5),
                             opacity = 1))
        for ethnicity in stacked_columns]
trace,trace1,trace2,trace3 = data

layout = go.Layout(barmode = "stack",
                   title = "Mean Ethnicity Percent by District",
                   yaxis = dict(title = "Mean Percent"))

fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

In [None]:
# For each ethnicity, list the districts whose mean percentage exceeds 40.
district_headings = (
    ("\nDistricts with highest Asian student percentage    : \n","Percent Asian"),
    ("\nDistricts with highest Black student percentage    : \n","Percent Black"),
    ("\nDistricts with highest Hispanic student percentage : \n","Percent Hispanic"),
    ("\nDistricts with highest White  student percentage   : \n","Percent White"),
)
for heading, ethnicity_col in district_headings:
    above_forty = percent_dist[percent_dist[ethnicity_col] > 40]
    print (heading,above_forty["JURISDICTION NAME"].tolist())


# Mean Ethnicity percent for districts

In [None]:
percent_dist

def map_column_data(column,tile,palette) :
    """Display a district choropleth coloured by one column of ``percent_dist``.

    Parameters
    ----------
    column : str
        Column of the module-level ``percent_dist`` frame used for the fill
        colour; also inserted into the legend text.
    tile : str
        Tile set for the base map. This argument used to be ignored in favour
        of a hard-coded "stamen terrain"; existing callers pass that same
        value, so their output is unchanged.
    palette : str
        Colour scale name forwarded as ``fill_color``.

    The map is rendered with ``display``; nothing is returned.
    """
    # Base map centred on [40.70, -73.93] using the requested tile set.
    mp = folium.Map(location=[40.70,-73.93],tiles=tile,max_zoom=14)
    # Features of `dist_geo` are matched to rows of `percent_dist` through
    # feature.properties.SchoolDist <-> the "District" column.
    mp.choropleth(name="choropleth",
                  highlight = True,
                  line_color="black",
                  line_weight=1,
                  legend_name="Mean "+column+" by district",
                  fill_opacity=.7,
                  key_on="feature.properties.SchoolDist",
                  geo_data=dist_geo,
                  columns=["District",column],
                  data=percent_dist,
                  fill_color=palette)
    display(mp)

# Asian Student Percentage
* Choropleth map for average asian student percentage across districts.
* '20 Brooklyn', '25 Queens', '26 Queens'  Districts have higher asian student percentage on average

In [None]:
# Choropleth of the "Percent Asian" column per district, YlOrRd palette.
map_column_data("Percent Asian","stamen terrain","YlOrRd")

# Black Students Percentage
* Choropleth map for average black student percentage across districts.
* Districts ['05 Manhattan', '11 Bronx', '13 Brooklyn', '16 Brooklyn', '17 Brooklyn', '18 Brooklyn', '19 Brooklyn', '22 Brooklyn', '23 Brooklyn', '29 Queens'] have higher Black student percentage on average.

In [None]:
# Choropleth of the "Percent Black" column per district, YlOrRd palette.
map_column_data("Percent Black","stamen terrain","YlOrRd")

# Hispanic Students Percentage
* Choropleth map for average hispanic student percentage across districts.
* Districts ['01 Manhattan', '04 Manhattan', '06 Manhattan', '07 Bronx', '08 Bronx', '09 Bronx', '10 Bronx', '12 Bronx', '14 Brooklyn', '24 Queens', '30 Queens', '32 Brooklyn'] have higher Hispanic student percentage.

In [None]:
# Choropleth of the "Percent Hispanic" column per district, YlOrRd palette.
map_column_data("Percent Hispanic","stamen terrain","YlOrRd")

# White Students Percentage
* Choropleth map for average white student percentage across districts.
* Districts ['02 Manhattan', '31 Richmond'] have higher white students percentage on average.

In [None]:
# Choropleth of the "Percent White" column per district, YlOrRd palette.
map_column_data("Percent White","stamen terrain","YlOrRd")

## School count for maximum ethnicity percentage
* Extracted new column label for unique school with dominant ethnicity percentage

In [None]:
# Ethnicity share columns, plus the school attributes carried along with them.
cols1 = ['Percent Asian', 'Percent Black', 'Percent Hispanic', 'Percent White']
cols2 = ["School Name" , "Latitude" ,"Longitude","District",
         "Economic Need Index","School Income Estimate","Average Math Proficiency",
         "Average ELA Proficiency"]

# Explicit copy: the columns added below must not be written onto a slice of
# `schools`.
x = schools[cols1+cols2].copy()

# Row-wise largest ethnicity share. DataFrame.max skips missing values, whereas
# the builtin max applied per row gives an order-dependent result when a NaN
# is present.
x["max"] = x[cols1].max(axis =1)
x

# Extracting maximum ethnicity percent of a school and creating new label
def lab(x) :
    """Return the ethnicity whose percent column equals the row's "max" value.

    Columns are checked in the order Hispanic, Asian, Black, White, so on a
    tie the earliest of these wins. Returns None when no column equals "max".
    """
    largest = x["max"]
    checks = (("Percent Hispanic", "Hispanic"),
              ("Percent Asian", "Asian"),
              ("Percent Black", "Black"),
              ("Percent White", "White"))
    for percent_col, label in checks :
        if largest == x[percent_col] :
            return label
    return None
# Label every school with its dominant ethnicity.
x["eth_label"] = x.apply(lab, axis =1)

# Counted once and reused for both the bar order and the annotations.
eth_counts = x["eth_label"].value_counts()

plt.figure(figsize=(7,5))
# A single colour code applies to every bar edge. The previous "k" * n built a
# multi-letter string, which newer Matplotlib releases reject as a colour.
ax = sns.countplot(y = x["eth_label"],order = eth_counts.index,
                 linewidth = 1 ,edgecolor = "k")
# Write each count next to its bar (rows follow the same descending order).
for i,j in enumerate(eth_counts.values):
    ax.text(.7,i,j,fontsize = 13)
plt.grid(True)
plt.title("School count for maximum ethnicity percentage")
plt.show()


### Average Economic Need Index and School Income Estimate by Ethnicity types
* Schools with maximum black and hispanic percentage have high Economic need index and low school income estimates.

In [None]:
# Mean Economic Need Index and School Income Estimate per dominant-ethnicity label.
mn_es = x.groupby("eth_label")[["Economic Need Index","School Income Estimate"]].mean().reset_index()
esi = ["Economic Need Index","School Income Estimate"]

plt.figure(figsize = (8,12) )
# One stacked subplot per measure.
for j,i in enumerate(esi):
    plt.subplot(2,1,j+1)
    # x/y are passed by keyword (newer seaborn no longer accepts them
    # positionally) and the edge colour is a single valid colour code.
    sns.barplot(x = mn_es["eth_label"],y = mn_es[i],
               linewidth = 1,edgecolor = "k",
               palette = "gist_earth")
    plt.grid(True)
    plt.title("Average " + i + " by Ethnicity types")

## Income estimates and Economic need index for ethnicity types
* Regression plot for income estimate and ENI for ethnicity types

In [None]:
# Labels that actually occur; missing labels are dropped so the 2x2 grid
# cannot be overrun by an extra "None" panel.
eth = x["eth_label"].dropna().unique().tolist()
cs = ["b","r","g","k"]
plt.figure(figsize=(13,12))
# One panel per label, each paired with its own colour.
for j,(i,k) in enumerate(zip(eth,cs)):
    plt.subplot(2,2,j+1)
    group = x[x["eth_label"] == i]
    # Keyword x/y: newer seaborn no longer accepts the data positionally.
    sns.regplot(x = group["Economic Need Index"],
                y = group["School Income Estimate"],
                color = k ,fit_reg = True)
    plt.title(i)
    

## Ethnicity Dominant schools

In [None]:
def mark(ethnicity,color) :
    """Build a marker scatter trace for schools labelled with ``ethnicity``.

    Rows of the module-level frame ``x`` whose "eth_label" equals ``ethnicity``
    are plotted at (Longitude, Latitude) in ``color``; the school name is the
    hover text and the legend entry reads "<ethnicity> Dominant".
    """
    # Filter once and reuse the subset for every trace attribute.
    members = x[x["eth_label"] == ethnicity]
    marker_style = dict(line = dict(width = 1),
                        size = 10,
                        color = color)
    return go.Scatter(x = members["Longitude"],
                      y = members["Latitude"],
                      mode = "markers",
                      text = members["School Name"],
                      marker = marker_style,
                      name = ethnicity+ " Dominant")

# One marker trace per dominant-ethnicity group, each in its own colour.
trace, trace1, trace2, trace3 = [
    mark(group, colour)
    for group, colour in (("Black", "blue"),
                          ("Hispanic", "red"),
                          ("Asian", "green"),
                          ("White", "grey"))
]
data = [trace, trace1, trace2, trace3]

layout = go.Layout(dict(title = "Ethnicity Dominant schools",
                        xaxis = dict(title = "Longitude"),
                        yaxis = dict(title = "Latitude")))

fig = go.Figure(data = data, layout = layout)
py.iplot(fig)

In [None]:
#Function to get top and bottom schools based on column

def bar_plot(column,title,data,sort,palette):
    """Plot the 20 highest or lowest schools for ``column`` as horizontal bars.

    Parameters
    ----------
    column : str
        Column to rank and plot.
    title : str
        Plot title.
    data : DataFrame
        Must contain "School Name", "City" and ``column``.
    sort : str
        "top" for the 20 largest values, "bottom" for the 20 smallest.
    palette : str
        Seaborn palette name.

    Raises
    ------
    ValueError
        If ``sort`` is neither "top" nor "bottom" (previously this fell
        through and failed later on an unbound local).
    """
    if sort not in ("top", "bottom"):
        raise ValueError('sort must be "top" or "bottom", got %r' % (sort,))

    # Ascending order puts the smallest values first ("bottom").
    ranked = data.sort_values(by = column ,ascending = (sort == "bottom"))
    dat = ranked[["School Name",column,"City"]][:20]

    plt.figure(figsize = (6,10))
    # x/y by keyword (newer seaborn rejects positional data) and a single
    # valid edge colour instead of a repeated-letter string.
    ax = sns.barplot(x = column,y = "School Name",
                     palette = palette ,
                     data = dat,
                     linewidth = 1,
                     edgecolor = "k")
    # Annotate every bar with the rounded value and the school's city.
    annotations = ("percent : " + np.around(dat[column],0).astype(str) +
                   " , City : " +dat["City"]+".")
    for i,j in enumerate(annotations):
        ax.text(.1,i,j,fontsize =12)
    plt.grid(True)
    plt.title(title)
    plt.show()


# Schools with highest Student Attendance Rate

In [None]:
# Top 20 schools by "Student Attendance Rate" (sort="top").
bar_plot("Student Attendance Rate","Schools with highest Student Attendance Rate",schools,"top","gist_ncar")

# Student Attendance Rate by location
As you can see almost every school has Student attendance rate greater than 80%

In [None]:
#'Percent of Students Chronically Absent', 'Rigorous Instruction %',
 #      'Collaborative Teachers %', 'Supportive Environment %',
  #     'Strong Family-Community Ties %', 'Trust %'

#Function to get location values for a column

def scatter_location_plot(column,palette,title):
    """Scatter every school at (Longitude, Latitude), coloured by ``column``.

    Parameters
    ----------
    column : str
        Column of the module-level ``schools`` frame used for marker colour,
        colour-bar title and hover text.
    palette : str
        Plotly colour scale name.
    title : str
        Figure title.
    """
    # Hover text: school name followed by the column value.
    hover_text = schools["School Name"]+ " ,percent : " + schools[column].astype(str)

    marker_style = dict(color = schools[column],
                        colorscale = palette,
                        size  = 12,
                        line = dict(width = 1,color ="black"),
                        showscale = True,
                        colorbar = dict(title = column,
                                        titleside = "right"))

    points = go.Scatter(x = schools["Longitude"],
                        y = schools["Latitude"],
                        mode = "markers",
                        marker = marker_style,
                        text = hover_text)

    # Fixed-size figure with labelled axes.
    figure_layout = go.Layout(dict(title = title,
                                   xaxis = dict(title = "Longitude"),
                                   yaxis = dict(title = "Latitude"),
                                   autosize = False ,
                                   height = 600,
                                   width  = 780))

    fig = go.Figure(data=[points],layout=figure_layout)
    py.iplot(fig)
    plt.show()
# Location scatter coloured by "Student Attendance Rate", "Rainbow" colour scale.
scatter_location_plot('Student Attendance Rate',"Rainbow","Student Attendance Rate by location")

# School Performance Attributes by districts
* Rigorous Instruction: This rating reflects the degree to which curriculum and
instruction are designed to engage students, foster critical-thinking skills, and are
aligned to the Common Core.

* Collaborative Teachers: This rating reflects the degree to which teachers
participate in opportunities to develop, grow, and contribute to the continuous
improvement of the school community. 

* Supportive Environment: This rating reflects the degree to which the school
establishes a culture where students feel safe, challenged to grow, and
supported to meet high expectations. This section draws upon data from the
Quality Review, the NYC School Survey, percentage of students with attendance
rates of 90% or higher, and movement of students with disabilities to less
restrictive environments.

* Effective School Leadership: This rating reflects the degree to which school
leadership inspires the school community with a clear instructional vision and
effectively distributes leadership to realize this vision. 

* Strong Family-Community Ties: This rating reflects the degree to which the
school forms effective partnerships with families to improve the school. 

* Trust: This rating reflects the degree to which relationships between
administrators, educators, students, and families are based on trust and respect.

* Student Achievement: This section rating reflects a school’s state test results,
how students performed in core courses, how well students were prepared for
their next level of school, and how students in higher-need groups performed.


In [None]:

# One line+marker trace per performance attribute across districts; each
# tuple is (column, legend name, line colour).
trace1, trace2, trace3, trace4, trace5, trace6 = [
    go.Scatter(x = percent_dist["JURISDICTION NAME"],
               y = percent_dist[attr_col],
               mode = "lines+markers",
               name = legend,
               line = dict(color = colour, width = 1),
              )
    for attr_col, legend, colour in (
        ("Rigorous Instruction %", "Rigorous Instruction", "blue"),
        ("Collaborative Teachers %", "Collaborative Teachers", "red"),
        ("Supportive Environment %", "Supportive Environment", "green"),
        ("Strong Family-Community Ties %", "Strong Family-Community Ties", "grey"),
        ("Trust %", "Trust", "black"),
        ("Effective School Leadership %", "School Leadership", "brown"),
    )
]

data = [trace1, trace2, trace3, trace4, trace5, trace6]

layout = go.Layout(dict(paper_bgcolor = 'rgb(255,255,255)',
                        plot_bgcolor = 'rgb(229,229,229)',
                        title = "School Performance Attributes by districts",
                        yaxis = dict(title = "percent")))

fig = go.Figure(data = data, layout = layout)
py.iplot(fig)

# Frequency distribution of schools for performance targets
* These section ratings are presented on a four-level scale. In the School Quality
Guide, the four levels are called Exceeding Target, Meeting Target, Approaching
Target, and Not Meeting Target. In the School Quality Snapshots, the four levels
are called Excellent, Good, Fair, and Poor.



In [None]:
# Frequency table per rating column, with columns ["index", <rating column>].
# rename_axis + reset_index(name=...) pins those column names explicitly: a
# bare value_counts().reset_index() names them differently across pandas
# versions, which made the x["index"] lookups below fail.
x1, x2, x3, x4, x5, x6, x7 = [
    schools[rating_col].value_counts().rename_axis("index").reset_index(name = rating_col)
    for rating_col in (
        'Rigorous Instruction Rating',
        'Collaborative Teachers Rating',
        'Supportive Environment Rating',
        'Effective School Leadership Rating',
        'Strong Family-Community Ties Rating',
        'Trust Rating',
        'Student Achievement Rating',
    )
]

# One bar trace per rating; each tuple is (frequency table, rating column, legend name).
trace1, trace2, trace3, trace4, trace5, trace6, trace7 = [
    go.Bar(x = counts["index"],
           y = counts[rating_col],
           name = legend,
           marker = dict(line = dict(width =1 )),
          )
    for counts, rating_col, legend in (
        (x1, "Rigorous Instruction Rating", "Rigorous Instruction"),
        (x2, "Collaborative Teachers Rating", "Collaborative Teachers"),
        (x3, "Supportive Environment Rating", "Supportive Environment"),
        (x4, "Effective School Leadership Rating", "School Leadership"),
        (x5, "Strong Family-Community Ties Rating", "Family-Community Ties"),
        (x6, "Trust Rating", "Trust"),
        (x7, "Student Achievement Rating", "Student Achievement"),
    )
]

data = [trace1,trace2,trace3,trace4,trace5,trace6,trace7]

layout = go.Layout(dict(title = "Frequency distribution of schools for performance targets",
                        autosize = False,
                        height = 500,
                        width  = 780,
                        xaxis  = dict(title = "Target",showgrid =True),
                        yaxis  = dict(title = "Frequency count"),
                       ),
                  )

fig = go.Figure(data=data,layout = layout)

py.iplot(fig)

# Correlation between school performance features
*  Trust is strongly correlated with  Effective School Leadership & Collaborative Teachers with pearson correlation coefficient greater than +0.85
*  Effective School Leadership is highly correlated to  Collaborative Teachers with coefficient +0.9

In [None]:

# Attendance and school-performance percentage columns to correlate.
cols1 = [
    'Student Attendance Rate',
    'Percent of Students Chronically Absent',
    'Rigorous Instruction %',
    'Collaborative Teachers %',
    'Supportive Environment %',
    'Strong Family-Community Ties %',
    'Effective School Leadership %',
    'Trust %',
]

correlation = schools[cols1].corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(correlation.shape))

plt.figure(figsize=(10,8))

# NOTE(review): center=True is read as the number 1.0 by the heatmap — confirm
# that centring the colour map at 1 is intended.
with sns.axes_style("white") :
    ax = sns.heatmap(correlation,
                     annot=True,
                     center=True,
                     mask=mask,
                     linewidths=1)
plt.title("Correlation between school performance features")
plt.show()

# ELA (English Language Arts)  & Math Proficiency 
Understanding Proficiency provides resources that guide educators in analyzing student work on performance tasks in order to develop a deeper understanding of the English Language Arts (ELA)/Literacy and Mathematics Common Core State Standards. Smarter Balanced ELA/Literacy and Math performance tasks allow the evaluation of higher-order skills.

In [None]:
plt.figure(figsize=(12,8))

# For each subject: density of the non-missing scores plus a dashed line at
# their mean. Each tuple is (column, curve label, mean label, colour).
for _score_col, _curve_label, _mean_label, _colour in (
    ("Average ELA Proficiency", "Average ELA Proficiency", "ELA Mean", "grey"),
    ("Average Math Proficiency", "Average Math Proficiency", "Math Mean", "r"),
):
    _scores = schools[_score_col].dropna()
    sns.kdeplot(_scores, shade=True, linewidth=3, label=_curve_label, color=_colour)
    plt.axvline(_scores.mean(), color=_colour, label=_mean_label, linestyle="dashed")

plt.legend(loc = "best" , prop = {"size" : 12})
plt.title("Distribution of ELA & Math Proficiency" )
plt.show()

# ELA & Math Proficiency for schools
* Scatter plot between average ELA and Math proficiency for schools.

In [None]:
# Keep only schools that have BOTH proficiency scores. Filtering x and y with
# separate null masks (and leaving colour / hover text unfiltered) can pair a
# point with another school's score, colour or name.
scored = schools.dropna(subset = ["Average ELA Proficiency","Average Math Proficiency"])

trace = go.Scatter(x = scored["Average ELA Proficiency"] ,
                   y = scored["Average Math Proficiency"],
                   mode = "markers",
                   # Marker colour encodes the school's Economic Need Index.
                   marker = dict(color = scored["Economic Need Index"],
                                 colorscale = "Picnic",
                                 size = 10 ,
                                 line = dict(width = 1,
                                            color = "black"),
                                 showscale = True,
                                 colorbar = dict(title = "Economic Need Index",
                                                 titleside = "right"),
                                ),
                   text = scored["School Name"],
                  )
data = [trace]

layout = go.Layout(dict(title = "ELA & Math Proficiency for schools",
                        xaxis = dict(title = "Average ELA Proficiency"),
                        yaxis = dict(title = "Average Math Proficiency"),
                        autosize = False,
                        height = 700,
                        width  = 780,
                       )
                  )
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

## Average Math & ELA Proficiency by ethnicity dominant schools

In [None]:
# Labels that actually occur; missing labels are dropped so the 2x2 grid
# cannot be overrun by an extra panel.
eth_lab = x["eth_label"].dropna().unique().tolist()
length  = len(eth_lab)
cs = ["r","g","b","k"]

plt.figure(figsize=(12,11))
# One Math-vs-ELA regression panel per dominant-ethnicity label.
for j,(i,k) in enumerate(zip(eth_lab,cs)) :
    plt.subplot(2,2,j+1)
    group = x[x["eth_label"] == i ]
    # Keyword x/y: newer seaborn no longer accepts the data positionally.
    sns.regplot(x = group["Average Math Proficiency"],
                y = group["Average ELA Proficiency"] , color = k)
    plt.title(i + " Dominant schools ")
    

# Average ELA and Math proficiency by Districts

In [None]:

# Long-format frames (district, prof, type) for each subject.
ela_df = (schools[["JURISDICTION NAME","Average ELA Proficiency"]]
          .rename(columns = {"JURISDICTION NAME" : "district",
                             "Average ELA Proficiency" : "prof"})
          .assign(type = "ELA"))

mat_df = (schools[["JURISDICTION NAME","Average Math Proficiency"]]
          .rename(columns = {"JURISDICTION NAME" : "district",
                             "Average Math Proficiency" : "prof"})
          .assign(type = "MATH"))

# Stack ELA rows on top of MATH rows.
df_x = pd.concat([ela_df, mat_df], axis = 0)

plt.figure(figsize=(8, 18))
# Split violins: one half per subject within every district.
sns.violinplot(x = df_x["prof"],
               y = df_x["district"],
               hue = df_x["type"],
               split = True,
               inner = "quart",
               palette = "Set1")
plt.xticks(rotation = 90)
plt.yticks(fontsize = 12)
plt.grid(True)
plt.legend(loc = "best" , prop = {"size" : 13})
plt.xlabel("Average Proficiency" ,fontsize = 12)
plt.ylabel("District" ,fontsize = 12)
plt.title("Average ELA and Math proficiency by Districts")
plt.show()

# Comparing Mean ELA and Math Proficiency for districts

In [None]:
# District-level means of both proficiency scores.
prof_dist = (schools
             .groupby("JURISDICTION NAME")[["Average ELA Proficiency","Average Math Proficiency"]]
             .mean()
             .reset_index())

# One bar trace per subject; each tuple is (column, legend name).
trace1, trace2 = [
    go.Bar(x = prof_dist["JURISDICTION NAME"],
           y = prof_dist[score_col],
           marker = dict(line = dict(width = 1,color="black")),
           name = legend,
          )
    for score_col, legend in (("Average ELA Proficiency", "ELA"),
                              ("Average Math Proficiency", "MATH"))
]

data = [trace1, trace2]

layout = go.Layout(dict(title = "Mean ELA and Math Proficiency for districts",
                        barmode = "stack",
                        xaxis = dict(showgrid = True),
                        yaxis = dict(title = "Mean Proficiency"),
                        autosize = False,
                        height = 500 ,
                        width  = 780))

fig = go.Figure(data = data, layout = layout)
py.iplot(fig)
plt.show()

# Grade Wise ELA and MATH Proficiency
* comparing total students attempted to the students who scored 4's in ELA and Math
*  the scale scores on State Common Core math and ELA exams are assigned a Proficiency Rating ranging from 1.00
to 4.50. 

In [None]:
# Split `schools` into one frame per grade, keeping the columns whose name
# contains "Grade <n>".
grade_three, grade_four, grade_five, grade_six, grade_seven, grade_eight = [
    schools[schools.columns[schools.columns.str.contains("Grade " + grade_no)]]
    for grade_no in ("3", "4", "5", "6", "7", "8")
]


In [None]:
def pie_plot(grade,lab1,val1,lab2,val2):
    """Show two side-by-side donut charts (ELA left, MATH right) for one grade.

    Parameters
    ----------
    grade : str
        Text concatenated into the trace names, the centre annotations and
        the figure title.
    lab1, val1
        Labels and values of the left (ELA) chart.
    lab2, val2
        Labels and values of the right (MATH) chart.
    """
    ela_name  = "ELA" + grade
    math_name = "MATH" + grade

    trace1 = go.Pie(labels = lab1,
                    values = val1,
                    name   = ela_name,
                    hole   = .45,
                    hoverinfo = "label+percent+name",
                    domain = dict(x = [0,.48]),
                    marker = dict(colors = ['orange','lightgrey'],
                                  line   = dict(width = 1 ,color = "black")))

    trace2 = go.Pie(labels = lab2,
                    values = val2,
                    name   = math_name,
                    hole   = .45,
                    hoverinfo = "label+percent+name",
                    domain = dict(x = [.54,1]),
                    marker = dict(colors = [ 'skyblue', 'gold'],
                                  line   = dict(width = 1 ,color = "black")))

    # Subject captions placed inside the hole of each donut.
    captions = [
        dict(text = caption,
             showarrow = False,
             font = dict(size = 15),
             x = x_pos,
             y = .5)
        for caption, x_pos in ((ela_name, .2), (math_name, .83))
    ]

    layout = go.Layout(dict(title = "Overall Grade" + grade + "ELA & Math  Proficiency",
                            annotations = captions))

    fig = go.Figure(data = [trace1, trace2], layout = layout)
    py.iplot(fig)
    plt.show()

In [None]:

def ela_math_fours(grade):
    """Plot horizontal bars of summed ELA and Math "4s" columns for one grade.

    ``grade`` is the grade number as a string. For each subject, the columns
    of the module-level ``schools`` frame whose name contains
    "Grade <grade> <subject> 4s" are selected, the first match is dropped,
    and the remaining columns are summed over all schools.
    """
    def _fours_by_group(subject):
        # Column sums for one subject, labelled by the text after the first
        # "-" in the column name and sorted by ascending count.
        wanted = "Grade " + grade + " " + subject + " 4s"
        matched = schools.columns[schools.columns.str.contains(wanted)][1:]
        totals = schools[matched].sum().reset_index()
        totals.columns = ["ethnicity","count"]
        totals["ethnicity"] = totals["ethnicity"].str.split("-").str[1]
        return totals.sort_values(by = "count",ascending = True)

    ela  = _fours_by_group("ELA")
    math = _fours_by_group("Math")

    ela_bar = go.Bar(x = ela["count"],
                     y = ela["ethnicity"],
                     orientation = "h",
                     name = "ELA",
                     marker = dict(color = 'rgba(58, 71, 80, 0.6)',
                                   line = dict(width = 2 ,
                                               color = 'rgba(58, 71, 80, 0.6)')))

    math_bar = go.Bar(x = math["count"],
                      y = math["ethnicity"],
                      orientation = "h",
                      name = "MATH",
                      marker = dict(color = 'rgba(246, 78, 139, 1.0)',
                                    line = dict(width = 1 ,
                                                color = 'rgba(246, 78, 139, 1.0)')))

    x_axis = dict(showgrid = True,
                  title = "count",
                  ticks = "outside",
                  tick0 = 0)
    y_axis = dict(showgrid = True,
                  ticks = "outside",
                  tick0 = 5,
                  ticklen = 18,
                  mirror = "ticks",
                  tickfont = dict(size =12))

    # Wide left/right margins leave room for the long group labels.
    layout = go.Layout(dict(title  = "ELA & MATH 4's by Ethnicity Types for Grade " + grade,
                            xaxis  = x_axis,
                            yaxis  = y_axis,
                            margin = dict(l=250,t = 100 ,r = 250 , b =100)))

    fig = go.Figure(data = [ela_bar, math_bar], layout = layout)

    py.iplot(fig)

# Grade 3

In [None]:
# Grade 3 totals over all schools: students tested vs. students scoring 4.
ela_totals  = grade_three[["Grade 3 ELA - All Students Tested","Grade 3 ELA 4s - All Students"]].sum()
math_totals = grade_three[["Grade 3 Math - All Students tested","Grade 3 Math 4s - All Students"]].sum()

# Labels and values are passed straight through instead of being stored in a
# global named `lab`, which overwrote the lab() labelling function and broke
# re-running the eth_label cell.
pie_plot(" 3 ",ela_totals.keys(),ela_totals.values,math_totals.keys(),math_totals.values)

In [None]:
# Grade 3: summed ELA / Math "4s" columns as horizontal bars.
ela_math_fours("3")

# Grade 4 

In [None]:
# Grade 4 totals over all schools: students tested vs. students scoring 4.
ela_totals  = grade_four[["Grade 4 ELA - All Students Tested","Grade 4 ELA 4s - All Students"]].sum()
math_totals = grade_four[["Grade 4 Math - All Students Tested","Grade 4 Math 4s - All Students"]].sum()

# Labels and values are passed straight through instead of being stored in a
# global named `lab`, which overwrote the lab() labelling function and broke
# re-running the eth_label cell.
pie_plot(" 4 ",ela_totals.keys(),ela_totals.values,math_totals.keys(),math_totals.values)

In [None]:
# Grade 4: summed ELA / Math "4s" columns as horizontal bars.
ela_math_fours("4")

# Grade 5

In [None]:
# Grade 5 totals over all schools: students tested vs. students scoring 4.
ela_totals  = grade_five[["Grade 5 ELA - All Students Tested","Grade 5 ELA 4s - All Students"]].sum()
math_totals = grade_five[["Grade 5 Math - All Students Tested","Grade 5 Math 4s - All Students"]].sum()

# Labels and values are passed straight through instead of being stored in a
# global named `lab`, which overwrote the lab() labelling function and broke
# re-running the eth_label cell.
pie_plot(" 5 ",ela_totals.keys(),ela_totals.values,math_totals.keys(),math_totals.values)

In [None]:
# Grade 5: summed ELA / Math "4s" columns as horizontal bars.
ela_math_fours("5")

# Grade 6

In [None]:
# Grade 6 totals over all schools: students tested vs. students scoring 4.
ela_totals  = grade_six[["Grade 6 ELA - All Students Tested","Grade 6 ELA 4s - All Students"]].sum()
math_totals = grade_six[["Grade 6 Math - All Students Tested","Grade 6 Math 4s - All Students"]].sum()

# Labels and values are passed straight through instead of being stored in a
# global named `lab`, which overwrote the lab() labelling function and broke
# re-running the eth_label cell.
pie_plot(" 6 ",ela_totals.keys(),ela_totals.values,math_totals.keys(),math_totals.values)

In [None]:
# Grade 6: summed ELA / Math "4s" columns as horizontal bars.
ela_math_fours("6")

# Grade 7

In [None]:
# Grade 7 totals over all schools: students tested vs. students scoring 4.
ela_totals  = grade_seven[["Grade 7 ELA - All Students Tested","Grade 7 ELA 4s - All Students"]].sum()
math_totals = grade_seven[["Grade 7 Math - All Students Tested","Grade 7 Math 4s - All Students"]].sum()

# Labels and values are passed straight through instead of being stored in a
# global named `lab`, which overwrote the lab() labelling function and broke
# re-running the eth_label cell.
pie_plot(" 7 ",ela_totals.keys(),ela_totals.values,math_totals.keys(),math_totals.values)

In [None]:
# Grade 7: summed ELA / Math "4s" columns as horizontal bars.
ela_math_fours("7")

# Grade 8

In [None]:
# Grade 8 totals over all schools: students tested vs. students scoring 4.
ela_totals  = grade_eight[["Grade 8 ELA - All Students Tested","Grade 8 ELA 4s - All Students"]].sum()
math_totals = grade_eight[["Grade 8 Math - All Students Tested","Grade 8 Math 4s - All Students"]].sum()

# Labels and values are passed straight through instead of being stored in a
# global named `lab`, which overwrote the lab() labelling function and broke
# re-running the eth_label cell.
pie_plot(" 8 ",ela_totals.keys(),ela_totals.values,math_totals.keys(),math_totals.values)

In [None]:
# Grade 8: summed ELA / Math "4s" columns as horizontal bars.
ela_math_fours("8")

# overall Students(all grades) scored 4's by student types

In [None]:
# Student groups whose "4s" columns are aggregated over all grades.
cols = ['American Indian or Alaska Native',
        'Black or African American',
        'Hispanic or Latino',
        'Asian or Pacific Islander',
        'White',
        'Multiracial',
        'Limited English Proficient',
        'Economically Disadvantaged']

# Case-insensitive match on any group name; the first matching column is dropped.
pattern = "|".join(cols)
selected_cols = schools.columns[schools.columns.str.contains(pattern,case=False)][1:]

# Column sums as a two-column frame: original column name and total.
new_df = schools[selected_cols].sum().reset_index()
new_df.columns = ["types","count"]

# Per subject: keep that subject's columns, reduce each name to the text after
# the first "-", total per group and sort by ascending count.
_per_subject = {}
for _needle, _subject in (("ELA", "ELA"), ("Math", "MATH")):
    _part = new_df[new_df["types"].str.contains(_needle)].copy()
    _part["types"] = _part["types"].str.split("-").str[1]
    _part = _part.groupby("types")["count"].sum().reset_index()
    _part["sub"] = _subject
    _per_subject[_subject] = _part.sort_values(by = "count" ,ascending=True)

new_df_ela = _per_subject["ELA"]
new_df_mat = _per_subject["MATH"]

em_df = pd.concat([new_df_ela, new_df_mat], axis=0)

# One horizontal bar trace per subject.
trace1, trace2 = [
    go.Bar(x = subject_df["count"],
           y = subject_df["types"],
           orientation = "h",
           name = subject,
           marker = dict(line = dict(width = 1, color = "white")),
          )
    for subject, subject_df in (("ELA", new_df_ela), ("MATH", new_df_mat))
]

layout = go.Layout(dict(title = "overall Students(all grades) scored 4's by student types",
                        barmode = "stack",
                        margin  = dict(r=250,l=250,t=100,b=100),
                        xaxis   = dict(title = "count", showgrid = True),
                        yaxis   = dict(showgrid = True),
                        height  = 600))

data = [trace1, trace2]

fig = go.Figure(data = data, layout = layout)

py.iplot(fig)

# K - Means Clustering
* k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.

## Data Preparation

In [None]:
# Work on a copy so the clustering preparation leaves `schools` untouched.
analysis_data = schools.copy()

# Identifier / economic columns kept alongside the percentage columns.
cols  = ["District",'Economic Need Index','School Income Estimate']

# Column groups picked out by substring match on the column name.
percent_cols, ethnic_cols, ela_cols, mat_cols = [
    analysis_data.columns[analysis_data.columns.str.contains(fragment)]
    for fragment in ("%", "Percent", "ELA", "Math")
]


In [None]:
# Select the modelling columns: the hand-picked ones plus every "%" and
# "Percent" column.  The names are gathered in a plain list because
# `Index | list` is not a set union in current pandas.
wanted = [*percent_cols, *cols, *ethnic_cols]
columns = analysis_data.columns[analysis_data.columns.isin(wanted)]

# Explicit copy so the column assignments below write to an independent frame.
analysis_data = analysis_data[columns].copy()

# Columns that contain at least one missing value.
missing_val_cols = analysis_data.columns[analysis_data.isnull().any()]

# Fill each gap with the mean of that column within the school's district.
# Only the column being filled is aggregated, instead of averaging the whole
# frame once per column.
for col in missing_val_cols:
    district_mean = analysis_data.groupby("District")[col].transform("mean")
    analysis_data[col] = analysis_data[col].fillna(district_mean)

## Finding Optimal number of clusters

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise the selected columns on a copy of the prepared data.
scaled = analysis_data.copy()

std = StandardScaler()
scaled[columns] = std.fit_transform(scaled[columns])

# Record the KMeans inertia for k = 1 .. 19 clusters (range(1, 20) stops at 19).
clust_range  = range(1,20)
clust_errors = []

for k in clust_range:
    # Fixed seed, the same value the silhouette cell passes, so the elbow
    # curve is identical from run to run.
    cluster = KMeans(n_clusters=k, random_state=10)
    # Only the fitted inertia is needed, so the labels are not requested.
    cluster.fit(scaled)
    clust_errors.append(cluster.inertia_)

cluster_df = pd.DataFrame({"cluster_number":clust_range,"error":clust_errors})


## Elbow Plot
* Plotting the cluster number and their respective inertia.

In [None]:
# Inertia against the number of clusters, drawn as a black line with orange markers.
_elbow_marker = dict(color="orange", size=13, line=dict(width=1, color="black"))
_elbow_line = dict(color="black", width=2)

trace1 = go.Scatter(
    x=cluster_df["cluster_number"],
    y=cluster_df["error"],
    mode="markers+lines",
    marker=_elbow_marker,
    line=_elbow_line,
)
data = [trace1]

layout = go.Layout(
    title="ELBOW PLOT",
    height=550,
    width=780,
    xaxis=dict(title="clusters"),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## Silhouette analysis :
* Silhouette analysis can be used to study the separation distance between the resulting clusters. The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters, and thus provides a way to visually assess parameters such as the number of clusters. This measure has a range of [-1, 1].

* Silhouette coefficients (as these values are referred to as) near +1 indicate
 that the sample is far away from the neighboring clusters. A value of 0 indicates that the sample is on 
 or very close to the decision boundary between two neighboring clusters .

* Silhouette analysis is used to choose an optimal value for n_clusters: prefer a number of clusters whose silhouette scores are above average and show little fluctuation. Here, 4 clusters are selected.

In [None]:
# Silhouette analysis for 2 to 5 clusters.
#
# For every candidate cluster count one two-panel figure is built and stored
# in `figures`:
#   * left  - the sorted silhouette coefficient of every sample, one filled
#             band per cluster, plus a dashed red line at the average score;
#   * right - the samples (red) and the fitted cluster centres (blue) on the
#             first two columns of the scaled data.
#
# Coefficients lie in [-1, 1]: values near +1 mean the sample is far from the
# neighbouring clusters, values near 0 mean it sits on or close to the
# boundary between two clusters.

from sklearn.metrics import silhouette_samples,silhouette_score
import matplotlib.cm as cm

figures = []
range_clusters = range(2,6)
# "District" is left out of the data clustered in this cell.
scaled_df = scaled.drop("District",axis= 1)

for n_clusters in range_clusters:

    # Top of the left-hand y axis: one row per sample plus a 10-row gap
    # before, between and after the cluster bands.
    y_top = len(scaled_df) + (n_clusters + 1) * 10

    fig = tls.make_subplots(rows=1, cols=2,
                              print_grid=False)

    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])

    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, y_top])

    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(scaled_df)

    silhouette_avg = silhouette_score(scaled_df, cluster_labels)
    print("For clusters : ", n_clusters,
          "The average silhouette_score  :", silhouette_avg)

    sample_silhouette_values = silhouette_samples(scaled_df, cluster_labels)
    y_lower = 10

    for i in range(n_clusters):

        # Boolean indexing returns a copy, so the in-place sort below does
        # not disturb sample_silhouette_values.
        cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        cluster_silhouette_values.sort()

        size_cluster_i = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5),
                                 fill='tozerox')

        fig.append_trace(filled_area, 1, 1)

        # Leave a 10-row gap before the next cluster's band.
        y_lower = y_upper + 10

    # Vertical guide at the average score.  Both end points need an x value;
    # with a single x against two y values no line segment is drawn.
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                           y=[0, y_top],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash',
                                     width =1) )

    fig.append_trace(axis_line, 1, 1)

    # Right panel: every sample on the first two columns of scaled_df.
    clusters = go.Scatter(x=scaled_df.iloc[:, 0],
                          y=scaled_df.iloc[:, 1],
                          showlegend=False,
                          mode='markers',
                          marker=dict(color="red",
                                     size=4)
                         )
    fig.append_trace(clusters, 1, 2)

    # Right panel: fitted cluster centres on the same two columns.
    centers_ = clusterer.cluster_centers_

    centers = go.Scatter(x=centers_[:, 0],
                         y=centers_[:, 1],
                         showlegend=False,
                         mode='markers',
                         marker=dict(color='blue', size=10,
                                     line=dict(color='black',
                                               width=1))
                        )

    fig.append_trace(centers, 1, 2)

    fig['layout']['xaxis2'].update(title='Feature space 1',
                                   zeroline=False)
    fig['layout']['yaxis2'].update(title='Feature space 2',
                                  zeroline=False)

    fig['layout'].update(title="Silhouette analysis for KMeans clustering "
                         "with clusters = %d" % n_clusters)

    figures.append(fig)

In [None]:
# figures holds one plot per n_clusters in range(2, 6), in order, so index 3 is the 5-cluster plot.
py.iplot(figures[3])

In [None]:
# figures holds one plot per n_clusters in range(2, 6), in order, so index 2 is the 4-cluster plot.
py.iplot(figures[2])

## Clustering 
Selecting four clusters.

In [None]:
#Selecting four clusters
km = KMeans(n_clusters = 4)

# Attach the cluster number of every school to the scaled frame.  Only the
# feature columns are passed to KMeans, so re-running this cell does not feed
# the previously assigned "clusters" column back into the model.
scaled["clusters"] = km.fit_predict(scaled[columns])

analysis_pca = scaled.copy()

from sklearn.preprocessing import StandardScaler
#Normalizing values
std = StandardScaler()
analysis_pca[columns] = std.fit_transform(analysis_data[columns])

# Project the feature columns onto three principal components.  The
# decomposition is fitted once and its three output columns are stored,
# instead of refitting PCA for every component.
pca = PCA(n_components = 3)
components = pca.fit_transform(analysis_pca[columns])
analysis_pca["principal_Component_one"] = components[:,0]
analysis_pca["principal_Component_two"] = components[:,1]
analysis_pca["principal_Component_three"] = components[:,2]
    

# Visualizing clusters in three dimensional space

In [None]:
# Marker colour per cluster number (list position = cluster number).
_pca_colours = [
    "rgb(127, 255, 0)",
    "rgb(123, 104, 238)",
    "rgb(255, 69, 0)",
    "rgb(255, 0, 255)",
]


def _pca_cluster_trace(number, colour):
    """3-D marker trace of one cluster's schools on the three principal components."""
    members = analysis_pca[analysis_pca["clusters"] == number]
    return go.Scatter3d(
        x=members["principal_Component_one"],
        y=members["principal_Component_two"],
        z=members["principal_Component_three"],
        mode="markers",
        marker=dict(color=colour, size=5, line=dict(width=.01, color="black")),
        name="CLUSTER " + str(number),
    )


def _pca_scene_axis(title):
    """Axis settings shared by the three scene axes; only the title differs."""
    return dict(
        title=title,
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)',
    )


trace, trace1, trace2, trace3 = [
    _pca_cluster_trace(number, colour)
    for number, colour in enumerate(_pca_colours)
]

data = [trace, trace1, trace2, trace3]

_scene = dict(
    xaxis=_pca_scene_axis("PC 1"),
    yaxis=_pca_scene_axis("PC 2"),
    zaxis=_pca_scene_axis("PC 3"),
    camera=dict(
        up=dict(x=0, y=0, z=1),
        eye=dict(x=-1.7428, y=1.0707, z=0.7100),
    ),
    aspectratio=dict(x=1, y=1, z=0.7),
    aspectmode='manual',
)

layout = go.Layout(
    title="Visualizing Clusters",
    scene=_scene,
    autosize=False,
    height=800,
    width=800,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# Analysing Cluster Components :

# Number of schools for each cluster

In [None]:
#concat cluster to original data
cluster_schools = schools.copy()
cluster_schools["cluster"]  =  "CLUSTER - " + scaled["clusters"].astype(str)

cl = ['CLUSTER - 0', 'CLUSTER - 1', 'CLUSTER - 2', 'CLUSTER - 3']
plt.figure(figsize=(8,6))
ax = sns.countplot( y = cluster_schools["cluster"],linewidth = 1 ,
                   order = cluster_schools["cluster"].value_counts().index,
                   edgecolor = "k"*cluster_schools["cluster"].nunique(),
                   palette = "husl")
plt.grid(True)

for i,j in enumerate(cluster_schools["cluster"].value_counts()):
    ax.text(.7,i,j,fontsize = 20)
    
plt.title("Count of schools for each cluster")
plt.show()

#  school location by Clusters
 * latitude and longitude location of schools for each cluster

In [None]:
# Marker colour for each cluster label, in plotting order.
_location_colours = [
    ("CLUSTER - 0", "rgb(127, 255, 0)"),
    ("CLUSTER - 1", "rgb(123, 104, 238)"),
    ("CLUSTER - 2", "rgb(255, 69, 0)"),
    ("CLUSTER - 3", "rgb(255, 0, 255)"),
]


def _location_trace(label, colour):
    """Longitude/latitude markers for the schools of one cluster, with the school name as hover text."""
    members = cluster_schools[cluster_schools["cluster"] == label]
    return go.Scatter(
        x=members["Longitude"],
        y=members["Latitude"],
        mode="markers",
        marker=dict(color=colour, line=dict(width=1, color="black"), size=8),
        name=label,
        text=members["School Name"],
    )


trace, trace1, trace2, trace3 = [
    _location_trace(label, colour) for label, colour in _location_colours
]

data = [trace, trace1, trace2, trace3]

layout = go.Layout(
    title="Clusters by school location ",
    xaxis=dict(title="Longitude"),
    yaxis=dict(title="Latitude"),
    autosize=False,
    height=600,
    width=780,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
plt.show()

# Distribution of Economic Need Index in Each Cluster

In [None]:
cl = ['CLUSTER - 0', 'CLUSTER - 1', 'CLUSTER - 2', 'CLUSTER - 3']
length = len(cl)
cs = ["r","b","g","orange"]

plt.figure(figsize=(13,8))
# One density curve and one dashed mean line per cluster, sharing a colour.
for label, colour in zip(cl, cs):
    in_cluster = cluster_schools["cluster"] == label
    eni = cluster_schools.loc[in_cluster, "Economic Need Index"].dropna()

    sns.kdeplot(eni, label=label, shade=True, linewidth=2, color=colour)
    plt.axvline(eni.mean(), color=colour, linestyle="dashed", label=label + "  Mean")
    plt.legend(loc="best", prop={"size": 12})
    plt.title("Distribution of Economic Need Index in Each Cluster")

# Distribution of School Income Estimate in Each Cluster
The cluster with the highest average ENI also has the lowest average school income estimate.

In [None]:
cl = ['CLUSTER - 0', 'CLUSTER - 1', 'CLUSTER - 2', 'CLUSTER - 3']
length = len(cl)
cs = ["r","b","g","orange"]

plt.figure(figsize=(12,8))
# One density curve and one dashed mean line per cluster, sharing a colour.
for label, colour in zip(cl, cs):
    in_cluster = cluster_schools["cluster"] == label
    income = cluster_schools.loc[in_cluster, "School Income Estimate"].dropna()

    sns.kdeplot(income, label=label, shade=True, linewidth=2, color=colour)
    plt.axvline(income.mean(), color=colour, linestyle="dashed", label=label + "  Mean")
    plt.legend(loc="best", prop={"size": 12})
    plt.title("Distribution of School Income Estimate in Each Cluster")

# Percentage of community schools in each cluster

In [None]:
# One donut chart per cluster showing the share of each 'Community School?' value.
length = len(cl)

# Two charts per row.  The grid size is computed with integer arithmetic:
# plt.subplot expects whole numbers, and "/" yields a float in Python 3.
n_cols = 2
n_rows = -(-length // n_cols)   # ceiling division, so an odd count still fits

plt.figure(figsize=(12,12))
for position, label in enumerate(cl, start=1):
    plt.subplot(n_rows, n_cols, position)
    (cluster_schools[cluster_schools["cluster"] == label]['Community School?']
     .value_counts().plot.pie(autopct = "%1.0f%%",
                              colors  = ["grey","orange"],
                              wedgeprops = {"linewidth" : 1,
                                            "edgecolor" : "white"},
                              shadow = True,
                              startangle = 70,
                             )
    )

    # A white disc over the centre turns the pie into a donut.
    circ = plt.Circle((0,0),.7,color = "white")
    plt.gca().add_artist(circ)
    plt.title(label)
    plt.ylabel("")


# Mean Student type percent by clusters
* Average student ethnicity percentage in each cluster.

In [None]:
# Stacked bars of the mean ethnicity percentages within each cluster.
cols = ['Percent Asian', 'Percent Black', 'Percent Hispanic', 'Percent White']
# `color` is the keyword pandas plotting accepts for the bar colours, and the
# edge colour is a single colour code rather than a repeated "kkkk" string.
cluster_schools.groupby("cluster")[cols].mean().plot(kind="bar",stacked  = True,
                                                     figsize = (12,6),
                                                     width = .3, linewidth = 1,edgecolor = "k",
                                                     color = sns.color_palette("husl"),alpha=.9)
plt.xticks(rotation = 0)
plt.title("Mean Student ethnicity percent by clusters")
plt.grid(True)
plt.show()

# Mean School performance attribute percent by clusters

In [None]:
# Grouped bars of the mean of every "%" column within each cluster.
# `color` is the keyword pandas plotting accepts for the bar colours, and the
# edge colour is a single colour code that no longer depends on the length of
# a list defined in another cell.
cluster_schools.groupby("cluster")[percent_cols].mean().plot(kind = "bar",
                                                             figsize = (12,7),width = .5,
                                                             linewidth = 1,edgecolor = "k",
                                                             color = sns.color_palette("husl"))
plt.xticks(rotation = 0)
plt.title("Mean School performance attribute percent by clusters")
plt.grid(True)
plt.show()

# Mean Percent of Students Chronically Absent by clusters

In [None]:
# Mean share of chronically absent students within each cluster.
# The edge colour is a single colour code rather than a repeated "kkkk"
# string built from the length of a list defined in another cell.
cluster_schools.groupby("cluster")[['Percent of Students Chronically Absent']].mean().plot(kind = "bar",
                                                                                           figsize = (12,6),
                                                                                           linewidth = 1,
                                                                                           edgecolor = "k",
                                                                                           color = "royalblue")
plt.xticks(rotation = 0)
plt.title("Mean Percent of Students Chronically Absent by clusters")
plt.show()

# Performance Targets by clusters

In [None]:
# One grouped bar chart per "Rating" column: number of schools at each target
# level, split by cluster.
rate_cols = cluster_schools.columns[cluster_schools.columns.str.contains("Rating")]
length = len(rate_cols)
hue_ord = ['Meeting Target', 'Exceeding Target', 'Approaching Target',
       'Not Meeting Target']

# Two charts per row; at least the original four rows, more if there are
# more than eight rating columns.
n_rows = max(4, -(-length // 2))

plt.figure(figsize=(14,20))
for position, rating in enumerate(rate_cols, start=1):

    plt.subplot(n_rows,2,position)
    # Name the counts explicitly so the frame always has the columns
    # "cluster", <rating> and "count", whatever name value_counts assigns.
    dat = (cluster_schools.groupby("cluster")[rating]
           .value_counts()
           .rename("count")
           .reset_index())
    sns.barplot(x = dat["cluster"] , y = dat["count"],
                hue = dat[rating],
                hue_order = hue_ord,
                linewidth = 1 ,
                edgecolor = "k",      # single colour code, not a repeated "kkk..." string
                palette = "Set1")
    plt.title(rating,color= "b")
    plt.grid(True)
    # "upper right" is the valid location name ("top right" is not one).
    plt.legend(loc = "upper right" , prop = {"size" : 8})

# Identifying Schools
* The cluster of interest contains schools with a high average Economic Need Index and low School Income Estimates. Schools within this cluster have low average percentages on the school performance attributes ('Rigorous Instruction %', 'Collaborative Teachers %', 'Supportive Environment %', 'Effective School Leadership %', 'Strong Family-Community Ties %', 'Trust %').
* These schools have a high count of "Not Meeting Target" ratings, which the School Quality Snapshots often refer to as poor performance, and the cluster has the highest percentage of community schools among all clusters.


In [None]:
# value_counts orders clusters from most to fewest schools, so the last
# entry is the cluster with the fewest schools.
_smallest_cluster = cluster_schools["cluster"].value_counts().iloc[-1:]
cl_ud = _smallest_cluster.keys()

# Schools belonging to that cluster, then their distinct names as a list.
_in_smallest = cluster_schools["cluster"].isin(cl_ud)
cl_sc = cluster_schools[_in_smallest]
cl_sc["School Name"].unique().tolist()

# SHSAT (Specialized High School Test) data :
* District 5 (Central Harlem) SHSAT (Specialized High School Test) data from the NYC Department of Education.

* The Specialized High Schools Admissions Test (SHSAT) is an examination administered to eighth and ninth grade students
   residing in New York City and used to determine admission to all but one of the city's nine Specialized High Schools. 
   The test is given each year in October and November, and students are informed of their results the following March. 
   Those who receive offers decide by the middle of March whether to attend the school the following September. 
  The test is independently produced and graded by American Guidance Service,
  a subsidiary of Pearson Education, under contract to the New York City Department of Education.
 
*  The SHSAT assesses knowledge and skills. These skills consist of the ability to comprehend English prose, 
  to demonstrate understanding of revising and editing skills central to writing in English, and to use problem-solving skills 
  in mathematics. The test measures knowledge and skills students have gained over the course of their education.
  Keeping up with schoolwork throughout the year is the best possible preparation.


In [None]:
# Number of distinct DBN values in the SHSAT table.
n_shsat_schools = shsat["DBN"].nunique()
print("Total Number of Schools : ", n_shsat_schools)

## Total Students who registered and took test from 2013 - 2016

In [None]:
# Yearly totals of registered students and of students who sat the test.
_registered_col = "Number of students who registered for the SHSAT"
_took_col = "Number of students who took the SHSAT"

tst = shsat.groupby("Year of SHST")[[_registered_col, _took_col]].sum().reset_index()


def _yearly_bar(column, outline_colour, label):
    """Bar trace of one yearly total with a coloured outline."""
    return go.Bar(
        x=tst["Year of SHST"],
        y=tst[column],
        marker=dict(line=dict(width=2, color=outline_colour)),
        opacity=.8,
        name=label,
    )


trace1 = _yearly_bar(_registered_col, "blue", "Registered Students")
trace2 = _yearly_bar(_took_col, "orange", "Test taken students")

layout = go.Layout(
    title="Total Students who registered and took test from 2013 - 2016",
    xaxis=dict(showgrid=True, title="Year"),
    yaxis=dict(showgrid=True, title="Count"),
    height=500,
    width=700,
    paper_bgcolor="rgb(230, 230,230)",
    plot_bgcolor="rgb(230, 230,230)",
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## Grade wise Total Students who registered and took test

In [None]:
# Label grades as "Grade <n>".  Values that already carry the prefix are left
# alone, so re-running this cell does not produce "Grade Grade 8" and break
# the later "Grade 8" / "Grade 9" filters.
grade_text = shsat["Grade level"].astype(str)
shsat["Grade level"] = grade_text.where(grade_text.str.startswith("Grade "),
                                        "Grade " + grade_text)

# Totals of registered students and test takers per grade.
gd_st = shsat.groupby("Grade level")[["Number of students who registered for the SHSAT","Number of students who took the SHSAT"]].sum().reset_index()

trace1 = go.Bar(x = gd_st["Grade level"],
                y = gd_st["Number of students who registered for the SHSAT"],
                marker = dict(color = "rgb(255, 51, 51)",
                              line = dict(width = 2,color = "grey")
                             ),
                opacity = .8,
                name = "Registered Students",
               )

trace2 = go.Bar(x = gd_st["Grade level"],
                y = gd_st["Number of students who took the SHSAT"],
                marker = dict(color = "rgb(102, 0, 255)",
                              line = dict(width = 2,color = "grey")
                             ),
                opacity = .8,
                name = "Test taken students",
               )

layout = go.Layout(dict(title = "Grade wise Total Students who registered and took test",
                        xaxis = dict(showgrid = True,
                                     title = "Grade",
                                    ),
                         yaxis = dict(showgrid = True,
                                     title = "Count",
                                    ),
                        height = 500,
                        width  = 600,
                        paper_bgcolor = "rgb(230, 230,230)",
                        plot_bgcolor  = "rgb(230, 230,230)",
                       ),
                  )


data = [trace1,trace2]
fig  = go.Figure(data = data,layout = layout)
py.iplot(fig)

# Yearly Grade wise Total Students who registered and took test

In [None]:
# Totals per year and grade of registered students and of test takers.
_reg_col = "Number of students who registered for the SHSAT"
_sat_col = "Number of students who took the SHSAT"

gd_yr = shsat.groupby(["Year of SHST", "Grade level"])[[_reg_col, _sat_col]].sum().reset_index()


def _grade_year_bar(grade, column, colour, label):
    """Bar trace of one measure across years, restricted to a single grade."""
    rows = gd_yr[gd_yr["Grade level"] == grade]
    return go.Bar(
        x=rows["Year of SHST"],
        y=rows[column],
        marker=dict(color=colour, line=dict(width=2, color="grey")),
        opacity=.8,
        name=label,
    )


trace1 = _grade_year_bar("Grade 8", _reg_col, "rgb(0, 153, 255)", "Grade 8 - Registered Students")
trace2 = _grade_year_bar("Grade 9", _reg_col, "rgb(255, 0, 255)", "Grade 9 - Registered Students")
trace3 = _grade_year_bar("Grade 8", _sat_col, "rgb(255, 153, 102)", "Grade 8 - Test taken Students")
trace4 = _grade_year_bar("Grade 9", _sat_col, "rgb(255, 51, 51)", "Grade 9 - Test taken Students")

# Ordered so the two Grade 8 bars sit together, followed by the two Grade 9 bars.
data = [trace1, trace3, trace2, trace4]
layout = go.Layout(
    title="Yearly Grade wise Total Students who registered and took test",
    xaxis=dict(showgrid=True, title="Year"),
    yaxis=dict(showgrid=True, title="Count"),
    paper_bgcolor="rgb(230, 230,230)",
    plot_bgcolor="rgb(230, 230,230)",
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# Total Students who registered and took test from 2013 - 2016

In [None]:
# Marker colour for each test year, in plotting order.
_year_colours = [(2013, "blue"), (2014, "red"), (2015, "green"), (2016, "cyan")]


def _year_scatter(year, colour):
    """Markers for one year's rows: test takers on x, registrations on y, school name as hover text."""
    rows = shsat[shsat["Year of SHST"] == year]
    return go.Scatter(
        y=rows["Number of students who registered for the SHSAT"],
        x=rows["Number of students who took the SHSAT"],
        mode="markers",
        marker=dict(size=8, color=colour, line=dict(width=1, color="black")),
        text=rows["School name"],
        name=str(year),
        opacity=.7,
    )


trace, trace1, trace2, trace3 = [
    _year_scatter(year, colour) for year, colour in _year_colours
]

data = [trace, trace1, trace2, trace3]

layout = go.Layout(
    title="Total Students who registered and took test from 2013 - 2016",
    xaxis=dict(title="Number of students who took the SHSAT", showgrid=True),
    yaxis=dict(title="Number of students who registered for the SHSAT", showgrid=True),
    autosize=False,
    height=600,
    width=780,
    paper_bgcolor="rgb(230, 230,230)",
    plot_bgcolor="rgb(230, 230,230)",
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)