# Data Cleaning and Descriptive Statistics
## (US Educational Finances 1992 - 2016)

# Table of Contents
    1. Notebook Prep
    2. Missing Values
    3. Duplicates
    4. Mixed-Type Data
    5. Descriptive Statistics
    6. Export

### 1. Notebook Prep

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Define the project root folder.
# The ACHIEVEMENT6_PATH environment variable, when set, overrides the
# default so the notebook can run on another machine without editing
# this cell; when it is unset the original location is used unchanged.

path = os.environ.get('ACHIEVEMENT6_PATH', r'D:\Achievement 6')

In [3]:
# Load the original state finances CSV from the project's data folder

raw_csv = os.path.join(path, '02 Data', 'Original Data', 'State Finances.csv')
df = pd.read_csv(raw_csv)

In [4]:
# Rename columns to lowercase snake_case.
# Every name maps to its lowercase form except ENROLL, which becomes 'enrolled'.

column_names = {
    'STATE': 'state',
    'YEAR': 'year',
    'ENROLL': 'enrolled',
    'TOTAL_REVENUE': 'total_revenue',
    'FEDERAL_REVENUE': 'federal_revenue',
    'STATE_REVENUE': 'state_revenue',
    'LOCAL_REVENUE': 'local_revenue',
    'TOTAL_EXPENDITURE': 'total_expenditure',
    'INSTRUCTION_EXPENDITURE': 'instruction_expenditure',
    'SUPPORT_SERVICES_EXPENDITURE': 'support_services_expenditure',
    'OTHER_EXPENDITURE': 'other_expenditure',
    'CAPITAL_OUTLAY_EXPENDITURE': 'capital_outlay_expenditure',
}

# Reassign the result rather than using inplace=True: the cell still leaves
# `df` with the renamed columns, but no longer mutates the loaded frame.
df = df.rename(columns = column_names)

In [5]:
# Get dimensions as (number of rows, number of columns)

df.shape

(1275, 12)

### 2. Missing Values

In [6]:
# Count missing values in each column
# NOTE(review): this cell only reports the nulls; no cell visible in this
# notebook drops or imputes them, so they appear to remain in the exported
# file - confirm that is intended.

df.isna().sum()

state                            0
year                             0
enrolled                        51
total_revenue                    0
federal_revenue                  0
state_revenue                    0
local_revenue                    0
total_expenditure                0
instruction_expenditure          0
support_services_expenditure     0
other_expenditure               51
capital_outlay_expenditure       0
dtype: int64

### 3. Duplicates

In [7]:
# Count fully duplicated rows (rows that repeat an earlier row in every column)

df.duplicated().sum()

0

No duplicate rows were found (the count above is 0).

### 4. Mixed-Type Data

In [8]:
# View the data type pandas assigned to each column

df.dtypes

state                            object
year                              int64
enrolled                        float64
total_revenue                     int64
federal_revenue                   int64
state_revenue                     int64
local_revenue                     int64
total_expenditure                 int64
instruction_expenditure           int64
support_services_expenditure      int64
other_expenditure               float64
capital_outlay_expenditure        int64
dtype: object

In [9]:
# Print the name of every column whose values do not all share one Python
# type (for example, strings mixed with numbers in an object column).
# Nothing is printed when every column is type-consistent.

for col in df.columns:
    # Series.map(type) replaces DataFrame.applymap(type), which is
    # deprecated in recent pandas releases and emits a FutureWarning.
    value_types = df[col].map(type)
    # More than one distinct type means the column is mixed. Unlike comparing
    # against row 0, this also works on an empty frame instead of raising
    # IndexError.
    if value_types.nunique() > 1:
        print(col)

No mixed-type data: the check above printed no column names.

### 5. Descriptive Statistics

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

df.describe()

Unnamed: 0,year,enrolled,total_revenue,federal_revenue,state_revenue,local_revenue,total_expenditure,instruction_expenditure,support_services_expenditure,other_expenditure,capital_outlay_expenditure
count,1275.0,1224.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1224.0,1275.0
mean,2004.0,917541.6,9102045.0,767779.9,4223743.0,4110522.0,9206242.0,4768010.0,2682587.0,429950.9,903467.5
std,7.213932,1066514.0,11759620.0,1146992.0,5549735.0,5489562.0,11992790.0,6300569.0,3357214.0,534789.3,1329473.0
min,1992.0,43866.0,465650.0,31020.0,0.0,22093.0,481665.0,265549.0,139963.0,11541.0,12708.0
25%,1998.0,264514.5,2189504.0,189957.5,1165776.0,715121.0,2170404.0,1171336.0,638076.0,103449.2,181507.0
50%,2004.0,649933.5,5085826.0,403548.0,2537754.0,2058996.0,5242672.0,2658253.0,1525471.0,271704.0,510428.0
75%,2010.0,1010532.0,10845160.0,827932.0,5055548.0,4755293.0,10744200.0,5561959.0,3222924.0,517222.2,966148.0
max,2016.0,6307022.0,89217260.0,9990221.0,50904570.0,36105260.0,85320130.0,43964520.0,26058020.0,3995951.0,10223660.0


### 6. Export

In [11]:
# Write the prepared frame to the project's "Prepared Data" folder
# NOTE(review): to_csv writes the row index by default, so the file gets an
# unnamed leading column that shows up as "Unnamed: 0" when read back.
# Consider index=False if no downstream notebook relies on that column - confirm.

cleaned_csv = os.path.join(path, '02 Data', 'Prepared Data', 'finances_cleaned.csv')
df.to_csv(cleaned_csv)