## Indian School Education Statistics
https://www.kaggle.com/datasets/vidyapb/indian-school-education-statistics

Цель проекта — спрогнозировать валовой коэффициент охвата образованием (gross enrollment ratio) за 2015-2016 годы по данным за 2012-2015 годы: проценту отсева, наличию в школах воды, электричества, компьютерного оборудования и туалетов. Задача решается как задача классификации, в которой общее количество учащихся разделено на две группы (два класса).

Ниже приведены этапы, выполненные в данном проекте:

1. Импортируются необходимые библиотеки и загружаются данные из CSV-файлов.
2. Выполняется предварительная обработка данных: исправляются названия штатов, а пропущенные значения (помеченные как «NR») заполняются нулями с помощью класса SimpleImputer.
3. По данным о проценте отсева вычисляются средние значения для каждой категории учащихся (мальчики, девочки, всего) и каждого уровня образования (начальное, старшее начальное, среднее и старшее среднее).
4. Аналогичные шаги выполняются для данных о наличии воды, компьютерного оборудования, электричества и туалетов для мальчиков и девочек.
5. Создаются наборы данных для каждого уровня образования, объединяя средние значения различных параметров.
6. Подготавливаются наборы данных для оценки, включая данные за 2015-2016 годы.
7. Строятся модели машинного обучения для каждого уровня образования, в данном случае Random Forest Classifier.
8. Оценивается производительность моделей с помощью различных метрик, таких как accuracy score, roc-auc score, precision score, log loss, f1 score и mean squared error.

In [1]:
# Suppress warnings so that they do not clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer #нужно для заполнения пропусков
from sklearn.model_selection import train_test_split #для создания подвыборок
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold #кроссвалидация
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, log_loss, f1_score, mean_squared_error

In [2]:
# Load the seven source tables; the paths are relative to the working directory.
# drop_out covers 2012-2015, every other table covers 2013-2016 (per the file names).
drop_out = pd.read_csv('dropout-ratio-2012-2015.csv')
enrol = pd.read_csv('gross-enrollment-ratio-2013-2016.csv')
comp = pd.read_csv('percentage-of-schools-with-comps-2013-2016.csv')
elect = pd.read_csv('percentage-of-schools-with-electricity-2013-2016.csv')
water = pd.read_csv('percentage-of-schools-with-water-facility-2013-2016.csv')
boys = pd.read_csv('schools-with-boys-toilet-2013-2016.csv')
girls = pd.read_csv('schools-with-girls-toilet-2013-2016.csv')

In [3]:
# Normalise four state names in the dropout table: collapse the doubled spaces
# and expand the "A & N Islands" abbreviation (presumably so the names match
# the spelling used in the other tables - TODO confirm).
drop_out['State_UT'] = drop_out['State_UT'].replace({
    'Arunachal  Pradesh': "Arunachal Pradesh",
    'Madhya  Pradesh': "Madhya Pradesh",
    'Tamil  Nadu': "Tamil Nadu",
    'A & N Islands': "Andaman & Nicobar Islands",
})

In [4]:
# Sort every table in place by state and then by year, renumbering the row
# index from 0. Column names are not uniform across the files: enrol uses
# 'Year', and water uses both 'State/UT' and 'Year'.
drop_out.sort_values(by=['State_UT','year'],inplace=True,ignore_index=True)
enrol.sort_values(by=['State_UT','Year'], inplace=True,ignore_index=True)
comp.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)
elect.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)
water.sort_values(by=['State/UT','Year'], inplace=True,ignore_index=True)
boys.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)
girls.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)

In [5]:
elect.head(3)

Unnamed: 0,State_UT,year,Primary_Only,Primary_with_U_Primary,Primary_with_U_Primary_Sec_HrSec,U_Primary_Only,U_Primary_With_Sec_HrSec,Primary_with_U_Primary_Sec,U_Primary_With_Sec,Sec_Only,Sec_with_HrSec.,HrSec_Only,All Schools
0,All India,2013-14,46.38,69.56,94.92,45.97,93.72,83.12,86.84,75.08,82.88,91.01,56.78
1,All India,2014-15,49.63,73.26,95.27,48.33,94.81,84.03,89.14,77.55,83.85,92.67,60.01
2,All India,2015-16,52.4,76.44,94.81,49.86,94.48,88.1,90.82,81.5,86.85,92.86,62.81


In [6]:
comp.head(3)

Unnamed: 0,State_UT,year,Primary_Only,Primary_with_U_Primary,Primary_with_U_Primary_Sec_HrSec,U_Primary_Only,U_Primary_With_Sec_HrSec,Primary_with_U_Primary_Sec,U_Primary_With_Sec,Sec_Only,Sec_with_HrSec.,HrSec_Only,All Schools
0,All India,2013-14,9.25,42.14,82.67,18.22,76.5,64.69,59.25,44.8,39.31,26.65,24.08
1,All India,2014-15,10.32,44.39,81.96,19.4,79.86,62.66,68.11,51.27,50.26,47.96,26.42
2,All India,2015-16,10.36,45.46,79.06,19.78,82.04,68.67,68.4,53.05,54.05,55.72,27.31


In [7]:
def CreateDataSets(mean_dataframes, tier):
    """Extract the feature columns of one education tier from an aggregated table.

    The first column of ``mean_dataframes`` is the key and must be named
    ``'State_UT'``. The remaining columns are consumed strictly by position,
    three per tier: tier 1 -> columns 1-3, tier 2 -> 4-6, tier 3 -> 7-9,
    tier 4 -> 10-12.

    NOTE(review): because the grouping is positional, the meaning of a tier
    depends entirely on the column order of the input table - confirm that
    the order matches the intended grouping for every source file.

    Args:
        mean_dataframes: DataFrame whose first column is ``'State_UT'``
            followed by at least 11 feature columns.
        tier: 1, 2, 3 or 4.

    Returns:
        DataFrame indexed by ``State_UT`` with three columns. For tier 4 on a
        table that has only 12 columns, the third column is an all-ones
        placeholder labelled ``0`` (callers drop it afterwards).

    Raises:
        ValueError: if ``tier`` is not one of 1, 2, 3, 4.
    """
    # Position of the first feature column of every tier.
    tier_start = {1: 1, 2: 4, 3: 7, 4: 10}
    if tier not in tier_start:
        raise ValueError('tier must be 1, 2, 3 or 4, got {!r}'.format(tier))

    columns = mean_dataframes.columns.to_list()
    print('Create Dataset with {} features'.format(len(columns)))

    first = tier_start[tier]
    key = mean_dataframes[columns[0]]
    feature_1 = mean_dataframes[columns[first]]
    feature_2 = mean_dataframes[columns[first + 1]]
    if tier == 4 and len(columns) <= 12:
        # Only two real columns are left for tier 4: pad with ones so the
        # result keeps the usual width. The filler reuses the input index so
        # that concat aligns it row-for-row even when the index is not 0..n-1.
        feature_3 = pd.Series(
            np.ones(len(mean_dataframes)), index=mean_dataframes.index, name=0
        )
    else:
        feature_3 = mean_dataframes[columns[first + 2]]

    result = pd.concat([key, feature_1, feature_2, feature_3], axis=1)
    return result.set_index('State_UT')

In [8]:
def get_df_name(df):
    """Return the name of a module-level variable bound to ``df``, plus ``'_'``.

    The lookup is by identity over ``globals()``. Names starting with an
    underscore are used only when no other name refers to the object, so an
    underscore-prefixed alias of the same object (for example one created by
    an interactive shell) does not shadow the real variable name.

    Raises:
        IndexError: if no module-level variable refers to ``df``.
    """
    matches = [name for name, value in globals().items() if value is df]
    if not matches:
        raise IndexError('no module-level variable refers to the given DataFrame')
    public = [name for name in matches if not name.startswith('_')]
    return (public or matches)[0] + '_'


def PreprocessFiles(dataframe, categories, year_filter=('2013-14', '2014-15'), year='year', city='State_UT'):
    """Average the given columns per state over the selected years.

    Args:
        dataframe: source table; must be bound to a module-level variable,
            because that variable's name becomes the column prefix.
        categories: names of the numeric columns to aggregate.
        year_filter: years to be used while aggregating the dataset. Any
            number of years is accepted; a single string is treated as one
            year.
        year: name of the year column in ``dataframe``.
        city: name of the state/UT column in ``dataframe``.

    Returns:
        DataFrame with a default integer index, the ``city`` column first and
        one ``<variable name>_mean_<category>`` column per category, one row
        per state (sorted by state).

    Raises:
        ValueError: if ``year_filter`` or ``categories`` is empty.

    Side effects:
        The ``categories`` columns of ``dataframe`` are converted to float
        in place (they may arrive as object dtype).
    """
    if isinstance(year_filter, str):
        year_filter = [year_filter]
    years = list(year_filter)
    categories = list(categories)
    if not years:
        raise ValueError('year_filter must contain at least one year')
    if not categories:
        raise ValueError('categories must contain at least one column name')

    # If a numeric column has object dtype it has to be converted to float
    # before it can be averaged. Done on the caller's frame on purpose: later
    # calls on the same frame rely on it.
    for categ in categories:
        dataframe[categ] = dataframe[categ].astype(float)

    selected = dataframe[dataframe[year].isin(years)]
    means = selected.groupby(city)[categories].mean()

    prefix = get_df_name(dataframe) + 'mean_'
    means.columns = [prefix + categ for categ in categories]
    # Turn the state key back into an ordinary first column.
    return means.reset_index()

In [9]:
drop_out.head(3)

Unnamed: 0,State_UT,year,Primary_Boys,Primary_Girls,Primary_Total,Upper Primary_Boys,Upper Primary_Girls,Upper Primary_Total,Secondary _Boys,Secondary _Girls,Secondary _Total,HrSecondary_Boys,HrSecondary_Girls,HrSecondary_Total
0,All India,2012-13,4.68,4.66,4.67,2.3,4.01,3.13,14.54,14.54,14.54,NR,NR,NR
1,All India,2013-14,4.53,4.14,4.34,3.09,4.49,3.77,17.93,17.79,17.86,1.48,1.61,1.54
2,All India,2014-15,4.36,3.88,4.13,3.49,4.6,4.03,17.21,16.88,17.06,0.25,NR,NR


In [10]:
# Replace the literal string 'NR' (seen in the dropout table above) with 0.
# NOTE(review): this makes an unreported value count as a 0 dropout rate in the
# later averages - confirm that this is intended.
imputer = SimpleImputer(missing_values = 'NR', strategy='constant', fill_value=0)
# Replace the literal string 'Uppe_r_Primary' with 0 (presumably a stray token
# somewhere in the data - TODO confirm where it occurs).
imputer_1 = SimpleImputer(missing_values = 'Uppe_r_Primary', strategy='constant', fill_value=0)

In [11]:
# Run both imputers over the whole dropout table. The column labels are saved
# first and re-attached when the result is wrapped in a DataFrame again.
drop_out_cols = drop_out.columns.to_list()
drop_out = imputer.fit_transform(drop_out)
drop_out = pd.DataFrame(imputer_1.fit_transform(drop_out), columns=drop_out_cols)

In [12]:
# Training features: per-state mean dropout over 2012-13 and 2013-14.
# columns[2:] skips the State_UT and year columns.
mean_drop_out_per_state = PreprocessFiles(drop_out, drop_out.columns[2:], year_filter=['2012-13', '2013-14'])

In [13]:
# Evaluation features: per-state dropout for the single year 2014-15.
test_drop_out = PreprocessFiles(drop_out, drop_out.columns[2:], year_filter=['2014-15'])

In [14]:
mean_drop_out_per_state.head(3)

Unnamed: 0,State_UT,drop_out_mean_Primary_Boys,drop_out_mean_Primary_Girls,drop_out_mean_Primary_Total,drop_out_mean_Upper Primary_Boys,drop_out_mean_Upper Primary_Girls,drop_out_mean_Upper Primary_Total,drop_out_mean_Secondary _Boys,drop_out_mean_Secondary _Girls,drop_out_mean_Secondary _Total,drop_out_mean_HrSecondary_Boys,drop_out_mean_HrSecondary_Girls,drop_out_mean_HrSecondary_Total
0,All India,4.605,4.4,4.505,2.695,4.25,3.45,16.235,16.165,16.2,0.74,0.805,0.77
1,Andaman & Nicobar Islands,1.09,0.785,0.945,0.0,1.315,0.87,6.965,5.765,6.38,18.3,11.175,15.005
2,Andhra Pradesh,3.805,3.72,3.765,3.335,3.815,3.57,12.08,13.31,12.685,7.655,5.425,6.07


In [15]:
# Split the averaged dropout table into one three-column feature table per
# education tier (1-4), each indexed by State_UT.
primary_drop_out = CreateDataSets(mean_drop_out_per_state, 1)
upp_drop_out = CreateDataSets(mean_drop_out_per_state, 2)
sec_drop_out = CreateDataSets(mean_drop_out_per_state, 3)
higher_drop_out = CreateDataSets(mean_drop_out_per_state, 4)

Create Dataset with 13 features
Create Dataset with 13 features
Create Dataset with 13 features
Create Dataset with 13 features


In [16]:
# Same per-tier split for the evaluation-year dropout table.
eval_primary_drop_out = CreateDataSets(test_drop_out, 1)
eval_upp_drop_out = CreateDataSets(test_drop_out, 2)
eval_sec_drop_out = CreateDataSets(test_drop_out, 3)
eval_higher_drop_out = CreateDataSets(test_drop_out, 4)

Create Dataset with 13 features
Create Dataset with 13 features
Create Dataset with 13 features
Create Dataset with 13 features


In [17]:
boys.head(3)

Unnamed: 0,State_UT,year,Primary_Only,Primary_with_U_Primary,Primary_with_U_Primary_Sec_HrSec,U_Primary_Only,U_Primary_With_Sec_HrSec,Primary_with_U_Primary_Sec,U_Primary_With_Sec,Sec_Only,Sec_with_HrSec.,HrSec_Only,All Schools
0,All India,2013-14,84.01,91.44,97.91,88.14,96.05,94.51,80.89,97.97,94.68,74.36,86.56
1,All India,2014-15,85.66,93.93,98.59,88.12,98.63,96.35,87.28,89.1,93.09,88.97,88.62
2,All India,2015-16,96.45,98.4,99.38,96.0,99.51,98.84,98.04,94.56,97.97,95.67,97.02


In [18]:
# Training features: per-state mean of the boys'-toilet figures over the
# default years (2013-14 and 2014-15).
mean_boys_per_state = PreprocessFiles(boys, boys.columns[2:])

In [19]:
# Evaluation features: boys'-toilet figures for 2015-16 only.
test_boys = PreprocessFiles(boys, boys.columns[2:], year_filter=['2015-16'])

In [20]:
mean_boys_per_state.head(3)

Unnamed: 0,State_UT,boys_mean_Primary_Only,boys_mean_Primary_with_U_Primary,boys_mean_Primary_with_U_Primary_Sec_HrSec,boys_mean_U_Primary_Only,boys_mean_U_Primary_With_Sec_HrSec,boys_mean_Primary_with_U_Primary_Sec,boys_mean_U_Primary_With_Sec,boys_mean_Sec_Only,boys_mean_Sec_with_HrSec.,boys_mean_HrSec_Only,boys_mean_All Schools
0,All India,84.835,92.685,98.25,88.13,97.34,95.43,84.085,93.535,93.885,81.665,87.59
1,Andaman & Nicobar Islands,95.79,98.685,100.0,50.0,100.0,100.0,0.0,0.0,100.0,0.0,97.26
2,Andhra Pradesh,55.47,69.545,89.025,60.225,78.72,83.835,69.31,69.955,72.855,79.99,61.11


In [21]:
# Split the averaged boys'-toilet table into one feature table per tier (1-4).
primary_boys = CreateDataSets(mean_boys_per_state, 1)
upp_boys = CreateDataSets(mean_boys_per_state, 2)
sec_boys = CreateDataSets(mean_boys_per_state, 3)
higher_boys = CreateDataSets(mean_boys_per_state, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [22]:
# This table has only 12 columns, so CreateDataSets padded tier 4 with an
# all-ones placeholder labelled 0; discard it.
higher_boys.drop(columns=[0], inplace=True)

In [23]:
higher_boys.head(2)

Unnamed: 0_level_0,boys_mean_HrSec_Only,boys_mean_All Schools
State_UT,Unnamed: 1_level_1,Unnamed: 2_level_1
All India,81.665,87.59
Andaman & Nicobar Islands,0.0,97.26


In [24]:
# Same per-tier split for the evaluation-year boys'-toilet table.
eval_primary_boys = CreateDataSets(test_boys, 1)
eval_upp_boys = CreateDataSets(test_boys, 2)
eval_sec_boys = CreateDataSets(test_boys, 3)
eval_higher_boys = CreateDataSets(test_boys, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [25]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
eval_higher_boys.drop(columns=[0], inplace=True)

In [26]:
# Training features: per-state mean of the girls'-toilet figures over the
# default years (2013-14 and 2014-15).
mean_girls_per_state = PreprocessFiles(girls, girls.columns[2:])

In [27]:
# Evaluation features: girls'-toilet figures for 2015-16 only.
test_girls = PreprocessFiles(girls, girls.columns[2:], year_filter=['2015-16'])

In [28]:
mean_girls_per_state.head(3)

Unnamed: 0,State_UT,girls_mean_Primary_Only,girls_mean_Primary_with_U_Primary,girls_mean_Primary_with_U_Primary_Sec_HrSec,girls_mean_U_Primary_Only,girls_mean_U_Primary_With_Sec_HrSec,girls_mean_Primary_with_U_Primary_Sec,girls_mean_U_Primary_With_Sec,girls_mean_Sec_Only,girls_mean_Sec_with_HrSec.,girls_mean_HrSec_Only,girls_mean_All Schools
0,All India,89.945,96.45,99.145,91.39,98.74,97.76,95.505,94.66,96.205,83.54,92.155
1,Andaman & Nicobar Islands,94.87,98.685,100.0,50.0,100.0,100.0,0.0,0.0,100.0,0.0,96.72
2,Andhra Pradesh,86.975,94.135,98.6,90.91,84.06,95.125,95.56,90.98,85.715,85.1,89.69


In [29]:
# Split the averaged girls'-toilet table into one feature table per tier (1-4).
primary_girls = CreateDataSets(mean_girls_per_state, 1)
upp_girls = CreateDataSets(mean_girls_per_state, 2)
sec_girls = CreateDataSets(mean_girls_per_state, 3)
higher_girls = CreateDataSets(mean_girls_per_state, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [30]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
higher_girls.drop(columns=[0], inplace=True)

In [31]:
higher_girls.head(3)

Unnamed: 0_level_0,girls_mean_HrSec_Only,girls_mean_All Schools
State_UT,Unnamed: 1_level_1,Unnamed: 2_level_1
All India,83.54,92.155
Andaman & Nicobar Islands,0.0,96.72
Andhra Pradesh,85.1,89.69


In [32]:
# Same per-tier split for the evaluation-year girls'-toilet table.
eval_primary_girls = CreateDataSets(test_girls, 1)
eval_upp_girls = CreateDataSets(test_girls, 2)
eval_sec_girls = CreateDataSets(test_girls, 3)
eval_higher_girls = CreateDataSets(test_girls, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [33]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
eval_higher_girls.drop(columns=[0], inplace=True)

In [34]:
water.head(3)

Unnamed: 0,State/UT,Year,Primary_Only,Primary_with_U_Primary,Primary_with_U_Primary_Sec_HrSec,U_Primary_Only,U_Primary_With_Sec_HrSec,Primary_with_U_Primary_Sec,U_Primary_With_Sec,Sec_Only,Sec_with_HrSec.,HrSec_Only,All Schools
0,All India,2013-14,94.09,98.18,99.1,93.9,99.2,98.64,97.38,96.52,98.18,97.14,95.4
1,All India,2014-15,94.88,98.49,99.59,94.82,99.38,98.83,98.49,96.32,98.71,98.23,96.12
2,All India,2015-16,95.78,98.69,99.39,95.65,99.59,99.21,98.91,97.05,99.18,97.92,96.81


In [35]:
# Training features: per-state mean water-facility figures over the default
# years. The water table names its key columns 'State/UT' and 'Year', so both
# names are passed explicitly.
mean_water_facilities = PreprocessFiles(water, water.columns[2:], city ='State/UT', year = 'Year')

In [36]:
# Evaluation features: water-facility figures for 2015-16 only.
test_water_fac = PreprocessFiles(water, water.columns[2:],city ='State/UT', year = 'Year', year_filter=['2015-16'])
# CreateDataSets indexes its result by a column named 'State_UT', so rename the key.
test_water_fac.rename(columns={'State/UT':'State_UT'}, inplace=True)

In [37]:
mean_water_facilities.head(3)

Unnamed: 0,State/UT,water_mean_Primary_Only,water_mean_Primary_with_U_Primary,water_mean_Primary_with_U_Primary_Sec_HrSec,water_mean_U_Primary_Only,water_mean_U_Primary_With_Sec_HrSec,water_mean_Primary_with_U_Primary_Sec,water_mean_U_Primary_With_Sec,water_mean_Sec_Only,water_mean_Sec_with_HrSec.,water_mean_HrSec_Only,water_mean_All Schools
0,All India,94.485,98.335,99.345,94.36,99.29,98.735,97.935,96.42,98.445,97.685,95.76
1,Andaman & Nicobar Islands,98.86,98.725,100.0,50.0,100.0,100.0,0.0,0.0,100.0,0.0,99.105
2,Andhra Pradesh,89.37,95.3,99.83,95.455,88.63,99.2,96.75,98.75,100.0,97.945,92.045


In [38]:
# CreateDataSets indexes its result by a column named 'State_UT', so rename the key.
mean_water_facilities.rename(columns={'State/UT':'State_UT'}, inplace=True)

In [39]:
# Split the averaged water-facility table into one feature table per tier (1-4).
primary_water = CreateDataSets(mean_water_facilities, 1)
upp_water = CreateDataSets(mean_water_facilities, 2)
sec_water = CreateDataSets(mean_water_facilities, 3)
higher_water = CreateDataSets(mean_water_facilities, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [40]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
higher_water.drop(columns=[0], inplace=True)

In [41]:
higher_water.head(2)

Unnamed: 0_level_0,water_mean_HrSec_Only,water_mean_All Schools
State_UT,Unnamed: 1_level_1,Unnamed: 2_level_1
All India,97.685,95.76
Andaman & Nicobar Islands,0.0,99.105


In [42]:
# Same per-tier split for the evaluation-year water-facility table.
eval_primary_water_fac = CreateDataSets(test_water_fac, 1)
eval_upp_water_fac = CreateDataSets(test_water_fac, 2)
eval_sec_water_fac = CreateDataSets(test_water_fac, 3)
eval_higher_water_fac = CreateDataSets(test_water_fac, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [43]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
eval_higher_water_fac.drop(columns=[0], inplace=True)

In [44]:
elect.head(3)

Unnamed: 0,State_UT,year,Primary_Only,Primary_with_U_Primary,Primary_with_U_Primary_Sec_HrSec,U_Primary_Only,U_Primary_With_Sec_HrSec,Primary_with_U_Primary_Sec,U_Primary_With_Sec,Sec_Only,Sec_with_HrSec.,HrSec_Only,All Schools
0,All India,2013-14,46.38,69.56,94.92,45.97,93.72,83.12,86.84,75.08,82.88,91.01,56.78
1,All India,2014-15,49.63,73.26,95.27,48.33,94.81,84.03,89.14,77.55,83.85,92.67,60.01
2,All India,2015-16,52.4,76.44,94.81,49.86,94.48,88.1,90.82,81.5,86.85,92.86,62.81


In [45]:
# Training features: per-state mean electricity figures over the default years
# (2013-14 and 2014-15).
mean_elect_facilities = PreprocessFiles(elect, elect.columns[2:])

In [46]:
# Evaluation features: electricity figures for 2015-16 only.
test_elect_fac = PreprocessFiles(elect, elect.columns[2:], year_filter=['2015-16'])

In [47]:
mean_elect_facilities.head(3)

Unnamed: 0,State_UT,elect_mean_Primary_Only,elect_mean_Primary_with_U_Primary,elect_mean_Primary_with_U_Primary_Sec_HrSec,elect_mean_U_Primary_Only,elect_mean_U_Primary_With_Sec_HrSec,elect_mean_Primary_with_U_Primary_Sec,elect_mean_U_Primary_With_Sec,elect_mean_Sec_Only,elect_mean_Sec_with_HrSec.,elect_mean_HrSec_Only,elect_mean_All Schools
0,All India,48.005,71.41,95.095,47.15,94.265,83.575,87.99,76.315,83.365,91.84,58.395
1,Andaman & Nicobar Islands,81.57,96.175,100.0,50.0,100.0,100.0,0.0,0.0,100.0,0.0,88.875
2,Andhra Pradesh,89.37,94.135,99.66,100.0,76.795,96.895,96.63,95.525,91.665,92.24,91.55


In [48]:
# Split the averaged electricity table into one feature table per tier (1-4).
primary_elect = CreateDataSets(mean_elect_facilities, 1)
upp_elect = CreateDataSets(mean_elect_facilities, 2)
sec_elect = CreateDataSets(mean_elect_facilities, 3)
higher_elect = CreateDataSets(mean_elect_facilities, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [49]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
higher_elect.drop(columns=[0], inplace=True)

In [50]:
upp_elect.head(2)

Unnamed: 0_level_0,elect_mean_U_Primary_Only,elect_mean_U_Primary_With_Sec_HrSec,elect_mean_Primary_with_U_Primary_Sec
State_UT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All India,47.15,94.265,83.575
Andaman & Nicobar Islands,50.0,100.0,100.0


In [51]:
# Same per-tier split for the evaluation-year electricity table.
eval_primary_elect_fac = CreateDataSets(test_elect_fac, 1)
eval_upp_elect_fac = CreateDataSets(test_elect_fac, 2)
eval_sec_elect_fac = CreateDataSets(test_elect_fac, 3)
eval_higher_elect_fac = CreateDataSets(test_elect_fac, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [52]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
eval_higher_elect_fac.drop(columns=[0], inplace=True)

In [53]:
comp.head(3)

Unnamed: 0,State_UT,year,Primary_Only,Primary_with_U_Primary,Primary_with_U_Primary_Sec_HrSec,U_Primary_Only,U_Primary_With_Sec_HrSec,Primary_with_U_Primary_Sec,U_Primary_With_Sec,Sec_Only,Sec_with_HrSec.,HrSec_Only,All Schools
0,All India,2013-14,9.25,42.14,82.67,18.22,76.5,64.69,59.25,44.8,39.31,26.65,24.08
1,All India,2014-15,10.32,44.39,81.96,19.4,79.86,62.66,68.11,51.27,50.26,47.96,26.42
2,All India,2015-16,10.36,45.46,79.06,19.78,82.04,68.67,68.4,53.05,54.05,55.72,27.31


In [54]:
# Training features: per-state mean computer-availability figures over the
# default years (2013-14 and 2014-15).
mean_comp_facilities = PreprocessFiles(comp, comp.columns[2:])

In [55]:
# Evaluation features: computer-availability figures for 2015-16 only.
test_comp_fac = PreprocessFiles(comp, comp.columns[2:], year_filter=['2015-16'])

In [56]:
mean_comp_facilities.head(3)

Unnamed: 0,State_UT,comp_mean_Primary_Only,comp_mean_Primary_with_U_Primary,comp_mean_Primary_with_U_Primary_Sec_HrSec,comp_mean_U_Primary_Only,comp_mean_U_Primary_With_Sec_HrSec,comp_mean_Primary_with_U_Primary_Sec,comp_mean_U_Primary_With_Sec,comp_mean_Sec_Only,comp_mean_Sec_with_HrSec.,comp_mean_HrSec_Only,comp_mean_All Schools
0,All India,9.785,43.265,82.315,18.81,78.18,63.675,63.68,48.035,44.785,37.305,25.25
1,Andaman & Nicobar Islands,30.67,75.11,90.925,50.0,97.37,98.96,0.0,0.0,100.0,0.0,55.155
2,Andhra Pradesh,11.51,43.435,87.725,47.725,39.645,68.28,74.91,65.485,50.0,30.46,28.815


In [57]:
# Split the averaged computer-availability table into one feature table per tier (1-4).
primary_comp = CreateDataSets(mean_comp_facilities, 1)
upp_comp = CreateDataSets(mean_comp_facilities, 2)
sec_comp = CreateDataSets(mean_comp_facilities, 3)
higher_comp = CreateDataSets(mean_comp_facilities, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [58]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
higher_comp.drop(columns=[0], inplace=True)

In [59]:
primary_comp.head(2)

Unnamed: 0_level_0,comp_mean_Primary_Only,comp_mean_Primary_with_U_Primary,comp_mean_Primary_with_U_Primary_Sec_HrSec
State_UT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All India,9.785,43.265,82.315
Andaman & Nicobar Islands,30.67,75.11,90.925


In [60]:
# Same per-tier split for the evaluation-year computer-availability table.
eval_primary_comp_fac = CreateDataSets(test_comp_fac, 1)
eval_upp_comp_fac = CreateDataSets(test_comp_fac, 2)
eval_sec_comp_fac = CreateDataSets(test_comp_fac, 3)
eval_higher_comp_fac = CreateDataSets(test_comp_fac, 4)

Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features
Create Dataset with 12 features


In [61]:
# Discard the all-ones placeholder (label 0) that CreateDataSets added for tier 4.
eval_higher_comp_fac.drop(columns=[0], inplace=True)

In [62]:
def GenerateTrain(df_list, indexes=None):
    """Concatenate feature tables column-wise into one frame.

    Args:
        df_list: sequence of DataFrames to combine.
        indexes: optional positions into ``df_list`` selecting which tables
            to use, in the given order. ``None`` or an empty sequence selects
            every table in its original order.

    Returns:
        DataFrame produced by ``pd.concat(..., axis=1)``; rows are aligned on
        the tables' indexes.

    Raises:
        ValueError: raised by ``pd.concat`` when there is nothing to
            concatenate.
    """
    if indexes is None or len(indexes) == 0:
        selected = list(df_list)
    else:
        selected = [df_list[position] for position in indexes]
    return pd.concat(selected, axis=1)

In [63]:
# Primary-tier training matrix: the six per-source feature tables joined
# column-wise, aligned on their State_UT index.
pry_useful = [primary_comp,primary_elect, primary_drop_out, primary_boys,primary_girls, primary_water]
pry_df = GenerateTrain(pry_useful)

In [64]:
# Upper-primary-tier training matrix, built the same way.
upp_useful = [upp_comp,upp_elect, upp_drop_out, upp_boys,upp_girls, upp_water]
upp_df = GenerateTrain(upp_useful)

In [65]:
# Secondary-tier training matrix, built the same way.
sec_useful = [sec_comp,sec_elect, sec_drop_out, sec_boys,sec_girls, sec_water]
sec_df = GenerateTrain(sec_useful)

In [66]:
# Higher-tier training matrix, built the same way.
high_useful = [higher_comp,higher_elect, higher_drop_out, higher_boys,higher_girls, higher_water]
high_df = GenerateTrain(high_useful)

In [67]:
# Primary-tier evaluation matrix: same six sources and order as the training
# matrix, but built from the evaluation-year tables.
eval_pry_useful = [eval_primary_comp_fac,eval_primary_elect_fac,eval_primary_drop_out, eval_primary_boys, eval_primary_girls,eval_primary_water_fac]
eval_pry_df = GenerateTrain(eval_pry_useful)

In [68]:
# Upper-primary-tier evaluation matrix, built the same way.
eval_upp_useful = [eval_upp_comp_fac,eval_upp_elect_fac,eval_upp_drop_out, eval_upp_boys, eval_upp_girls,eval_upp_water_fac]
eval_upp_df = GenerateTrain(eval_upp_useful)

In [69]:
# Secondary-tier evaluation matrix, built the same way.
eval_sec_useful = [eval_sec_comp_fac,eval_sec_elect_fac,eval_sec_drop_out, eval_sec_boys, eval_sec_girls,eval_sec_water_fac]
eval_sec_df = GenerateTrain(eval_sec_useful)

In [70]:
# Higher-tier evaluation matrix, built the same way.
eval_high_useful = [eval_higher_comp_fac,eval_higher_elect_fac,eval_higher_drop_out, eval_higher_boys, eval_higher_girls,eval_higher_water_fac]
eval_high_df = GenerateTrain(eval_high_useful)