# School Data Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw dataset; 'data.csv' is resolved relative to the current working directory.
data = pd.read_csv('data.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
data

In [None]:
# DataFrame.shape is a (rows, columns) tuple.
print('Number of rows and columns: ', data.shape)

In [None]:
# Show every column name of the loaded DataFrame as a plain list.
# The previous end=',' swapped the trailing newline for a stray comma,
# which glued this output to whatever was printed next.
print('columns are: ', list(data.columns))

In [None]:
# Print the dtype pandas assigned to each column.
print('data types are: \n', data.dtypes)

In [None]:
# Preview the first rows (head() returns 5 rows when no count is given).
data.head()

In [None]:
# Preview the last rows (tail() returns 5 rows when no count is given).
data.tail()

### Details of dataset


 * **province_id**	id of the school province
 
 * **province_name**	province name
 
 * **city_id**	id of the school city (can also be kabupaten in Indonesia)
 
 * **city_name**	city (kabupaten) name
 
 * **district_id**	id of the school district (kecamatan in Indonesia)
 
 * **district_name**	district (kecamatan) name
 
 * **npsn**	unique school id
 
 * **school_name**	name of the school. 
     - The prefix (e.g. SD, SMP, SMA, SMK) is described under the **stage** feature
     
  * **stage**	Educational stage of the school. 
     - SD = Elementary School, 
     - SMP = Junior High School, 
     - SMA/SMK = Senior High School. 
         SMA and SMK differ in that one of them (SMK) focuses mainly on developing students' practical skills.

 * **status**	2 main divisions based on who developed the school.
    
    - N stands for "Negeri" (public school), 
    - S stands for "Swasta" (private school)

 * **street_name**	street where the school is located
 
 * **lat**	geolocation latitude coordinate of the school. Range from -11 to 6
 
 * **long**	geolocation longitude coordinate of the school. Range from 95 to 141

## description of the data

In [None]:
# Summary statistics; by default describe() covers only the numeric columns.
data.describe()

## Renaming column 

In [None]:
# Give the 'npsn' column the more descriptive name 'school_id';
# rename() returns a new DataFrame, so the result is re-bound to `data`.
data = data.rename(columns={'npsn':'school_id'})
# Bare expression so the notebook renders the renamed DataFrame.
data

## check whether dataset contains null values or not

In [None]:
# Number of missing values in each column.
data.isnull().sum()

## check whether dataset contains duplicate values or not

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()  # NOTE(review): the original note says this was 0 - re-check whenever data.csv changes

## Drop duplicate rows and all null values from the dataset

In [None]:
# Discard rows that exactly repeat an earlier row and re-bind the result.
data = data.drop_duplicates()

In [None]:
# Discard every row that has at least one missing value and re-bind the result.
data = data.dropna()

In [None]:
# Re-count missing values per column after dropna().
data.isnull().sum()

In [None]:
# (rows, columns) after removing duplicates and nulls.
data.shape

In [None]:
# Column dtypes and non-null counts before the integer conversion below.
data.info()

In [None]:
# Convert the three identifier columns to 64-bit integers in a single pass.
# astype() with a column -> dtype mapping leaves all other columns as they are.
data = data.astype({
    'school_id': 'int64',
    'district_id': 'int64',
    'city_id': 'int64',
})

In [None]:
# (rows, columns) after the dtype conversion.
data.shape

In [None]:
# Confirm the identifier columns now report int64.
data.info()

## Save the cleaned CSV file

In [None]:
# Persist the cleaned dataset.
# index=False keeps the DataFrame index out of the file; without it every later
# read_csv() of cleaned.csv gains a spurious 'Unnamed: 0' column.
# NOTE(review): the absolute, machine-specific path only works on the author's PC.
data.to_csv(r"C:\Users\SHASHANK K\PycharmProjects\Project Files\School Data Analysis\cleaned.csv", index=False)

# Data Visualization

## read the cleaned csv

In [None]:
# Reload the cleaned dataset written by the cleaning section above.
# NOTE(review): absolute, machine-specific path - the file must exist at exactly this location.
data = pd.read_csv(r"C:\Users\SHASHANK K\PycharmProjects\Project Files\School Data Analysis\cleaned.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## check data empty values through colormap

In [None]:
# Heatmap of the boolean null mask (one cell per value, True where the value is missing),
# drawn with the 'coolwarm' colormap.
sns.heatmap(data.isnull(), cmap='coolwarm')

        The map conveys that there are 0 null values in the dataset via grey color (0th color in color index)

# Graph Plotting 

## Number of schools in each province

In [None]:
# Bar chart of how many of the first 4000 rows fall in each province.
ax = sns.countplot(data=data.iloc[:4000], x='province_name')
ax.set_title('Number of schools in each province')

# Write the count above every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group)

plt.show()

## Number of schools in each city

In [None]:
# Bar chart of how many of the first 200 rows fall in each city.
bx = sns.countplot(x = 'city_name', data = data[:200])

bx.set_title("Number of schools in each city")
# City names run along the x-axis and counts along the y-axis; the labels
# used to be assigned the other way round.
bx.set_xlabel('City name')
bx.set_ylabel('Count')

for bars in bx.containers: #display labels in plot with a numbers
    bx.bar_label(bars)

# Render explicitly, as the other plotting cells do.
plt.show()

## Number of type of schools in each district

In [None]:
plt.title('number of type of schools in each district')

# Count schools per district, split by status (rows 50-99 only).
# A line plot creates no bar containers, so the labelling loop below never ran
# and no counts were displayed; a count plot with hue='status' produces the
# per-district, per-type bars that the title describes.
cx = sns.countplot(data = data[50:100], x = 'district_id', hue = 'status')

# One container per status value; label each bar with its count.
for bars in cx.containers:
    cx.bar_label(bars)

plt.show()

In [None]:
# Create and size the figure BEFORE drawing. Calling plt.figure() after the plot
# opened a second, empty figure and left the actual chart at the default size.
plt.figure(figsize=(8, 3))
plt.title('Number of type of schools in each district')
# NOTE(review): the title repeats the previous chart's, but this plot shows city_id
# aggregated per province_id (first 5000 rows) - confirm the intended title and columns.

dx = sns.barplot(data=data[:5000], x = 'province_id', y = 'city_id') #only numeric values
for bars in dx.containers:
    dx.bar_label(bars)

plt.show()

In [None]:
# Column dtypes and non-null counts of the reloaded dataset.
data.info()

# Machine Learning Analysis

In [None]:
# Bare expression so the notebook renders the DataFrame.
data

## read the cleaned data

In [None]:
# Load the cleaned dataset into `df` for the modelling cells below.
# NOTE(review): absolute, machine-specific path - the file must exist at exactly this location.
df = pd.read_csv(r"C:\Users\SHASHANK K\PycharmProjects\Project Files\School Data Analysis\cleaned.csv")

## split the cleaned dataset for training and testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: the four numeric identifier columns.
# .loc slices by label and INCLUDES the end label, so with the default integer
# index this keeps rows 0..200000 (200001 rows).
x = df.loc[:200000, ['province_id', 'city_id', 'district_id', 'school_id']]
# Target as a 1-D Series. Selecting it with ['status'] produced an (n, 1) DataFrame,
# which makes scikit-learn emit a column-vector warning on every fit().
y = df.loc[:200000, 'status']

In [None]:
# Sanity check: features and target must have the same number of rows.
print(x.shape)
print(y.shape)

In [None]:
# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the split - and therefore every score and saved
# model below - reproducible between notebook runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Print the shape of each split, in the order: train/test features, train/test target.
for split in (xtrain, xtest, ytrain, ytest):
    print(split.shape)

## Machine Learning Models

   ## Regression Models
   
        Linear regression
        Polynomial regression
        ridge regression
        lasso regression
        Elastic regression

   ## classification Models
   
        logistic regression
        decision tree
        random forest
        SVM
        KNN
        naive bayes
        
   ## Regression models are used when the target is continuous (numeric values, int or float)
   ## Classification models are used when the target is a discrete class label (integer codes or strings)
        
 * This problem **classifies** whether a school is Negeri (public) or Swasta (private). Hence, **classification** models are used.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

import joblib #to save trained model in low size.

### Decision tree

In [None]:
# Build and train the decision tree in one step (fit() returns the estimator).
model1 = DecisionTreeClassifier().fit(xtrain, ytrain)

# Predictions on the held-out split; ypred1 is reused by the metric cells further down.
ypred1 = model1.predict(xtest)

print(f"Decision tree scores: {round(accuracy_score(ytest, ypred1)*100, 2)}% Accuracy")

# Save the fitted model to the current working directory.
joblib.dump(model1, 'Decision Tree Model.pkl')

In [None]:
# Build and train a 200-tree random forest in one step (fit() returns the estimator).
model2 = RandomForestClassifier(n_estimators=200).fit(xtrain, ytrain)

# Predictions on the held-out split.
ypred2 = model2.predict(xtest)

print(f"Random Forest Model scores: {round(accuracy_score(ytest, ypred2)*100, 2)}% Accuracy")

# Save the fitted model to the current working directory.
joblib.dump(model2, 'Random Forest Model.pkl')

In [None]:
# Build and train a support-vector classifier with default settings (fit() returns the estimator).
model3 = SVC().fit(xtrain, ytrain)

# Predictions on the held-out split.
ypred3 = model3.predict(xtest)

print(f"SVM scores: {round(accuracy_score(ytest, ypred3)*100, 2)}% accuracy")

# Save the fitted model to the current working directory.
joblib.dump(model3, 'SVM Model.pkl')

In [None]:
# Build and train a logistic-regression classifier with default settings (fit() returns the estimator).
model4 = LogisticRegression().fit(xtrain, ytrain)

# Predictions on the held-out split.
ypred4 = model4.predict(xtest)

print(f"Logistic Regression Model scores: {round(accuracy_score(ytest, ypred4)*100, 2)}% accuracy")

# Save the fitted model to the current working directory.
joblib.dump(model4, 'Logistic Regression Model.pkl')

In [None]:
# Build and train a 10-nearest-neighbours classifier (fit() returns the estimator).
model5 = KNeighborsClassifier(n_neighbors=10).fit(xtrain, ytrain)

# Predictions on the held-out split.
ypred5 = model5.predict(xtest)

print(f"KNN Model scores: {round(accuracy_score(ytest, ypred5)*100, 2)}% accuracy")

# Save the fitted model to the current working directory.
joblib.dump(model5, 'KNN Model.pkl')

In [None]:
# Gaussian Naive Bayes classifier with default settings.
model6 = GaussianNB()

model6.fit(xtrain, ytrain)

# Predictions on the held-out split.
ypred6 = model6.predict(xtest)

# The message used to say "KNN Model scores" (copied from the previous cell),
# which attributed this accuracy to the wrong model.
print(f"Naive Bayes Model scores: {round(accuracy_score(ytest, ypred6)*100, 2)}% accuracy")

# Save the fitted model to the current working directory.
joblib.dump(model6, 'Naive Bayesian Model.pkl')

# Prediction

In [None]:
# Load the random forest from the same relative path the training cell saves it to
# ('Random Forest Model.pkl' in the working directory). The previous absolute path
# pointed into an 'ML Models' folder that this notebook never writes, so it could
# load a stale copy or fail outright.
# NOTE(review): joblib files are pickles - only load files you created yourself.
model = joblib.load('Random Forest Model.pkl')

In [None]:
# One hand-written school, given in the same column order used for training.
# Wrapping it in a DataFrame with those column names avoids scikit-learn's
# "X does not have valid feature names" warning for models fitted on a DataFrame.
sample = pd.DataFrame(
    [[12546, 54859, 25648, 21564789]],
    columns=['province_id', 'city_id', 'district_id', 'school_id'],
)
prediction  = model.predict(sample)
# predict() returns one label per input row; show the single result.
print(prediction[0])

# Error Analysis

The following evaluation metrics are computed:

    1) r_square error
    
    2) Precision
    
    3) recall
    
    4) F-Score
    
    5) confusion matrix
    
    6) classification report
    
    7) mean square error
    
    8) accuracy score

In [None]:
# 1. R-squared
# r2_score is a regression metric and needs numeric input, so class labels are
# first mapped to integer codes (their position in the sorted label set).
# Labels that are already 0/1 map to themselves, so the result is unchanged for them.
_true = np.ravel(ytest)
_pred = np.ravel(ypred1)
_label_set = np.unique(np.concatenate([_true, _pred]))
r_squared = r2_score(np.searchsorted(_label_set, _true), np.searchsorted(_label_set, _pred))
print(f"R-squared Error: {r_squared:.2f}") 

In [None]:
# 2. Precision
# precision_score defaults to pos_label=1 and raises when 1 is not one of the labels
# (e.g. string labels). Keep 1 when it exists; otherwise treat the last label in
# sorted order as the positive class.
_labels = sorted(set(np.ravel(ytest).tolist()))
_pos_label = 1 if 1 in _labels else _labels[-1]
precision = precision_score(ytest, ypred1, pos_label=_pos_label)
print(f"Precision: {precision:.2f}")

In [None]:
# 3. Recall
# recall_score defaults to pos_label=1 and raises when 1 is not one of the labels
# (e.g. string labels). Keep 1 when it exists; otherwise treat the last label in
# sorted order as the positive class.
_labels = sorted(set(np.ravel(ytest).tolist()))
_pos_label = 1 if 1 in _labels else _labels[-1]
recall = recall_score(ytest, ypred1, pos_label=_pos_label)
print(f"Recall: {recall:.2f}")

In [None]:
# 4. F-Score (F1-Score)
# f1_score defaults to pos_label=1 and raises when 1 is not one of the labels
# (e.g. string labels). Keep 1 when it exists; otherwise treat the last label in
# sorted order as the positive class.
_labels = sorted(set(np.ravel(ytest).tolist()))
_pos_label = 1 if 1 in _labels else _labels[-1]
f_score = f1_score(ytest, ypred1, pos_label=_pos_label)
print(f"F-Score (F1-Score): {f_score:.2f}")

In [None]:
# 5. Confusion Matrix
# Rows are the true labels and columns the predicted labels, for the decision tree (ypred1).
conf_matrix = confusion_matrix(ytest, ypred1)
print(f"Confusion Matrix:\n{conf_matrix}")

In [None]:
# 6. Classification Report
# Text summary of per-class precision, recall, F1 and support for the decision tree (ypred1).
class_report = classification_report(ytest, ypred1)
print(f"Classification Report:\n{class_report}")

In [None]:
# 7. Mean Squared Error
# mean_squared_error is a regression metric and needs numeric input, so class labels
# are first mapped to integer codes (their position in the sorted label set).
# Labels that are already 0/1 map to themselves; for two classes the value equals
# the fraction of misclassified rows.
_true = np.ravel(ytest)
_pred = np.ravel(ypred1)
_label_set = np.unique(np.concatenate([_true, _pred]))
mse = mean_squared_error(np.searchsorted(_label_set, _true), np.searchsorted(_label_set, _pred))
print(f"Mean Squared Error: {mse:.2f}")

In [None]:
# 8. Accuracy Score
# Fraction of held-out rows whose decision-tree prediction (ypred1) matches the true label.
accuracy = accuracy_score(ytest, ypred1)
print(f"Accuracy Score: {accuracy:.2f}")