In [None]:
import os
import sys

import pandas as pd
import numpy as np

In [None]:
# Project root, expressed relative to the notebook's working directory.
project_home = os.path.join("../")
# Only register the project root once so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if project_home not in sys.path:
    sys.path.append(project_home)
# Location of the merged, preprocessed CSV read in the next cell.
data_path = os.path.join(project_home, "data/merged_preprocessed.csv")

In [None]:
# Load the preprocessed table; the file is decoded as latin1.
df = pd.read_csv(
    filepath_or_buffer=data_path,
    encoding="latin1",
)

In [None]:
# Columns inspected in the next cell.
new_cols = [
    "state",
    "city",
    "speciality",
    "girl_exclusive",
    "student_hostel_available",
]

In [None]:
# Display the columns listed in new_cols.
df.loc[:, new_cols]

In [None]:
# Frequency of each distinct value in the "type" column.
df.loc[:, "type"].value_counts()

In [None]:
# Number of distinct values in the "discipline_group" column.
df.loc[:, "discipline_group"].nunique()

# Preprocessing and EDA

In [None]:
# Name of the column the classification target is derived from.
Y_col = "muslim_minority_total_persons"

# Input columns used to build the feature matrix X.
features_for_classification = [
    "levell",
    "programme",
    "discipline_group",
    "type",
    "year",
    "total_general_total",
    "pwd_total_persons",
    "other_minority_total_persons",
    "state",
    "speciality",
    "girl_exclusive",
    "student_hostel_available",
]

In [None]:
# Independent copy of the feature columns, so later edits to X leave df untouched.
X = df.loc[:, features_for_classification].copy()

In [None]:
# Independent copy of the raw target column.
Y = df.loc[:, Y_col].copy()

In [None]:
# Boolean target: True where the raw target value is strictly greater than 0.05.
y = Y.gt(0.05)

In [None]:
# Class balance of the boolean target (count of True vs. False).
y.value_counts()

In [None]:
# Column labels of the feature frame before any encoding is applied.
X.columns

In [None]:
import category_encoders as ce

In [None]:
# Binary-encode the listed columns of X.
# NOTE(review): despite its name, ce_ord holds a BinaryEncoder, not an ordinal one.
# NOTE(review): this cell overwrites X, so it is presumably not safe to run twice
# without re-creating X first — confirm.
binary_encoded_cols = ['levell', 'programme', 'discipline_group', 'type', 'state', 'speciality']
ce_ord = ce.BinaryEncoder(cols=binary_encoded_cols)
X = ce_ord.fit_transform(X)

In [None]:
# Collapse the hostel flag to 1/0 according to each value's Python truthiness.
# NOTE(review): truthiness maps NaN and any non-empty string to 1 — confirm the
# column's dtype makes that the intended result.
X["student_hostel_available"] = X["student_hostel_available"].map(
    lambda flag: int(bool(flag))
)

## PCA and plotting (exploratory; ignore for now):

In [None]:
from sklearn.decomposition import PCA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Project the feature matrix onto its first three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(X.to_numpy())

In [None]:
# One column per principal component, plus the boolean target for colouring.
plot_df = pd.DataFrame(
    pca_result[:, :3],
    columns=["pca-one", "pca-two", "pca-three"],
)
plot_df["y"] = y

In [None]:
# Random row positions used to subsample the plots: the first 10000 entries
# of a permutation of all row indices.
rndperm = np.random.permutation(len(X))[:10000]

In [None]:
# 2-D scatter of the sampled rows on the first two principal components,
# coloured by the boolean target.
pca_sample_2d = plot_df.loc[rndperm, :]
two_class_palette = sns.color_palette("hls", 2)
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=pca_sample_2d,
    x="pca-one",
    y="pca-two",
    hue="y",
    palette=two_class_palette,
    legend="full",
    alpha=0.3,
)

In [None]:
# 3-D scatter of the sampled rows on the first three principal components,
# coloured by the boolean target.
# Take the sample once instead of re-slicing plot_df for every argument.
pca_sample = plot_df.loc[rndperm, :]

fig = plt.figure(figsize=(16, 10))
# Passing keyword arguments to Figure.gca() was deprecated in Matplotlib 3.4
# and is rejected by later releases; add_subplot is the supported way to
# request a 3-D projection.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    xs=pca_sample["pca-one"],
    ys=pca_sample["pca-two"],
    zs=pca_sample["pca-three"],
    c=pca_sample["y"]
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

# Classification 

In [None]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers

In [None]:
# Shape of the feature frame after encoding.
X.shape

In [None]:
# Replace the feature DataFrame with its underlying NumPy array.
# NOTE(review): X is rebound to an ndarray here, so re-running this cell
# without re-creating X will fail.
X = X.to_numpy()

In [None]:
# Convert the boolean target Series to a NumPy array of integers (True -> 1, False -> 0).
y = y.astype(int).to_numpy()

In [None]:
# Turn the label vector into a single-column 2-D array. Using -1 lets NumPy
# infer the row count, so the cell keeps working when the dataset size
# differs from the previously hard-coded 375197 rows.
y = y.reshape(-1, 1)

In [None]:
# Shapes of the feature and label arrays, displayed as a pair.
tuple(arr.shape for arr in (X, y))

In [None]:
def split_data(X, Y, test_size=20000, dev_size=20000, seed=None):
    """Randomly partition paired arrays into train, dev and test subsets.

    Rows are shuffled with a random permutation. The first ``test_size``
    shuffled rows form the test set, the next ``dev_size`` rows form the dev
    set, and every remaining row goes to the training set. The three subsets
    are disjoint and together contain every row exactly once.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature array; indexed as ``X[rows, :]``.
    Y : numpy.ndarray
        2-D label array with the same number of rows as ``X``.
    test_size : int, default 20000
        Number of rows placed in the test subset.
    dev_size : int, default 20000
        Number of rows placed in the dev subset.
    seed : int or None, default None
        When given, the shuffle is drawn from ``np.random.default_rng(seed)``
        so the split is reproducible. When ``None``, NumPy's global random
        state is used.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    n_rows = X.shape[0]
    if Y.shape[0] != n_rows:
        raise ValueError(
            "X and Y must have the same number of rows, got {} and {}".format(
                n_rows, Y.shape[0]
            )
        )

    if seed is None:
        shuffle = np.random.permutation(n_rows)
    else:
        shuffle = np.random.default_rng(seed).permutation(n_rows)

    dev_end = test_size + dev_size
    test_idx = shuffle[:test_size]
    # The dev slice begins exactly where the test slice ends; starting one
    # position later would silently drop a row from every split.
    dev_idx = shuffle[test_size:dev_end]
    train_idx = shuffle[dev_end:]

    X_test, Y_test = X[test_idx, :], Y[test_idx, :]
    X_dev, Y_dev = X[dev_idx, :], Y[dev_idx, :]
    X_train, Y_train = X[train_idx, :], Y[train_idx, :]

    return X_train, Y_train, X_dev, Y_dev, X_test, Y_test

In [None]:
# Shuffle and partition the full arrays into train / dev / test subsets.
(
    X_train, Y_train,
    X_dev, Y_dev,
    X_test, Y_test,
) = split_data(X, y)

In [None]:
# Shapes of the training features and labels, displayed as a pair.
tuple(arr.shape for arr in (X_train, Y_train))

## Logistic regression:

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression baseline (C=0.1). The labels are flattened from a
# single column to a 1-D vector before fitting.
logistic_model = LogisticRegression(C=0.1, max_iter=1000000, verbose=1)
logistic_model = logistic_model.fit(
    X_train,
    Y_train.reshape(Y_train.shape[0]),
)

In [None]:
# Score of the fitted logistic model on the training split.
logistic_model.score(X=X_train, y=Y_train)

In [None]:
# Dev accuracy: fraction of dev rows whose predicted label equals the true label.
preds = logistic_model.predict(X_dev)
count = sum(1 for idx, pred in enumerate(preds) if pred == Y_dev[idx])
print("Dev accuracy = {}".format(count / len(preds)))

In [None]:
# Test accuracy: fraction of test rows whose predicted label equals the true label.
preds = logistic_model.predict(X_test)
count = sum(1 for idx, pred in enumerate(preds) if pred == Y_test[idx])
print("Test accuracy = {}".format(count / len(preds)))

## Neural Network approach:

In [None]:
# Hyperparameters for the neural-network experiment below.
epochs = 3              # passes over the training dataset
batch_size = 32         # rows per mini-batch
learning_rate = 0.0003  # Adam step size
l2_lambda = 0.01        # L2 penalty on the hidden layer's kernel
dropout_rate = 0.4      # referenced only by the commented-out Dropout layers

In [None]:
def create_dataset(x, y, batch_size=16):
    """Build a shuffled, batched ``tf.data.Dataset`` from feature/label arrays.

    Both arrays are cast to float32, paired row by row, shuffled with a
    buffer as large as the number of label rows, and grouped into batches of
    ``batch_size``.
    """
    features = x.astype(np.float32)
    labels = y.astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(labels.shape[0])
    return dataset.batch(batch_size)

In [None]:
# Wrap the train and dev splits as batched tf.data pipelines.
train_dataset = create_dataset(x=X_train, y=Y_train, batch_size=batch_size)
dev_dataset = create_dataset(x=X_dev, y=Y_dev, batch_size=batch_size)

In [None]:
# Small classifier: one L2-regularised ReLU hidden layer followed by a
# two-way softmax output.
model = keras.Sequential([
    # Take the input width from the training matrix rather than hard-coding
    # 44, so the model keeps matching the data if the encoded feature count
    # changes.
    keras.layers.Dense(
        units=32,
        activation='relu',
        input_dim=X_train.shape[1],
        kernel_regularizer=regularizers.l2(l2_lambda),
    ),
#     keras.layers.Dropout(rate=dropout_rate)
#     keras.layers.Dense(units=8, activation='relu', kernel_regularizer=regularizers.l2(l2_lambda)),
#     keras.layers.Dropout(rate=dropout_rate),
    keras.layers.Dense(units=2, activation='softmax')
])

In [None]:
# The labels fed to the model are a single column of 0/1 class indices, not
# one-hot vectors. 'categorical_crossentropy' expects one-hot targets, so with
# the two-unit softmax output it would compute the wrong loss and a
# meaningless accuracy. The sparse variant takes integer class indices directly.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train on the training pipeline and evaluate on the dev pipeline after each epoch.
history = model.fit(
    x=train_dataset,
    validation_data=dev_dataset,
    epochs=epochs,
)

In [None]:
# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Raw per-epoch values behind the plots above (keys include 'accuracy',
# 'val_accuracy', 'loss' and 'val_loss').
history.history