# Preprocessing

In [15]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import statistics
from scipy import stats

In [16]:
# Load the admissions dataset from a CSV file located relative to the working directory.
df = pd.read_csv("Admission_Predict.csv")
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
395,396,324,110,3,3.5,3.5,9.04,1,0.82
396,397,325,107,3,3.0,3.5,9.11,1,0.84
397,398,330,116,4,5.0,4.5,9.45,1,0.91
398,399,312,103,3,3.5,4.0,8.78,0,0.67


In [17]:
# Select by position: the first 400 rows, columns 0-7 as features and column 8 as the target.
# NOTE(review): judging by the table displayed above, column 0 looks like the "Serial No." row
# identifier, so it is being used as a feature — confirm this is intended.
X = df.iloc[0:400, 0:8] # Independent features
Y = df.iloc[0:400, 8] # Target variable

## Splitting data into training and testing sets

In [18]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100
)

# Turn each target split into an (N, 1) NumPy column vector.
Y_train = Y_train.to_numpy().reshape(-1, 1)
Y_test = Y_test.to_numpy().reshape(-1, 1)


# Linear Regression

In [19]:
def find_optimal_parameters(x, y):
    """ Closed-form least-squares solution w = (X^T X)^+ (X^T) (y)

    A column of ones is prepended to ``x`` so that the first entry of the
    returned weights is the bias term. The Gram matrix X^T X is inverted
    with the pseudo-inverse.

    Args:
    - x (array-like (Shape: (N, D))): feature matrix
    - y (ndarray (Shape: (N, 1))): target values

    Output:
    - w (ndarray (Shape: (D+1, 1))): bias followed by one weight per feature
    """
    # Prepend the bias column of ones to the features.
    ones_column = np.ones((x.shape[0], 1))
    design = np.hstack((ones_column, x))
    design_t = design.T

    gram_pinv = np.linalg.pinv(design_t @ design)
    return gram_pinv @ (design_t @ y)

### Train linear regression model using training data

In [20]:
def get_pred_Y(trained_w, X_pred):
    """ Return the linear model's predictions for X_pred
    Args:
    - trained_w (ndarray (Shape: (D+1, 1))): fitted weights, bias first
    - X_pred (ndarray (Shape: (N, D))): feature rows to predict on

    Output:
    - pred_Y (ndarray (Shape: (N, 1))): one prediction per row of X_pred
    """
    # Prepend a column of ones so the bias weight is applied to every row.
    bias_column = np.ones((X_pred.shape[0], 1))
    design = np.hstack((bias_column, X_pred))

    # (w^T X^T)^T gives one prediction per input row.
    return (np.transpose(trained_w) @ design.T).T

In [21]:
def get_mae(Y_truth, Y_pred):
    """ Return Mean absolute error
    Args:
    - Y_truth (ndarray (Shape: (N, 1))): ground-truth values
    - Y_pred (ndarray (Shape: (N, 1))): predicted values

    Output:
    - MAE (ndarray (Shape: (1,))), or -1 if the two inputs differ in shape.
    """
    # A shape mismatch is reported with the -1 sentinel rather than an exception.
    if np.shape(Y_truth) != np.shape(Y_pred):
        return -1

    # Convert first so array-likes such as DataFrames are reduced over rows,
    # not iterated by column label as the builtin sum() would do.
    abs_errors = np.abs(np.asarray(Y_truth) - np.asarray(Y_pred))
    # Average over the sample axis only, so an (N, 1) input yields shape (1,).
    return np.mean(abs_errors, axis=0)

def get_mse(Y_truth, Y_pred):
    """ Return Mean squared error
    Args:
    - Y_truth (ndarray (Shape: (N, 1))): ground-truth values
    - Y_pred (ndarray (Shape: (N, 1))): predicted values

    Output:
    - MSE (ndarray (Shape: (1,))), or -1 if the two inputs differ in shape.
    """
    # A shape mismatch is reported with the -1 sentinel rather than an exception.
    if np.shape(Y_truth) != np.shape(Y_pred):
        return -1

    # Convert first so array-likes such as DataFrames are reduced over rows,
    # not iterated by column label as the builtin sum() would do.
    squared_errors = (np.asarray(Y_truth) - np.asarray(Y_pred)) ** 2
    # Average over the sample axis only, so an (N, 1) input yields shape (1,).
    return np.mean(squared_errors, axis=0)

### Get predictions on train data

In [22]:
# Fit the closed-form linear regression on the training split.
w_optimal = find_optimal_parameters(X_train, Y_train)
# Bare last expression so the notebook displays the fitted weights.
w_optimal

array([[-1.24508024e+00],
       [ 1.74603344e-04],
       [ 1.09496275e-03],
       [ 4.24602435e-03],
       [ 9.48357452e-03],
       [-6.36173836e-03],
       [ 2.51184521e-02],
       [ 1.18743394e-01],
       [ 2.36731783e-02]])

In [23]:
# Predict on the same rows the weights were fitted to and report the training error.
pred_Y = get_pred_Y(w_optimal, X_train)
print('train error (MSE): ', get_mse(Y_train, pred_Y))
print('train error (MAE): ', get_mae(Y_train, pred_Y))

train error (MSE):  [0.00376967]
train error (MAE):  [0.0446701]


### Get predictions and performance on test data

In [24]:
# Apply the same weights to the held-out test split; pred_Y is overwritten with the test predictions.
pred_Y    = get_pred_Y(w_optimal, X_test)
print('test error (MSE):: ', get_mse(Y_test, pred_Y))
print('test error (MAE): ', get_mae(Y_test, pred_Y))

test error (MSE)::  [0.00361941]
test error (MAE):  [0.04688106]


# K Means

## Silhouette Coefficient

In [25]:
# Candidate numbers of clusters to compare.
n_silhouette = [2, 3, 4, 5, 6, 7, 8, 9, 10]

# KMeans settings applied to every candidate, so the sweep is run with a
# fixed, explicit configuration instead of library defaults.
kmeans_kwargs= {
    "init":"k-means++",
    "n_init":30,
    "max_iter":250,
    "random_state":2
}


#code obtained from the sklearn site provided
for n_clusters in n_silhouette:
    # Pass the shared settings; previously kmeans_kwargs was defined but never used.
    clusterer = KMeans(n_clusters = n_clusters, **kmeans_kwargs)
    cluster_labels = clusterer.fit_predict(X)
    # Mean silhouette coefficient over all samples for this clustering.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

For n_clusters = 2 The average silhouette_score is : 0.6071096314863135
For n_clusters = 3 The average silhouette_score is : 0.5531574352681752
For n_clusters = 4 The average silhouette_score is : 0.5151391666241323
For n_clusters = 5 The average silhouette_score is : 0.48464943440830055
For n_clusters = 6 The average silhouette_score is : 0.45670717596148136
For n_clusters = 7 The average silhouette_score is : 0.43409307822104387
For n_clusters = 8 The average silhouette_score is : 0.41165177333474423
For n_clusters = 9 The average silhouette_score is : 0.38942632231084867
For n_clusters = 10 The average silhouette_score is : 0.3760333777270851


In [26]:
# Number of clusters found above (greatest silhouette score).
number_cluster = 2

kmeans = KMeans(
    init="k-means++",
    n_clusters = number_cluster,
    n_init=30,
    max_iter=250,
    random_state=2
)

# fit_predict fits the model and returns each training row's cluster label,
# so a separate fit() call beforehand is not needed.
train_cluster = kmeans.fit_predict(X_train)

# Work on a copy: assigning the 'cluster' column to X_train itself would change
# the feature matrix, so re-running this cell would cluster on the label column too.
training_df_clustered = X_train.copy()

# Add each row to a specific cluster
training_df_clustered['cluster'] = train_cluster

# One DataFrame per cluster label, in label order 0 .. number_cluster - 1.
X_train_clusters_df = [
    training_df_clustered[training_df_clustered['cluster'] == i]
    for i in range(number_cluster)
]

# Building Linear Regression for our clusters
## Using sklearn linear regression

In [27]:
from sklearn.linear_model import LinearRegression


# Set the number of clusters based on the silhouette coefficient analysis found above
number_cluster = 2
obj_cluster = []

for i in range(number_cluster):
    cluster_rows = X_train_clusters_df[i]

    #Get the specific X_train values according to their predicted clusters
    #(every column except the 'cluster' label itself).
    X_clustered_data = cluster_rows.drop(columns='cluster').to_numpy(dtype=float)

    #Get the specific Y_train values according to their predicted clusters.
    #The cluster frames keep the original row labels, so the targets are looked
    #up by index label instead of deriving a row position from 'Serial No.'.
    Y_clustered_data = Y.loc[cluster_rows.index].to_numpy().reshape(-1, 1)

    # fit() returns the fitted estimator, which is stored at position i (its cluster label).
    obj_cluster.append(LinearRegression().fit(X_clustered_data, Y_clustered_data))
    

In [28]:
def predict_value(x_test, kmeans, cluster_linear):
    """
    Predict with per-cluster models: every row of x_test is assigned a cluster
    by kmeans and is then scored by the model stored for that cluster.

    Input: 
    x_test is the test value that you wish to predict on (DataFrame or 2-D array, one row per sample).
    kmeans is the kmeans object that you have finalized to predict on the test dataset.
    cluster_linear is the list of fitted models on different clusters, indexed by cluster label.

    Return:
    linear_pred - ndarray with one row of prediction values per row of x_test
    clusters - clusters_pred will be the prediction of clusters using k means.

    Raises:
    ValueError if x_test yields no rows to predict on.
    """
    clusters = kmeans.predict(x_test)
    labels = np.asarray(clusters)
    features = np.asarray(x_test)

    if labels.shape[0] == 0:
        raise ValueError("x_test contains no rows to predict on")

    linear_pred = None
    # Predict one cluster at a time, using the model at that cluster's own index,
    # so any number of clusters is routed correctly.
    for label in np.unique(labels):
        in_cluster = labels == label
        cluster_pred = np.asarray(cluster_linear[int(label)].predict(features[in_cluster]))
        # Normalise 1-D model output to a column so results stack as (N, k).
        cluster_pred = cluster_pred.reshape(cluster_pred.shape[0], -1)

        if linear_pred is None:
            linear_pred = np.empty(
                (features.shape[0], cluster_pred.shape[1]), dtype=cluster_pred.dtype
            )
        # Write the predictions back at the rows' original positions.
        linear_pred[in_cluster] = cluster_pred

    return linear_pred, clusters

## Checking testing error

In [29]:
#Applying the clustering-based linear regression to the test set.
# Returns the predictions together with the cluster label assigned to each test row.
(linear_predicted, clusters_predicted) = predict_value(X_test, kmeans, obj_cluster)

In [30]:
# Score the cluster-based predictions against the held-out targets using the MSE and MAE helpers.
print('test error (MSE):: ', get_mse(Y_test, linear_predicted))
print('test error (MAE): ', get_mae(Y_test, linear_predicted))

test error (MSE)::  [0.0033131]
test error (MAE):  [0.04268426]
