In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#There are 2 major preprocessing steps to be performed before fitting any clustering models -

#1)Removing outliers. Outliers can be very damaging in clustering algorithms like K-Means because even a single outlier can dramatically influence a cluster center. 

#2)Normalizing the data. Again, distance-sensitive algorithms like K-Means treat observations as vectors in a vector space with an implied inner product like Euclidean distance. This means all features need to have equal weights in feature vectors and that means normalizing their scales

In [2]:
df = pd.read_csv('./data/EDACollegeScorecard.csv')

In [18]:
#First, lets remove all observations with values in a column that are more than 3 standard deviations away from that column's mean
#Source: 'https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame'
from scipy import stats

print(len(df))
numeric_df = df.select_dtypes("number") #We may as well drop the INSTURL column, it does not add any meaningful value to the dataset
new_df = numeric_df[(np.abs(stats.zscore(numeric_df)) < 3).all(axis=1)]
print(len(new_df))

7803
2577


In [None]:
#That seems to drop way too much data, more than half. Perhaps we should be more sparing about our standard deviation threshold ...

In [24]:
print(len(df))
numeric_df = df.select_dtypes("number") #We may as well drop the INSTURL column, it does not add any meaningful value to the dataset
new_df = numeric_df[(np.abs(stats.zscore(numeric_df)) < 8).all(axis=1)]
print(len(new_df))

7803
6295


In [None]:
#Around 1600 observations were dropped. It's a decent amount of data, but we still have more than 6000 observations to build a potentially more robust clustering model. If the models dont perform well, we can come back to this step and tweak the SD threshold. 

In [25]:
#Now, we need to normalize the data
from sklearn import preprocessing

x = new_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
normalized_df = pd.DataFrame(x_scaled)
normalized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,213,214,215,216,217,218,219,220,221,222
0,0.000000,0.000000,0.000000e+00,0.000461,0.001188,0.0,1.0,0.00,0.0,0.000000,...,0.711863,0.730729,0.751029,0.753725,0.720400,0.760188,0.745171,0.744139,0.736720,0.759968
1,0.000128,0.000128,5.603398e-07,0.226011,0.582669,0.0,1.0,0.00,0.5,0.000000,...,0.579620,0.659221,0.677674,0.611111,0.583138,0.736207,0.585495,0.672211,0.610722,0.642391
2,0.000256,0.000256,8.923931e-07,0.000489,0.001261,0.0,1.0,0.00,0.0,0.000000,...,0.708883,0.794698,0.797386,0.788247,0.726713,0.825786,0.753531,0.789209,0.764913,0.787138
3,0.000641,0.000641,2.013073e-06,0.000038,0.000097,0.0,1.0,0.00,0.0,0.000000,...,0.324754,0.505528,0.662853,0.501976,0.342870,0.663019,0.388141,0.444554,0.427713,0.425513
4,0.000769,0.000769,3.092246e-06,0.000047,0.000121,0.0,1.0,0.00,0.0,0.000000,...,0.697118,0.763222,0.857456,0.877229,0.724479,0.792340,0.735767,0.801658,0.766960,0.757499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6290,0.999487,0.999487,9.999999e-01,0.005343,0.013772,0.0,0.0,0.05,0.0,0.155844,...,0.394902,0.461457,0.452632,0.451434,0.402464,0.540125,0.457405,0.437099,0.459960,0.484178
6291,0.999615,0.999615,9.999999e-01,0.005342,0.013772,0.0,0.0,0.05,0.0,0.155844,...,0.394902,0.461457,0.452632,0.451434,0.402464,0.540125,0.457405,0.437099,0.459960,0.484178
6292,0.999744,0.999744,1.000000e+00,0.005343,0.013772,0.0,0.0,0.05,0.0,0.155844,...,0.394902,0.461457,0.452632,0.451434,0.402464,0.540125,0.457405,0.437099,0.459960,0.484178
6293,0.999872,0.999872,1.000000e+00,0.005342,0.013772,0.0,1.0,0.05,0.0,0.155844,...,0.394902,0.461457,0.452632,0.451434,0.402464,0.540125,0.457405,0.437099,0.459960,0.484178


In [26]:
#There is no reason to split this dataframe into a train and test set, since this is an unsupervised learning task. We want our clustering algorithms to use as much data as possible. So we are done with preprocessing, all's that's left is to save the df

normalized_df.to_csv('./data/PreprocessedCollegeScorecard.csv')