# Part 1: Cluster Analysis

Install the required packages (Python/Anaconda): <br />
`pip install tensorflow` <br />
`pip install seaborn`

This code block is the setup for the notebook: it imports the necessary libraries and defines the helper functions used for the cluster analysis.

In [None]:
import os
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
import shutil
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing, cluster
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

%matplotlib inline
# Default figure size and resolution applied to the plots created after this cell.
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

def encode_text_index(data, name):
    """Replace the values of column ``name`` in ``data`` with integer codes.

    The column is overwritten in place with the output of a scikit-learn
    ``LabelEncoder`` fitted on that same column.

    Args:
        data: DataFrame that holds the column; it is modified in place.
        name: Name of the column to encode.

    Returns:
        The encoder's ``classes_`` array, i.e. the original labels, so that an
        integer code ``i`` can be mapped back with ``classes[i]``.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(data[name])
    data[name] = codes
    return encoder.classes_

def encode_text_dummy(df, name):
    """Return a copy of ``df`` with column ``name`` replaced by dummy columns.

    One indicator column is created per distinct value of ``df[name]``; each
    new column is named ``<name>_<value>``. The input frame is left untouched.

    Args:
        df: Source DataFrame.
        name: Name of the categorical column to expand.

    Returns:
        A new DataFrame holding the remaining original columns followed by the
        indicator columns.
    """
    indicator_cols = pd.get_dummies(df[name], prefix=name)
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=name)

## KMeans Implementation

Below we load the IMDB dataset, give the unnamed first (row-number) column a proper name, and display the first rows.

In [None]:
# Load the IMDB data. The first column of the CSV has no header, so pandas
# calls it 'Unnamed: 0'; rename it to 'id' right after reading.
imdb_dataset = pd.read_csv("./imdb_dataset.csv").rename(columns={'Unnamed: 0': 'id'})
print("All columns: ", imdb_dataset.columns)
imdb_dataset.head(10)

### Vertical Partitioning
Select features for K-means clustering. The dataset has many columns, so we first partition it vertically, isolating only the columns we will use in the clustering analysis.

In [None]:
# Vertical partition: keep the identifying columns ('title', 'genre') plus the
# rating/score columns that are used as clustering features.
db = imdb_dataset[['title', 'genre', 'mpaa_rating', 'imdb_rating', 'critics_score', 'audience_rating','audience_score']]
db

### Preprocess categorical columns and normalize numerical columns
Note: we drop 'title' because it is not informative for K-means clustering, and 'genre' because we want to use it later to analyze our clusters.

For preprocessing, we first one-hot encode the categorical columns ('mpaa_rating' and 'audience_rating') into 0/1 indicator columns, and then standardize every column with z-score normalization (subtract the column mean and divide by the column standard deviation).

In [None]:
# One-hot encode the two categorical rating columns, one after the other.
db2 = db
for categorical_col in ('mpaa_rating', 'audience_rating'):
    db2 = encode_text_dummy(db2, categorical_col)

# 'title' and 'genre' are kept in db2 for later analysis but are not features.
numeric_features = db2.drop(columns=['title', 'genre'])

# Z-score each column: subtract its mean and divide by its standard deviation.
db2_preprocessed = (numeric_features - numeric_features.mean()) / numeric_features.std()
db2_preprocessed


### Investigate the optimal number of clusters for K-means

In [None]:
# List the distinct genres and count them.
genre_values = db2['genre'].unique()
print("All genres: ", genre_values)
genre_count = len(imdb_dataset['genre'].unique())
print("Number of different movie genres in the dataset: ", genre_count)

### Plot SSE vs # of clusters

With the data preprocessed, we can now run k-means for every number of clusters from 1 to 39 and record the Sum of Squared Errors (SSE) of each fit, i.e. the sum of squared distances from every data point to the centroid of the cluster it is assigned to.

We then plot SSE against the number of clusters with matplotlib and use the elbow method to choose the number of clusters.

In [None]:
# Elbow method: fit K-means for k = 1..39 and record the SSE (inertia_) of
# each fit, then plot SSE against k.
numClusters = range(1, 40)
SSE = []
for k in numClusters:
    # Fixed seed (the same one the final model uses) so that the curve, and the
    # number of clusters read off it, is reproducible from run to run.
    k_means = cluster.KMeans(n_clusters=k, n_init=10, random_state=1)
    k_means.fit(db2_preprocessed)
    SSE.append(k_means.inertia_)

plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.plot(numClusters, SSE, marker='o', color='b')

Choosing n_clusters=9 looks like a good fit on the elbow line. 

In [None]:
# Number of clusters read off the elbow plot; used by the final K-means model.
n_clusters=9

### KMeans Cluster
Split data and keep a small portion (10%) for analyzing predictions.

In [None]:
# Positional split: the first 90% of the rows are used for fitting, the last
# 10% are held out for analyzing predictions.
train_fraction = 0.9
split_ind = int(len(db2_preprocessed) * train_fraction)
data_train, data_test = db2_preprocessed.iloc[:split_ind], db2_preprocessed.iloc[split_ind:]
print(f"Train samples: {len(data_train)}, Analysis samples: {len(data_test)}")

### KMeans clustering algorithm

Using the k-means clustering model, we fit the training data above in order to find unique clusters from the dataset. <br />Then, we can cluster each movie title to the closest cluster. 

In [None]:
# Fit the final K-means model on the training rows.
k_means = cluster.KMeans(n_clusters=n_clusters, max_iter=100, n_init=10, random_state=1).fit(data_train)
labels = k_means.labels_
print("Unique cluster ids: ", np.unique(labels))

# One row per training movie, indexed by title, holding its cluster id.
train_titles = db2['title'].iloc[:split_ind]
clusters_train_df = pd.DataFrame({'Cluster ID': labels}, index=train_titles)
clusters_train_df


Append 'genre' column to analyze our clusters on input data.

In [None]:
# Append the 'genre' of each training movie so clusters can be compared with genres.
# .values strips the index so the genres are assigned by position
# (clusters_train_df is indexed by title, db2 is not).
clusters_train_df['genre'] = db2.genre[:split_ind].values
clusters_train_df.head(10)


Analyze the clusters for genre composition.
Ideally, each cluster should group movies of similar genres. <br />
Our clusters have good genre composition, as seen below.

In [None]:
# Number of movies per (cluster id, genre) pair, computed once and then
# looked up for the two clusters of interest.
genre_counts = clusters_train_df.groupby(['Cluster ID', 'genre']).size()

print("Genre composition for cluster 1")
print(genre_counts[1])

print("Genre composition for cluster 2")
print(genre_counts[2])

### Try to visualize our clusters in 2 dimensions
We project the training data to 2D with PCA and then color each sample (movie) with the cluster id color.

In [None]:
from sklearn.decomposition import PCA, KernelPCA
import seaborn as sns

# Two side-by-side scatter plots of the training data projected to 2D:
# left coloured by cluster id, right coloured by genre.
plt.rcParams['figure.figsize'] = [16, 8]
fig, axes = plt.subplots(nrows=1,ncols=2)

data_train_2D = pd.DataFrame(KernelPCA(n_components=2, kernel='linear').fit_transform(data_train), columns=['PC1', 'PC2'])
# Pass the cluster ids as a plain array: clusters_train_df is indexed by title
# while data_train_2D has a fresh integer index, so colours must match by position.
data_train_2D.plot.scatter(x='PC1', y='PC2', c=clusters_train_df['Cluster ID'].to_numpy(), colormap='tab20c', ax = axes[0], subplots=True)

color_labels = clusters_train_df['genre'].unique()
# Size the palette from the data instead of hard-coding 11: with more genres
# than colours, zip() would silently leave some genres without a colour.
rgb_values = sns.color_palette("Set2", len(color_labels))
color_map = dict(zip(color_labels, rgb_values))
genre_colors = clusters_train_df['genre'].map(color_map).tolist()
data_train_2D.plot.scatter(x='PC1', y='PC2', c=genre_colors, \
                           title='PCA projection of training data colored by cluster ID (left) and by genre (right)', \
                           ax = axes[1], subplots=True)

In [None]:
# Project the K-means centroids to 2D and plot them.
# NOTE(review): the projection is fitted on the centroids alone and uses an
# 'rbf' kernel, while the plot title calls it a PCA projection and the
# training-data plot uses a 'linear' kernel -- confirm this is intended.
plt.rcParams['figure.figsize'] = [12, 8]
centroids = k_means.cluster_centers_
centroids_df = pd.DataFrame(centroids,columns=data_train.columns)
pd.DataFrame(KernelPCA(n_components=2, kernel='rbf').fit_transform(centroids_df), columns=['PC1', 'PC2']) \
    .plot.scatter(x='PC1', y='PC2', title="2D Visualization (PCA projection) of KMeans centroids")

### Apply trained KMeans algorithm to the held out data.

In [None]:
# Compute cluster labels for unseen movies using trained KMeans
labels = k_means.predict(data_test)

# inertia_ is the SSE of the *training* data. For the held-out rows use
# score(), which returns the negative K-means objective (SSE) of the data
# passed to it.
print("Model inertia: ", k_means.inertia_)
print("SSE on held-out data: ", -k_means.score(data_test))

# Create a dataframe that has new movies and their cluster assignment
newmovies = db[split_ind:].copy()
# labels is 1-D with one entry per held-out row, so it can be assigned directly.
newmovies['Cluster ID'] = labels
print("Cluster allocation for new, unused in training movies")
newmovies

## Hierarchical Analysis on the IMDB dataset

In [None]:
# Display the full IMDB dataset again before the hierarchical analysis.
imdb_dataset

## Single Link

Here we apply label encoding to the dataset in order to replace the text values of the categorical columns with integer codes.

In [None]:
# Text columns to replace (in place) with integer codes before clustering.
categorical_columns = [
    'title_type',
    'mpaa_rating',
    'critics_rating',
    'audience_rating',
    'best_pic_nom',
    'best_pic_win',
    'best_actor_win',
    'best_actress_win',
    'best_dir_win',
    'top200_box',
]
for column in categorical_columns:
    encode_text_index(imdb_dataset, column)
imdb_dataset

Limiting the dataset so clustering plot is more readable & displaying the dendrogram for single link hierarchical clustering.

In [None]:
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
%matplotlib inline


Y = imdb_dataset['genre']
# Keep only the score/rating/award columns: identifiers, free text, dates and
# the genre itself are dropped.
X = imdb_dataset.drop(['id', 'title','genre', 'runtime', 'studio', 'thtr_rel_year', 'thtr_rel_month', 'thtr_rel_day', 'dvd_rel_year', 'dvd_rel_month', 'dvd_rel_day',
               'director', 'actor1', 'actor2', 'actor3', 'actor4', 'actor5', 'imdb_url', 'rt_url'],axis=1)

# Minimizing the rows by choosing 40 random movies
X = X.sample(n=40, random_state=0)
# Look the titles up through the sampled index, so the i-th label always
# belongs to the i-th row of X (instead of relying on two independent
# sample() calls happening to pick the same rows).
names = imdb_dataset.loc[X.index, 'title']

# Single-link agglomerative clustering of the 40 sampled movies.
Z = hierarchy.linkage(X.values, 'single')
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right')

### Complete Link

This time, we do hierarchical clustering with complete linkage, which uses the maximum distance between the points of two clusters, and then display the dendrogram.

In [None]:
# Same 40 sampled movies (X, names), clustered with complete linkage.
Z = hierarchy.linkage(X.values, 'complete')
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right')

### Group Average

Lastly, we do hierarchical clustering with the group-average method, which uses the average distance between the points of two clusters.

In [None]:
# Same 40 sampled movies (X, names), clustered with average linkage.
Z = hierarchy.linkage(X.values, 'average')
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right')

# Part 2: Text Mining

### Dataset for text mining:

In [None]:
# Corpus for the text-mining part, taken from the assignment 4 directions.
# Each list element is treated as one document by the vectorizers below.
text_dataset = [ 'Now for manners use has company believe parlors.',
'Least nor party who wrote while did. Excuse formed as is agreed admire so on result parish.',
'Put use set uncommonly announcing and travelling. Allowance sweetness direction to as necessary.',
'Principle oh explained excellent do my suspected conveying in.',
'Excellent you did therefore perfectly supposing described. ',
'Its had resolving otherwise she contented therefore.',
'Afford relied warmth out sir hearts sister use garden.',
'Men day warmth formed admire former simple.',
'Humanity declared vicinity continue supplied no an. He hastened am no property exercise of. ' ,
'Dissimilar comparison no terminated devonshire no literature on. Say most yet head room such just easy. ']


### Count Vector Implementation

Vectorizer picks out unique words and places their count in a vector. We then take vectorizer and format it to a matrix using a transform function. When we print out the matrix, it will display each unique word in a column and how many times each document (row) has used them.

In [None]:
import sklearn.feature_extraction.text as sk_text

# min_df=2 ignores terms whose document frequency is below 2, which keeps the
# matrix from being too cluttered.
vectorizer = sk_text.CountVectorizer(min_df=2)
# Alternative: drop English stop words instead.
#vectorizer = sk_text.CountVectorizer(stop_words = 'english')

# Learn the vocabulary from the corpus and build the document-term count
# matrix (one row per document, one column per kept term).

matrix = vectorizer.fit_transform(text_dataset)

print(type(matrix))          # Compressed Sparse Row matrix
print(matrix.toarray())        # convert it to a dense numpy array for display

# Column labels of the matrix: the kept terms.
print(vectorizer.get_feature_names_out())

### Tfidf Vector Implementation

TFIDF calculates how relevant a word is to a text. Vectorizer takes the unique words and evaluates them based on the number of times a word appears compared to the frequency in the dataset. We then format the vector into a matrix and print out the result.

In [None]:
# min_df=2 ignores terms whose document frequency is below 2, which keeps the
# matrix from being too cluttered.
# Other options that could be tried: stop_words='english', or
# max_features=1000 to build a vocabulary that only considers the top
# max_features terms ordered by term frequency across the corpus.
vectorizer = sk_text.TfidfVectorizer(
                             #stop_words='english',
                             #max_features = 1000,
                             min_df=2)

matrix = vectorizer.fit_transform(text_dataset)

# Set the print precision BEFORE printing the matrix; setting it afterwards
# has no effect on the output above it.
np.set_printoptions(precision=4)

print(type(matrix))          # Compressed Sparse Row matrix
print(matrix.toarray())        # convert it to a dense numpy array for display
print(vectorizer.get_feature_names_out())

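2.4) Tfidf (term frequency-inverse document frequency) is a measure of how important a word is to a document within a collection of documents: the weight grows with how often the word appears in the document and shrinks with how many documents in the collection contain it. It is generally used in text analysis algorithms and for document searching. For example, search engines can use Tfidf weights when preprocessing and ranking text.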

# Part 3:  Artificial Neural Network (ANN)

## ANN Implementation
In this section, we will be performing ANN techniques on the Admission dataset.

### Useful functions

In [None]:
import pandas as pd
def change_to_binary_values(df, col_name):
    """Binarize column ``col_name`` of ``df`` in place around its median.

    Values strictly greater than the column median become 1, all other values
    become 0.

    Args:
        df: DataFrame that holds the column; it is modified in place.
        col_name: Name of the numeric column to binarize.
    """
    threshold = df[col_name].median()
    above_median = df[col_name] > threshold
    df[col_name] = above_median.astype('int')
    
def normalize_numeric_minmax(df, name):
    """Min-max normalize column ``name`` of ``df`` in place to the range [0, 1].

    Each value ``x`` is replaced by ``(x - min) / (max - min)`` and the column
    is stored as ``float32``.

    A constant column (``max == min``) is mapped to all zeros instead of the
    NaN values that the 0/0 division would otherwise produce.

    Args:
        df: DataFrame that holds the column; it is modified in place.
        name: Name of the numeric column to normalize.
    """
    col_min = df[name].min()
    col_range = df[name].max() - col_min
    shifted = df[name] - col_min
    if col_range != 0:
        # Skip the division for a constant column: shifted is already all zeros.
        shifted = shifted / col_range
    df[name] = shifted.astype(np.float32)
        
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
# def encode_text_dummy(df, name):
#     dummies = pd.get_dummies(df[name])
#     for x in dummies.columns:
#         dummy_name = "{}-{}".format(name, x)
#         df[dummy_name] = dummies[x]
#     df.drop(name, axis=1, inplace=True)

def encode_text_index(df, name):
    """Replace the values of column ``name`` in ``df`` with integer codes.

    The column is overwritten in place with the codes produced by a
    scikit-learn ``LabelEncoder`` fitted on that same column.

    Args:
        df: DataFrame that holds the column; it is modified in place.
        name: Name of the column to encode.

    Returns:
        The encoder's ``classes_`` array, i.e. the original labels, so that an
        integer code ``i`` can be mapped back with ``classes[i]``.
    """
    label_encoder = preprocessing.LabelEncoder().fit(df[name])
    df[name] = label_encoder.transform(df[name])
    return label_encoder.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
import collections
def to_xy(df, target):
    """Split ``df`` into feature and target arrays, both ``float32``.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column; every other column is a feature.

    Returns:
        A ``(features, targets)`` tuple. When the target column has dtype
        int64 or int32 it is treated as a class label and ``targets`` is its
        one-hot (dummy) encoding; otherwise ``targets`` is the target column's
        values unchanged.
    """
    feature_cols = [col for col in df.columns if col != target]

    # Determine the dtype of the target column.
    dtype = df[target].dtypes
    if isinstance(dtype, collections.abc.Sequence):
        dtype = dtype[0]

    features = df[feature_cols].values.astype(np.float32)

    if dtype in (np.int64, np.int32):
        # Classification: one column per class.
        one_hot = pd.get_dummies(df[target])
        return features, one_hot.values.astype(np.float32)

    # Regression: targets are used as they are.
    return features, df[target].values.astype(np.float32)

Importing the Admission dataset and displaying it

In [None]:
import pandas as pd
# Load the admission data and drop the 'Serial No.' column so it is not used as a feature.
admission_dataset = pd.read_csv("./Admission_Predict_Ver1.1_small_data_set_for_Linear_Regression-1.csv")
admission_dataset = admission_dataset.drop(columns=['Serial No.'])
admission_dataset

### Preprocessing for ANN: Normalize numerical predictors and binarize the targets for classification.
For a numerical variable X that takes values in the range [a, b] where a < b, <br />we normalize the measurements by subtracting a and dividing by b − a.

In [None]:
# Min-max normalize the numeric predictors (note the trailing space in 'LOR ').
feature_columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA']
for column in feature_columns:
    normalize_numeric_minmax(admission_dataset, column)

# Turn the target into a 0/1 label: 1 when above the median chance of admission.
change_to_binary_values(admission_dataset, 'Chance of Admit ')
admission_dataset

Now all input features should be in [0, 1] range.

Below, we split the admission dataset into training and testing sets, which will be used to fit the classifier and to calculate our Mean Squared Error.

In [None]:
# Features: every column except the target. Target: the binarized 'Chance of Admit '.
X = admission_dataset.drop('Chance of Admit ', axis=1)
y = admission_dataset['Chance of Admit ']

Our testing size is sitting at 20% of the dataset

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Classification with sklearn MLPClassifier with 2 hidden layers
Once we have standardized our training and test datasets, we apply MLP (Multi-layer Perceptron) classification, which comes from sklearn's neural network module.

In [None]:
# MLP with two hidden layers of 10 and 5 units, trained on the scaled training set.
mlp = MLPClassifier(hidden_layer_sizes=(10, 5), max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)

In [None]:
# predict() returns hard class labels, not scores, so thresholding its output
# only turned the labels into booleans. Take the predicted probability of the
# positive class (second column, class 1) as the score and threshold that,
# keeping the predictions as 0/1 integers like y_test.
y_pred_score = mlp.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_score > 0.5).astype(int)

#### MLP Classification report

In [None]:
# Accuracy, per-class precision/recall/F1 and the confusion matrix on the test set.
print('Accuracy on test data is %.2f' % (accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
# Label order follows the class order 0, 1.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Rejected', 'Admitted'])
disp.plot()
plt.show()

Using the predictions of the MLP classifier, we then compute the mean squared error (MSE) between the true test labels and the predicted labels. The low MSE shows that, for this dataset, the predicted labels are close to the true ones.

In [None]:
# Both arguments are two-valued class labels, so every squared difference is 0
# or 1 and the MSE equals the fraction of misclassified test samples.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

### Tensorflow Keras ANN Prediction and Classification

Below, we import TensorFlow (which provides Keras) to build the ANN. Make sure that keras and tensorflow are installed for the code to work.

In [None]:
import tensorflow as tf

# Reload the raw admission data so the Keras section starts from
# unpreprocessed values, and drop the 'Serial No.' column again.
admission_dataset = pd.read_csv("./Admission_Predict_Ver1.1_small_data_set_for_Linear_Regression-1.csv")
admission_dataset = admission_dataset.drop(columns="Serial No.")
admission_dataset

Converting Chance of Admit to binary values (1 if above the median, 0 otherwise), since the network below is trained as a two-class classifier.

In [None]:
# Replace the target with 1 where it is above the column median and 0 otherwise.
change_to_binary_values(admission_dataset, 'Chance of Admit ')
admission_dataset

Converting the Chance of Admit column into 'yes' and 'no' values and storing it into a variable called classes

In [None]:
# Map the 0/1 target to 'no'/'yes' and store the result back in the frame.
# (The previous replace(..., inplace=False) call discarded its result, so the
# column stayed 0/1 and `classes` never contained the text labels.)
admission_dataset['Chance of Admit '] = admission_dataset['Chance of Admit '].map({1: 'yes', 0: 'no'})
# Encode the text labels back to integer indexes; `classes` keeps the label
# names so that classes[i] is the text label of index i.
classes = encode_text_index(admission_dataset, 'Chance of Admit ')
classes

### Normalize numerical columns and separate features and targets for training.

In [None]:
# Min-max normalize the numeric predictors (note the trailing space in 'LOR ').
feature_columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA']
for column in feature_columns:
    normalize_numeric_minmax(admission_dataset, column)

Create a test data set that will take 40 random rows from the admission set

In [None]:
# Choosing a random sample of 40 rows for our testing; all remaining rows are
# used for training. (Previously the last rows from a hard-coded index 460 were
# taken, which was not random and assumed a fixed dataset size.)
test_size = 40
test_data = admission_dataset.sample(n=test_size, random_state=42)
train_data = admission_dataset.drop(index=test_data.index)
test_data.head(10)

In [None]:
# Convert both frames to float32 arrays; the integer target is one-hot encoded by to_xy.
X, y = to_xy(train_data, 'Chance of Admit ')
testX, testY = to_xy(test_data, 'Chance of Admit ')

In [None]:
# Sanity check: shapes of the training feature and target arrays.
print(X.shape)
print(y.shape)

Create a Neural network with 2 hidden Dense layers with ReLU activations and a final Softmax layer to predict one hot encoded targets.

In [None]:
# Feed-forward network: two ReLU hidden layers (8 and 4 units) with dropout
# after the first one, and a 2-unit softmax output for the one-hot targets.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, input_dim=X.shape[1], activation='relu'),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax'),
])

Define the loss and optimizer and fit model to training data

In [None]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(loss='categorical_crossentropy',optimizer='adam')
# Train for 1000 epochs without per-epoch output (verbose=0).
model.fit(X, y, verbose=0, epochs=1000, batch_size=1000)

Using the model for generating our predicted values

In [None]:
# Model output for every test row; print the first row as an example.
pred = model.predict(testX)
print(pred[0])

In [None]:
# Convert each row (model output / one-hot target) to the index of its largest entry.
pred = np.argmax(pred, axis=1)
true = np.argmax(testY, axis=1)

Outputting our class (which is the Chance of Admit column) and observing our predicted set from our actual set

In [None]:
# Look up the label stored in `classes` for each predicted / true class index.
print("Predicted classes: ", classes[pred])
print("True classes: ", classes[true])

Generating the accuracy from our ANN technique as well as the classification report. As the value is somewhat above average, this tells us that our predicted values are in line with our actual dataset.

In [None]:
# Accuracy and per-class precision/recall/F1 of the Keras model on the test rows.
print('Accuracy on test data is %.2f' % (accuracy_score(true, pred)))
print(classification_report(true,pred))