# Part 1: Cluster Analysis

In [None]:
import os
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
import shutil
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing, cluster

%matplotlib inline
# Default figure geometry for the notebook: 12x8 inches rendered at 100 dpi.
plt.rcParams.update({'figure.figsize': [12, 8], 'figure.dpi': 100})

def encode_text_index(data, name):
    """Replace the values of column ``name`` with integer codes, in place.

    The column is run through ``preprocessing.LabelEncoder``; every distinct
    value is mapped to a zero-based integer code (0, 1, 2, ...).

    Args:
        data: DataFrame that owns the column; it is modified in place.
        name: Label of the column to encode.

    Returns:
        The encoder's ``classes_`` array, i.e. the original values; the value
        at position ``i`` is the one that was replaced by code ``i``.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(data[name])
    data[name] = codes
    return encoder.classes_

## KMeans Implementation

In [None]:
# Load the movie data. The first CSV column has no header, so pandas labels it
# 'Unnamed: 0'; rename it to 'id' as part of the load instead of mutating afterwards.
imdb_dataset = pd.read_csv("./imdb_dataset.csv").rename(columns={'Unnamed: 0': 'id'})
print("All columns: ", imdb_dataset.columns)
imdb_dataset.head(10)

### Vertical Partitioning
Select features for K-means clustering

In [None]:
# Vertical partition: keep only the columns that take part in (or annotate) the K-means analysis.
db2 = imdb_dataset.loc[:, [
    'title',
    'genre',
    'mpaa_rating',
    'imdb_rating',
    'critics_score',
    'audience_rating',
    'audience_score',
]]
db2

### Preprocess categorical columns and normalize numerical columns
Note: we drop 'title' because it is not informative for K-means clustering, and 'genre' because we want to use it later to analyze our clusters.

In [None]:
# One-hot encode the remaining categorical columns (as 0/1 integers) ...
features_encoded = pd.get_dummies(db2.drop(columns=['title', 'genre']), dtype=int)
# ... then z-score every column: subtract the column mean and divide by the column std.
column_means = features_encoded.mean()
column_stds = features_encoded.std()
db2_preprocessed = features_encoded.sub(column_means).div(column_stds)
db2_preprocessed


### Investigate the optimal number of clusters for K-means

In [None]:
# List the distinct genres once and report how many there are.
genres = db2['genre'].unique()
print("All genres: ", genres)
print("Number of different movie genres in the dataset: ", len(genres))

### Plot SSE vs # of clusters

In [None]:
# Elbow sweep: fit KMeans for k = 1..39 and record the within-cluster SSE (inertia_).
# A fixed random_state makes the curve, and therefore the k chosen from it,
# reproducible across runs; it matches the seed used for the final model.
numClusters = range(1, 40)
SSE = []
for k in numClusters:
    k_means = cluster.KMeans(n_clusters=k, n_init=10, random_state=1)
    k_means.fit(db2_preprocessed)
    SSE.append(k_means.inertia_)

plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.title('KMeans SSE vs number of clusters')
plt.plot(numClusters, SSE, marker='o', color='b');

Choosing n_clusters=9 or n_clusters=16 looks like a good fit on the elbow line. We pick n_clusters=16.

In [None]:
# Number of clusters for the final KMeans model, picked from the SSE elbow plot.
n_clusters=16

### KMeans Cluster
Split data and keep a small portion (10%) for analyzing predictions.

In [None]:
# Positional split: the first 90% of the rows train KMeans, the last 10% are held out.
# The rows are taken in file order (no shuffling).
train_fraction = 0.9
split_ind = int(len(db2_preprocessed) * train_fraction)
data_train = db2_preprocessed.iloc[:split_ind]
data_test = db2_preprocessed.iloc[split_ind:]
print(f"Train samples: {len(data_train)}, Test samples: {len(data_test)}")

### KMeans clustering algorithm

In [None]:
# Fit the final model on the training rows; the seed is fixed so cluster ids are stable.
k_means = cluster.KMeans(
    n_clusters=n_clusters,
    max_iter=100,
    n_init=10,
    random_state=1,
).fit(data_train)
labels = k_means.labels_
print("Unique cluster ids: ", np.unique(labels))

# One row per training movie, indexed by title, holding its assigned cluster id.
train_titles = db2['title'].iloc[:split_ind]
clusters_train_df = pd.DataFrame({'Cluster ID': labels}, index=train_titles)
clusters_train_df


Append 'genre' column to analyze our clusters on training data

In [None]:
# Attach each training movie's genre next to its cluster id. Raw values are used
# (not a Series) because clusters_train_df is indexed by title, not by row number.
train_genres = db2['genre'].iloc[:split_ind].to_numpy()
clusters_train_df = clusters_train_df.assign(genre=train_genres)
clusters_train_df.head(10)


Analyze clusters for genre composition.
Ideally clusters should show grouping of similar genres. <br>
Our clusters have good genre composition as seen below.

In [None]:
# Count movies per (cluster id, genre) once, then print the breakdown for clusters 1 and 2.
genre_counts = clusters_train_df.groupby(['Cluster ID', 'genre']).size()
for cluster_id in (1, 2):
    print(f"Genre composition for cluster {cluster_id}")
    print(genre_counts[cluster_id])
    if cluster_id == 1:
        print()


### Try to visualize our clusters in 2 dimensions
We project the training data to 2D with PCA and then color each sample (movie) with the cluster id color.

In [None]:
from sklearn.decomposition import PCA, KernelPCA
import seaborn as sns

plt.rcParams['figure.figsize'] = [16, 8]
fig, axes = plt.subplots(nrows=1, ncols=2)
# Set the figure-level title explicitly instead of relying on pandas turning
# `title=` into a suptitle when `subplots=True` is passed.
fig.suptitle('PCA projection of training data colored by cluster ID (left) and by genre (right)')

# Project the training features onto two components (KernelPCA with a linear kernel).
data_train_2D = pd.DataFrame(KernelPCA(n_components=2, kernel='linear').fit_transform(data_train), columns=['PC1', 'PC2'])

# Left panel: one colour per KMeans cluster id.
data_train_2D.plot.scatter(x='PC1', y='PC2', c=clusters_train_df['Cluster ID'], colormap='tab20c', ax=axes[0], title='Colored by cluster ID')

# Right panel: one colour per genre. The palette size follows the number of genres
# actually present; "Set2" only has 8 colours, so asking it for more repeats colours
# and makes different genres indistinguishable. "husl" yields n distinct hues.
color_labels = clusters_train_df['genre'].unique()
rgb_values = sns.color_palette("husl", len(color_labels))
color_map = dict(zip(color_labels, rgb_values))
data_train_2D.plot.scatter(x='PC1', y='PC2', c=clusters_train_df['genre'].map(color_map), ax=axes[1], title='Colored by genre')

# Empty proxy artists give the genre colours a legend without adding points.
for genre_name, genre_rgb in color_map.items():
    axes[1].scatter([], [], color=genre_rgb, label=genre_name)
axes[1].legend(title='genre', fontsize='small');

In [None]:
plt.rcParams['figure.figsize'] = [12, 8]
centroids = k_means.cluster_centers_
centroids_df = pd.DataFrame(centroids, columns=data_train.columns)

# The 2D embedding is an RBF-kernel KernelPCA fitted on the centroids alone, so the
# title names it as such (it is not a plain PCA), and its coordinates are not
# comparable with the linear projection of the training data.
centroids_2D = pd.DataFrame(
    KernelPCA(n_components=2, kernel='rbf').fit_transform(centroids_df),
    columns=['PC1', 'PC2'],
)
centroids_2D.plot.scatter(x='PC1', y='PC2', title="2D Visualization (RBF KernelPCA projection) of KMeans centroids");

### Apply trained KMeans algorithm to the held out data.

In [None]:
# Compute cluster labels for unseen movies using trained KMeans
labels = k_means.predict(data_test)
labels = labels.reshape(-1,1)

# inertia_ is the SSE of the *training* fit and does not depend on data_test.
print("Model inertia: ", k_means.inertia_)
# KMeans.score returns the negated K-means objective on the given rows, so its
# negation is the SSE of the held-out movies with respect to the trained centroids.
print("SSE on held-out data: ", -k_means.score(data_test))

# Create a dataframe that has new movies and their cluster assignment
newmovies = db2[split_ind:].copy()
newmovies['Cluster ID'] = labels
print("Cluster allocation for new, unused in training movies")
newmovies

## Hierarchical Analysis on the IMDB dataset

In [None]:
# Display the IMDB frame as loaded, before its text columns are label-encoded.
imdb_dataset

### Single Link

In [None]:
# Replace the text values of each listed column with integer codes, in place.
columns_to_encode = [
    'title_type',
    'mpaa_rating',
    'critics_rating',
    'audience_rating',
    'best_pic_nom',
    'best_pic_win',
    'best_actor_win',
    'best_actress_win',
    'best_dir_win',
    'top200_box',
]
for column_name in columns_to_encode:
    encode_text_index(imdb_dataset, column_name)
imdb_dataset

Limit the dataset to a random sample of movies so that the clustering plot (dendrogram) is more readable.

In [None]:
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
%matplotlib inline


Y = imdb_dataset['genre']
# Keep only the encoded categorical columns and the numeric score columns as features.
X = imdb_dataset.drop(['id', 'title','genre', 'runtime', 'studio', 'thtr_rel_year', 'thtr_rel_month', 'thtr_rel_day', 'dvd_rel_year', 'dvd_rel_month', 'dvd_rel_day',
               'director', 'actor1', 'actor2', 'actor3', 'actor4', 'actor5', 'imdb_url', 'rt_url'],axis=1)

# Minimizing the rows by choosing 40 random movies
X = X.sample(n=40, random_state=0)
# Look the titles up through the sampled index so each label is guaranteed to belong
# to its feature row, rather than relying on two separate samples drawing the same rows.
names = imdb_dataset.loc[X.index, 'title']

# NOTE(review): the features are used unscaled, so the distances are presumably
# dominated by whichever column has the largest numeric range -- confirm this is intended.
Z = hierarchy.linkage(X.values, 'single')
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right')
plt.title('Single-link hierarchical clustering of 40 sampled movies');

### Complete Link

In [None]:
# Same sampled movies, merged with complete linkage.
Z = hierarchy.linkage(X.to_numpy(), method='complete')
dn = hierarchy.dendrogram(Z, orientation='right', labels=names.tolist())

### Group Average

In [None]:
# Same sampled movies, merged with average (group-average) linkage.
Z = hierarchy.linkage(X.to_numpy(), method='average')
dn = hierarchy.dendrogram(Z, orientation='right', labels=names.tolist())

# Part 2: Text Mining

### Dataset for text mining:

In [None]:
# Toy corpus for the vectorizer cells: a list of ten strings, one document per string.
text_dataset = [ 'Now for manners use has company believe parlors.',
'Least nor party who wrote while did. Excuse formed as is agreed admire so on result parish.',
'Put use set uncommonly announcing and travelling. Allowance sweetness direction to as necessary.',
'Principle oh explained excellent do my suspected conveying in.',
'Excellent you did therefore perfectly supposing described. ',
'Its had resolving otherwise she contented therefore.',
'Afford relied warmth out sir hearts sister use garden.',
'Men day warmth formed admire former simple.',
'Humanity declared vicinity continue supplied no an. He hastened am no property exercise of. ' ,
'Dissimilar comparison no terminated devonshire no literature on. Say most yet head room such just easy. ']


### Count Vector Implementation

In [None]:
import sklearn.feature_extraction.text as sk_text

# Bag-of-words counts. min_df=2 ignores terms whose document frequency is below 2.
vectorizer = sk_text.CountVectorizer(min_df=2)
matrix = vectorizer.fit_transform(text_dataset)

# Show, in order: the sparse matrix type, its dense array form, and the kept vocabulary.
for shown in (type(matrix), matrix.toarray(), vectorizer.get_feature_names_out()):
    print(shown)

### Tfidf Vector Implementation

In [None]:
# TF-IDF weights over the same corpus. min_df=2 ignores terms whose document frequency
# is below 2. (max_features, not used here, would cap the vocabulary to the most
# frequent terms across the corpus.)
vectorizer = sk_text.TfidfVectorizer(min_df=2)
matrix = vectorizer.fit_transform(text_dataset)

# Show, in order: the sparse matrix type, its dense array form, and the kept vocabulary.
for shown in (type(matrix), matrix.toarray(), vectorizer.get_feature_names_out()):
    print(shown)

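2.4) TF-IDF (term frequency-inverse document frequency) is a weight that measures how important a word is to one document within a set of documents: it grows with how often the word appears in that document (term frequency) and shrinks with the number of documents that contain the word (inverse document frequency), so words that are common everywhere receive a low weight. It is generally used in text analysis algorithms and for document searching. For example, search engines such as Google have used TF-IDF-style term weighting as one signal when matching documents to a query.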

# Part 3:  Artificial Neural Network (ANN)

### ANN Implementation

In [None]:
import pandas as pd
def change_to_binary_values(df, col_name):
    """Binarize column ``col_name`` of ``df`` in place around its median.

    Values strictly greater than the column median become 1; all other values
    become 0. Nothing is returned.
    """
    column = df[col_name]
    threshold = column.median()
    df[col_name] = column.gt(threshold).astype('int')
    
def normalize_numeric_minmax(df, name):
    """Min-max scale column ``name`` of ``df`` to [0, 1] in place, stored as float32.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        name: Label of the numeric column to rescale.

    A constant column has max == min; it is mapped to 0.0 instead of being
    divided by zero, which would turn every value into NaN.
    """
    column = df[name]
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Constant column: every value equals the minimum, so the scaled value is 0.
        df[name] = (column - low).astype(np.float32)
        return
    df[name] = ((column - low) / span).astype(np.float32)
        
def encode_text_dummy(df, name):
    """Return a copy of ``df`` with column ``name`` replaced by one-hot columns.

    The indicator columns are named ``<name>_<value>`` and are appended after
    the remaining original columns. The input frame is left unchanged.
    """
    indicator_columns = pd.get_dummies(df[name], prefix=name)
    remaining_columns = df.drop(columns=[name])
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [None]:
import pandas as pd
# Load the admission data used by the neural-network section.
admission_csv_path = "./Admission_Predict_Ver1.1_small_data_set_for_Linear_Regression-1.csv"
admission_dataset = pd.read_csv(admission_csv_path)
admission_dataset

In [None]:
# Binarize each listed column in place: 1 where the value is above that column's median, else 0.
# 'LOR ' is spelled with a trailing space on purpose; keep it exactly as written.
for score_column in ('GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA'):
    change_to_binary_values(admission_dataset, score_column)
admission_dataset

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# One-hot encode 'University Rating'. encode_text_dummy removes the source column,
# so the guard lets this cell be re-run without raising KeyError on the missing column.
if 'University Rating' in admission_dataset.columns:
    admission_dataset = encode_text_dummy(admission_dataset, 'University Rating')

In [None]:
# Target is the admission chance; every other column is used as a feature.
# The column label ends with a space; keep it exactly as written.
# NOTE(review): any row-id style column present in the CSV would also end up in X -- confirm.
target_column = 'Chance of Admit '
y = admission_dataset[target_column]
X = admission_dataset.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the standardization from the training rows only, then apply the same
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Two hidden layers (100 and 50 units), at most 500 training iterations, fixed seed.
mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train_scaled, y_train)

In [None]:
# Predict the target for the held-out rows from their standardized features.
y_pred = mlp.predict(X_test_scaled)

In [None]:
# Mean squared error between the true and predicted targets on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")