In [81]:
import math
import pandas as pd
import numpy as np
import os

In [82]:
import warnings
# Ignore every FutureWarning for the rest of the session so that such notices
# (e.g. library deprecation messages) do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

## <font color='cyan'>Initializing Variables</font>

In [83]:
# Column layout shared by u.data and every cross-validation split file.
_RATING_COLUMNS = ('user_id', 'movie_id', 'rating', 'timestamp')

# u1.base, u1.test, u2.base, ... u5.test (in that order).
_SPLIT_FILES = [
    "u" + str(fold) + "." + part
    for fold in range(1, 6)
    for part in ("base", "test")
]

# Per-file column names handed to the CSV reader.
heading = {
    "u.data": list(_RATING_COLUMNS),
    "u.genre": ['genre', 'genre_id'],
    "u.item": [
        "movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL",
        # one 0/1 flag column per genre
        "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
        "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
        "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ],
    "u.occupation": ["occupation"],
    "u.user": ["user id", "age", "gender", "occupation", "zip code"],
}
# Each split file gets its own fresh copy of the rating columns.
heading.update({name: list(_RATING_COLUMNS) for name in _SPLIT_FILES})

# Per-file field separator handed to the CSV reader.
delimiter = {
    "u.data": '\t',
    "u.genre": '|',
    "u.item": '|',
    "u.occupation": ' ',
    "u.user": '|',
}
delimiter.update({name: '\t' for name in _SPLIT_FILES})

In [84]:
# Dataset dimensions; ids are 1-based, so matrices are allocated with one extra slot.
USERS = 943
MOVIES = 1682

# Cross-validation layout and the cluster counts to evaluate.
FOLDS = 5
CLUSTERS = [5 , 10 , 20]
CLUSTER_SIZE = len(CLUSTERS)

# Users held out per fold, and the users left over for training.
TEST_SIZE = USERS // FOLDS          # 188
TRAIN_SIZE = USERS - TEST_SIZE      # 755

## <font color='cyan'>Reading Data</font>

In [85]:
def read_data(path , file_name , delim , heading):
    """Read a header-less delimited text file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    file_name : str
        Name of the file inside ``path``.
    delim : str
        Field separator used in the file.
    heading : list of str
        Column names to assign (the file itself carries no header row).

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded as ISO-8859-1.
    """
    # The separator must be passed by keyword: positional use beyond the
    # file path is deprecated in pandas 1.x and rejected by pandas 2.x.
    data = pd.read_csv(
        os.path.join(path, file_name),
        sep=delim,
        names=heading,
        encoding="ISO-8859-1",
    )
    return data

In [86]:
def read_file(file):
    """Load one dataset file from ``Data/ml-100k/``.

    ``file`` is the bare file name (e.g. ``'u.data'``); its separator and
    column names are looked up in the ``delimiter`` and ``heading`` tables.
    """
    separator = delimiter[file]
    columns = heading[file]
    return read_data("Data/ml-100k/", file, separator, columns)

In [87]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error , accuracy_score
def mae_calculation(actual , predicted):
    """Return the mean absolute error between ``actual`` and ``predicted`` ratings."""
    error = mean_absolute_error(y_true=actual, y_pred=predicted)
    return error

## <font color='cyan'>Preprocessing Data</font>

In [88]:
def rating_matrix():
    """Build the dense user x movie rating matrix from ``u.data``.

    Returns an integer array of shape ``(USERS+1, MOVIES+1)`` in which entry
    ``[user_id][movie_id]`` holds that user's rating and every other entry is 0.
    Ids are used directly as indices, which is why one extra row and column
    are allocated.
    """
    ratings = read_file('u.data')

    matrix = np.zeros(shape=(USERS + 1, MOVIES + 1), dtype=int)

    # Walk the three columns in lock-step, one rating record at a time.
    records = zip(ratings['user_id'], ratings['movie_id'], ratings['rating'])
    for user, movie, score in records:
        matrix[user][movie] = score

    return matrix

rating_data = rating_matrix()

In [89]:
def dict_generator():
    """Create the lookup tables used to encode user demographics.

    Returns
    -------
    dict_age : dict
        Maps every age 0..99 to a bucket 0..6
        (<13, 13-20, 21-30, 31-40, 41-50, 51-60, >60).
    dict_gen : dict
        Maps ``'M'`` to 0 and ``'F'`` to 1.
    dict_occ : dict
        Maps each occupation name from ``u.occupation`` to its row number.
    """
    # Inclusive upper bound of each age bucket except the last; an age's
    # bucket is the number of bounds it exceeds.
    upper_bounds = (12, 20, 30, 40, 50, 60)
    dict_age = {
        age: sum(age > bound for bound in upper_bounds)
        for age in range(0, 100)
    }

    dict_gen = {'M': 0, 'F': 1}

    occupations = read_file("u.occupation")
    dict_occ = {
        occupations.iloc[row]['occupation']: row
        for row in occupations.index
    }

    return dict_age , dict_gen , dict_occ

dict_age , dict_gen , dict_occ = dict_generator()

In [90]:
def data_reader():
    """Load ``u.user`` and return its rows in a fixed pseudo-random order.

    The shuffle is seeded (``random_state=0``) and the index is rebuilt as
    0..n-1 afterwards.
    """
    users = read_file('u.user')
    shuffled = users.sample(frac=1 , random_state=0)
    return shuffled.reset_index(drop=True)

data_set = data_reader()

## <font color='cyan'>Splitting Data</font>

In [91]:
def data_split(data , fold , test_size=None):
    """Split ``data`` into the test rows of one fold and the remaining train rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; fold membership is decided purely by row position.
    fold : int
        1-based fold number. Fold ``f`` takes rows
        ``[(f-1)*test_size, f*test_size)`` as its test set.
    test_size : int, optional
        Number of rows per test fold. Defaults to the module-level
        ``TEST_SIZE``.

    Returns
    -------
    (test, train) : tuple of pandas.DataFrame
        ``test`` holds the fold's rows, ``train`` every other row, both in
        their original order and with their original index labels.

    Raises
    ------
    ValueError
        If ``fold`` is smaller than 1.
    """
    if test_size is None:
        test_size = TEST_SIZE
    if fold < 1:
        raise ValueError("fold must be >= 1, got " + str(fold))

    start = (fold - 1) * test_size
    stop = start + test_size

    test = data.iloc[start:stop]

    # Select the complement by position rather than by index label, so rows
    # that happen to share a label with a test row are not dropped as well.
    keep = np.ones(len(data), dtype=bool)
    keep[start:stop] = False
    train = data.iloc[keep]

    return test , train

In [92]:
def data_generator(data_pre , size):
    """One-hot encode the first ``size`` users of ``data_pre``.

    Each user becomes a 30-element 0/1 vector: the age bucket sets one of
    columns 0-6, gender one of columns 7-8, and occupation one column from
    9 onwards (offsets taken from ``dict_age``, ``dict_gen`` and ``dict_occ``).

    Returns
    -------
    (data_arr, uids) : tuple
        The ``(size, 30)`` integer feature array and the list of user ids,
        in row order.
    """
    features = np.zeros(shape=(size, 30), dtype=int)
    uids = []

    for row in range(size):
        record = data_pre.iloc[row]

        age_col = dict_age[record['age']]
        gender_col = dict_gen[record['gender']] + 7
        occupation_col = dict_occ[record['occupation']] + 9

        for col in (age_col, gender_col, occupation_col):
            features[row][col] = 1

        uids.append(record['user id'])

    return features , uids

## <font color='cyan'>Crisp K-Means</font>

In [93]:
from sklearn.cluster import KMeans

def crip_k_means(data , data_uids , cluster_size):
    """Cluster users with K-Means and compute each cluster's mean movie ratings.

    Parameters
    ----------
    data : array-like
        Feature rows, one per user, passed to ``KMeans.fit``.
    data_uids : sequence of int
        User id of each row of ``data``; used to index ``rating_data``.
    cluster_size : int
        Number of clusters to fit.

    Returns
    -------
    (kmeans, cluster_movie_data) : tuple
        The fitted model and a ``(cluster_size+1, MOVIES+1)`` float array.
        Row ``c+1`` belongs to cluster label ``c`` (row 0 stays zero).
        Column ``m`` (for ``m >= 1``) is the mean of the non-zero ratings the
        cluster's users gave movie ``m``, or 3 when none of them rated it.
        Column 0 is never averaged.
    """
    kmeans = KMeans(n_clusters=cluster_size, random_state=0).fit(data)

    labels = np.asarray(kmeans.labels_)
    # Only the ids that line up with a fitted label are ever consulted.
    uids = np.asarray(data_uids, dtype=int)[:len(labels)]

    cluster_movie_data = np.zeros(shape = (cluster_size+1 , MOVIES+1) , dtype=float)

    for label in range(cluster_size):
        # All rating rows of this cluster's members at once, instead of
        # re-scanning every label and every rating in Python per cluster.
        member_ratings = rating_data[uids[labels == label]]

        totals = member_ratings.sum(axis=0, dtype=float)
        counts = (member_ratings != 0).sum(axis=0)

        # Start from the default rating 3 and overwrite wherever at least
        # one member actually rated the movie.
        row = np.full(MOVIES + 1, 3.0)
        np.divide(totals, counts, out=row, where=counts != 0)
        row[0] = totals[0]  # column 0 keeps its raw sum, it is not a movie

        cluster_movie_data[label + 1] = row

    return kmeans , cluster_movie_data

## <font color='cyan'>Fuzzy C-Means</font>

In [94]:
from fcmeans import FCM

def fuzzy_c_means(data , data_uids , cluster_size):
    """Cluster users with Fuzzy C-Means and compute membership-weighted mean ratings.

    Parameters
    ----------
    data : array-like
        Feature rows, one per user, passed to ``FCM.fit``.
    data_uids : sequence of int
        User id of each row of ``data``; used to index ``rating_data``.
    cluster_size : int
        Number of clusters to fit.

    Returns
    -------
    (fcm, cluster_movie_data) : tuple
        The fitted model and a ``(cluster_size+1, MOVIES+1)`` float array.
        Row ``c+1`` belongs to cluster ``c`` (row 0 stays zero). Column ``m``
        (for ``m >= 1``) is the average of the non-zero ratings for movie
        ``m``, each weighted by the rating user's membership in the cluster,
        or 3 when the accumulated weight is zero. Column 0 is never averaged.
    """
    fcm = FCM(n_clusters=cluster_size , random_state=0)
    fcm.fit(data)

    # Membership matrix, indexed [user row][cluster].
    memberships = np.asarray(fcm.u)

    weighted_sum = np.zeros(shape = (cluster_size , MOVIES+1) , dtype=float)
    weight_total = np.zeros(shape = (cluster_size , MOVIES+1) , dtype=float)

    # Iterate over the users actually supplied (not a global constant) and
    # update every cluster and every rated movie of a user in one step.
    for position, uid in enumerate(data_uids):
        user_ratings = rating_data[uid]
        rated = user_ratings != 0
        weights = memberships[position]

        weighted_sum[:, rated] += np.outer(weights, user_ratings[rated])
        weight_total[:, rated] += weights[:, None]

    # Default rating 3 wherever no weight was accumulated for a movie.
    averages = np.full((cluster_size, MOVIES + 1), 3.0)
    np.divide(weighted_sum, weight_total, out=averages, where=weight_total != 0)
    averages[:, 0] = weighted_sum[:, 0]  # column 0 keeps its raw sum, it is not a movie

    cluster_movie_data = np.zeros(shape = (cluster_size+1 , MOVIES+1) , dtype=float)
    cluster_movie_data[1:] = averages

    return fcm , cluster_movie_data

## <font color='cyan'>Prediction</font>

In [95]:
def predictions(data , data_uids , model , cluster_data):
    """Predict ratings for the given users from their assigned cluster's averages.

    Each row of ``data`` is assigned a cluster with ``model.predict``. For
    every non-zero entry of that user's row in ``rating_data`` the true rating
    and the rounded cluster average for the same movie are collected.

    Returns
    -------
    (actual, prediction) : tuple of list
        Parallel lists of true ratings and rounded predicted ratings.
    """
    assigned = model.predict(data)

    actual , prediction = [] , []

    for row, uid in enumerate(data_uids):
        cluster = assigned[row]

        for mov, stars in enumerate(rating_data[uid]):
            if stars != 0:
                actual.append(stars)
                # Cluster label c lives in row c+1 of the averages table.
                prediction.append(round(cluster_data[cluster+1][mov]))

    return actual , prediction

## <font color='cyan'>Calculating Results</font>

In [96]:
def output_results():
    """Run the cross-validation for both clustering methods.

    For every fold and every entry of ``CLUSTERS`` a K-Means and a Fuzzy
    C-Means model are fitted on the training users and scored with the mean
    absolute error on the test users.

    Returns
    -------
    (crisp, fuzzy) : tuple of numpy.ndarray
        Two ``(FOLDS+1, CLUSTER_SIZE)`` arrays; row ``f`` holds the MAE of
        fold ``f`` (row 0 is unused and stays zero).
    """
    score_shape = (FOLDS+1 , CLUSTER_SIZE)
    crisp = np.zeros(shape=score_shape , dtype=float)
    fuzzy = np.zeros(shape=score_shape , dtype=float)

    for fold in range(1 , FOLDS+1):

        test , train = data_split(data_set , fold)

        data_train , train_uids = data_generator(train , TRAIN_SIZE)
        data_test , test_uids = data_generator(test , TEST_SIZE)

        for col in range(CLUSTER_SIZE):
            n_clusters = CLUSTERS[col]

            # Crisp K-Means first, then Fuzzy C-Means, on the same split.
            model , averages = crip_k_means(data_train , train_uids , n_clusters)
            truth , guess = predictions(data_test , test_uids , model , averages)
            crisp[fold][col] = mae_calculation(truth , guess)

            model , averages = fuzzy_c_means(data_train , train_uids , n_clusters)
            truth , guess = predictions(data_test , test_uids , model , averages)
            fuzzy[fold][col] = mae_calculation(truth , guess)

    return crisp , fuzzy

## <font color='cyan'>Create Tables</font>

In [97]:
def _scores_table(scores , col_heads , row_heads):
    """Turn a ``(FOLDS+1, n)`` score array (row 0 unused) into a labelled table with an average row."""
    fold_scores = np.asarray(scores, dtype=float)[1:FOLDS+1]
    average = fold_scores.sum(axis=0) / FOLDS
    return pd.DataFrame(
        np.vstack([fold_scores, average]),
        index=row_heads,
        columns=col_heads,
    )


def table_generator():
    """Run the evaluation and format the MAE scores as two DataFrames.

    Returns
    -------
    (crisp_table, fuzzy_table) : tuple of pandas.DataFrame
        One table per clustering method. Columns are ``K=<n>`` (crisp) or
        ``C=<n>`` (fuzzy) for every entry of ``CLUSTERS``; rows are
        ``Fold 1`` .. ``Fold <FOLDS>`` followed by ``Average``, the mean of
        the fold rows.
    """
    crisp_col_heads = ["K=" + str(k) for k in CLUSTERS]
    fuzzy_col_heads = ["C=" + str(c) for c in CLUSTERS]

    row_heads = ["Fold " + str(f) for f in range(1 , FOLDS+1)]
    row_heads.append('Average')

    crisp , fuzzy = output_results()

    # Build each table in one shot: DataFrame.append no longer exists in
    # pandas 2.x, and the average is taken over FOLDS rather than a literal 5.
    crisp_table = _scores_table(crisp , crisp_col_heads , row_heads)
    fuzzy_table = _scores_table(fuzzy , fuzzy_col_heads , row_heads)

    return crisp_table , fuzzy_table

In [98]:
# Run the whole evaluation once and keep both MAE tables for the display cells below.
crisp_table , fuzzy_table = table_generator()

In [99]:
# CSS applied to the table caption only.
crisp_caption_style = {
    'selector': 'caption',
    'props': [('color', 'cyan'), ('font-size', '20px')],
}
crisp_styler = crisp_table.style.set_caption("Table: Crisp K-Means Clustering")
show_crisp_table = crisp_styler.set_table_styles([crisp_caption_style])
show_crisp_table

Unnamed: 0,K=5,K=10,K=20
Fold 1,0.814964,0.833601,0.864075
Fold 2,0.804468,0.822784,0.841602
Fold 3,0.814117,0.833988,0.850382
Fold 4,0.779515,0.797933,0.812925
Fold 5,0.790836,0.806714,0.831904
Average,0.80078,0.819004,0.840177


In [100]:
# CSS applied to the table caption only.
fuzzy_caption_style = {
    'selector': 'caption',
    'props': [('color', 'cyan'), ('font-size', '20px')],
}
fuzzy_styler = fuzzy_table.style.set_caption("Table: Fuzzy C-Means Clustering")
show_fuzzy_table = fuzzy_styler.set_table_styles([fuzzy_caption_style])
show_fuzzy_table

Unnamed: 0,C=5,C=10,C=20
Fold 1,0.797933,0.797933,0.797933
Fold 2,0.78409,0.78414,0.78419
Fold 3,0.801517,0.801563,0.801517
Fold 4,0.767736,0.767843,0.767896
Fold 5,0.768341,0.768341,0.768243
Average,0.783923,0.783964,0.783956
