Imports

In [7]:
# from Data import Data
# from AlgorithmRunner import AlgorithmRunner
from sklearn import metrics

Original file path

In [8]:
    file_name = './movie_metadata.csv'

Initialize classes

In [9]:
from sklearn import preprocessing, compose, model_selection
from pandas import read_csv, DataFrame, Series
from numpy import ravel, savetxt
from scipy import sparse


# Reads the csv file and save data to memory
class Data:
    def __init__(self, file_name):
        self.file = file_name  # path to file
        self.X = DataFrame  # Data without Labels
        self.y = DataFrame  # Labels

    def preprocess(self):

        # import csv
        data = read_csv(self.file, delimiter=',')

        # save all Attributes excluding content_Rating, movie_imdb_link, plot_keywords
        data.drop(columns=['content_rating', 'movie_imdb_link', 'plot_keywords'], inplace=True)

        # discard entries with any NaN value
        data.dropna(inplace=True)

        #  Handle duplicate movie_tile values
        data.drop_duplicates(subset='movie_title', keep='first', inplace=True)

        # As movie title is now unique we can discard it
        data.drop(columns=['movie_title'], inplace=True)

        # saves imdb score as labels
        self.y = data['imdb_score']

        # Discard label from data
        data.drop(columns=['imdb_score'], axis=1, inplace=True)
#         print(data.head())

        genres = data.pop('genres').str.get_dummies()

    #######
        actor1 = data['actor_1_name']
        actor2 = data['actor_2_name']
        actor3 = data['actor_3_name']
        actors = actor1.str.cat(actor2, sep="|")
        actors = actors.str.cat(actor3, sep="|")

        print(actors.head())
        print("1",genres.head())

        actors = actors.str.get_dummies('|')
        print("2", actors.head())

        data.drop(columns=['actor_1_name','actor_2_name','actor_3_name'], axis=1, inplace=True)
#####
        numerical_columns = data.select_dtypes(include='number').columns
        categorical_columns = data.select_dtypes(exclude='number').columns
#         print(actors.head())
        data = data.join(genres)
        data = data.join(actors) ###### 
#         data.to_csv("features.csv")
        preprocessor = compose.ColumnTransformer(transformers=
                                                 [('num', preprocessing.StandardScaler(), numerical_columns),
                                                  ('cat', preprocessing.OneHotEncoder(), categorical_columns)],
                                                 remainder='passthrough')

        self.X = preprocessor.fit_transform(data)

        self.y = preprocessing.Binarizer(6.95).fit_transform(self.y.to_numpy().reshape(-1, 1))
        self.y = ravel(self.y)
        # savetxt('labels.csv', self.y)
    @staticmethod
    def splitToFiveFolds():
        return model_selection.KFold(n_splits=5, shuffle=False, random_state=1)



In [10]:
from sklearn import neighbors


class AlgorithmRunner:
    def __init__(self, classifier_method):
        self.algorithm = self.select_model(classifier_method)

    @staticmethod
    def select_model(classifier_method):
        if classifier_method == 'KNN':
            return neighbors.KNeighborsClassifier(n_neighbors=10)
        elif classifier_method == 'Rocchio':
            return neighbors.NearestCentroid()
        else:
            print("Error. Expects 'KNN' or 'Rocchio' only.")

    def fit(self, X_train, y_train):
        self.algorithm.fit(X_train, y_train)

    def predict(self, X_test):
        return self.algorithm.predict(X_test)


In [11]:
    data = Data(file_name)
    %timeit data.preprocess()
    kf = data.splitToFiveFolds()
    classifierKNN = AlgorithmRunner('KNN')
    classifierRocchio = AlgorithmRunner('Rocchio')

0           CCH Pounder|Joel David Moore|Wes Studi
1         Johnny Depp|Orlando Bloom|Jack Davenport
2    Christoph Waltz|Rory Kinnear|Stephanie Sigman
3    Tom Hardy|Christian Bale|Joseph Gordon-Levitt
5        Daryl Sabara|Samantha Morton|Polly Walker
Name: actor_1_name, dtype: object
1    Action  Adventure  Animation  Biography  Comedy  Crime  Documentary  Drama  \
0       1          1          0          0       0      0            0      0   
1       1          1          0          0       0      0            0      0   
2       1          1          0          0       0      0            0      0   
3       1          0          0          0       0      0            0      0   
5       1          1          0          0       0      0            0      0   

   Family  Fantasy  ...  Horror  Music  Musical  Mystery  Romance  Sci-Fi  \
0       0        1  ...       0      0        0        0        0       1   
1       0        1  ...       0      0        0        0        0   

KeyboardInterrupt: 

Initialize metric containers

In [None]:
    fold = 0
    KNN_precision = 0
    KNN_recall = 0
    KNN_accuracy = 0
    Rocchio_precision = 0
    Rocchio_recall = 0
    Rocchio_accuracy = 0

Run both classifiers 5 times

In [None]:
for train_index, test_index in kf.split(data.X):
        fold = fold + 1
        X_train, X_test = data.X[train_index], data.X[test_index]
        y_train, y_test = data.y[train_index], data.y[test_index]
        classifierKNN.fit(X_train, y_train)
        predictedKNN = classifierKNN.predict(X_test)
        classifierRocchio.fit(X_train, y_train)
        predictedRocchio = classifierRocchio.predict(X_test)
        expected = y_test
        # print("-------------   Fold", fold, "  --------------------")
        # print("Classification report for KNN classifier \n%s\n"
        #       % (metrics.classification_report(expected, predictedKNN)))
        #
        # print("Classification report for Rocchio classifier \n%s\n"
        #       % (metrics.classification_report(expected, predictedRocchio)))

        KNN_accuracy += metrics.accuracy_score(expected, predictedKNN)
        Rocchio_accuracy += metrics.accuracy_score(expected, predictedRocchio)

        KNN_precision += metrics.precision_score(expected, predictedKNN, average='binary')
        Rocchio_precision += metrics.precision_score(expected, predictedRocchio, average='binary')

        KNN_recall += metrics.recall_score(expected, predictedKNN, average='binary')
        Rocchio_recall += metrics.recall_score(expected, predictedRocchio, average='binary')        

Results

In [6]:
    print("Question 1:")
    print("KNN classifier {:.5f}, {:.5f}, {:.5f}".format(KNN_precision/5, KNN_recall/5, KNN_accuracy/5))
    print("Rocchio classifier: {:.5f}, {:.5f}, {:.5f}".format(Rocchio_precision/5, Rocchio_recall/5, Rocchio_accuracy/5))

Question 1:


NameError: name 'KNN_precision' is not defined

In [None]:
    print("Question 1:")
    print("KNN classifier", KNN_precision/5, KNN_recall/5, KNN_accuracy/5)
    print("Rocchio classifier:",Rocchio_precision/5, Rocchio_recall/5, Rocchio_accuracy/5)

Question 1:
KNN classifier 0.8202479887559327 0.45451910227798004 0.7700614416852576
Rocchio classifier: 0.6985244985751605 0.5778635901771093 0.7241107413231905

Question 1: columns
KNN classifier 0.8188338914943196 0.454316246132952 0.7697900742420363
Rocchio classifier: 0.6973463948052689 0.5787219592758218 0.7238404710529203