In [None]:
# Imports and setup
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns

import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
%matplotlib inline  
plt.rcParams["figure.figsize"] = (10, 6) 

In [None]:
# Read the cleaned movies CSV into df1 and render the frame to confirm the load.
movies_csv = r"D:\Movies_Cleaned_Data.csv"
df1 = pd.read_csv(movies_csv)
display(df1)

In [None]:
# Summarise every column of df1 (include="all" covers non-numeric columns too).
# The row/column display limits are lifted only while the summary is rendered, so nothing is elided.
raw_summary = df1.describe(include="all")
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(raw_summary)

In [None]:
# Collect the rows flagged by DataFrame.duplicated() (full-row repeats) and render them.
is_repeat = df1.duplicated()
duplicateRowsDF = df1.loc[is_repeat]
display(duplicateRowsDF)


In [None]:
# Drop repeated rows (first occurrence is kept), renumber the index from 0, and show the result.
df1 = df1.drop_duplicates().reset_index(drop=True)
df1

In [None]:
# We start with several plots to understand the values.
# In this cell, we plot the number of movies for the most prolific directors.
TOP_DIRECTORS = 19  # number of bars the earlier positional slice [2379:2398] selected

# Movies per director, ascending, so the most prolific director is the right-most bar.
movies_per_director = df1.groupby(by="director_name")["Movie Name"].count().sort_values()

fig, ax = plt.subplots(figsize=(12,8))
# tail() takes the largest counts regardless of how many distinct directors the data holds;
# a hard-coded positional slice is only correct for one exact number of groups.
movies_per_director.tail(TOP_DIRECTORS).plot(kind="bar", fontsize=12, ax=ax)
plt.title("Director Name VS Movie name", fontsize=18)
plt.xlabel("Director Name", fontsize=14)
plt.ylabel("Amount of Movies", fontsize=14)
plt.show()

# As we can see below, Steven Spielberg is in the first place.

In [None]:
# A plot of the total gross for the first-billed actors with the highest gross sum
TOP_ACTORS = 20  # number of bars the earlier positional slice [2078:2098] selected

# Gross summed per first actor, ascending, so the highest total is the right-most bar.
gross_per_actor = df1.groupby(by="actor_1_name")["gross"].sum().sort_values()

fig, ax = plt.subplots(figsize=(12,8))
# tail() always picks the largest totals; the hard-coded positions depended on the exact number of actors.
gross_per_actor.tail(TOP_ACTORS).plot(kind="bar", fontsize=12, ax=ax)
plt.title("First Actor Name VS Gross", fontsize=18)
plt.xlabel("First Actor Name", fontsize=14)
plt.ylabel("Gross Sum", fontsize=14)
plt.show()


In [None]:
# Two plots: gross summed per country and gross summed per language (highest totals only)
TOP_COUNTRIES = 10  # number of bars the earlier positional slice [56:66] selected
TOP_LANGUAGES = 5   # number of bars the earlier positional slice [42:47] selected

# Create an empty figure and add the two panels explicitly. plt.subplots() would also create a
# full-figure Axes that the panels overlap and that recent Matplotlib no longer removes.
fig = plt.figure(figsize=(16,12))

# Top-left slot of a 2x2 grid
ax_country = fig.add_subplot(2, 2, 1)
gross_per_country = df1.groupby(by="country")["gross"].sum().sort_values()
gross_per_country.tail(TOP_COUNTRIES).plot(kind="bar", fontsize=12, ax=ax_country)
ax_country.set_title("Country VS Gross", fontsize=18)
ax_country.set_xlabel("Country", fontsize=14)
ax_country.set_ylabel("Gross Sum", fontsize=14)

# Top-right slot of the same grid
ax_language = fig.add_subplot(2, 2, 2)
gross_per_language = df1.groupby(by="language")["gross"].sum().sort_values()
gross_per_language.tail(TOP_LANGUAGES).plot(kind="bar", fontsize=12, ax=ax_language)
ax_language.set_title("Language VS Gross", fontsize=18)
ax_language.set_xlabel("Language", fontsize=14)
ax_language.set_ylabel("Gross Sum", fontsize=14)
plt.show()


In [None]:
# Scatter of each movie's budget (y) against its IMDB score (x), drawn with diamond markers.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(df1["imdb_score"], df1["budget"], marker="D")
# Plain tick labels instead of scientific notation
ax.ticklabel_format(style="plain")
ax.set_title("IMDB Score VS Movie's Budget", fontsize=18)
ax.set_xlabel("IMDB Score", fontsize=14)
ax.set_ylabel("Movie's Budget", fontsize=14)
plt.show()

In [None]:
# Fixing the missing values: "None" for any NaN in a categorical feature, 0.0 or the column mean for numeric features.
# np.int / np.float were removed in NumPy 1.24 (AttributeError), so explicit dtypes are used instead.
# np.int64 also keeps the integer width independent of the platform (the default integer is 32-bit on
# Windows with NumPy < 2, which would overflow for values above 2**31 - 1).
df1["director_name"] = df1["director_name"].fillna("None")

df1["actor_1_name"] = df1["actor_1_name"].fillna("None")

# Mean-filled columns are cast to integers afterwards, which truncates the fractional part of the mean.
df1["budget"] = df1["budget"].fillna(df1["budget"].mean()).astype(np.int64)
df1["gross"] = df1["gross"].fillna(0.0).astype(float)
df1["num_critic_for_reviews"] = df1["num_critic_for_reviews"].fillna(0.0).astype(float)
df1["num_user_for_reviews"] = df1["num_user_for_reviews"].fillna(0.0).astype(float)

df1["language"] = df1["language"].fillna("None")
df1["country"] = df1["country"].fillna("None")
df1["title_year"] = df1["title_year"].fillna(df1["title_year"].mean()).astype(np.int64)

# Describing df1 again to show the result
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df1.describe(include="all"))

In [None]:
# Year vs IMDB score, restricted to movies with at least 25000 user votes.
has_enough_votes = df1["num_voted_users"] >= 25000
major_movies = df1[has_enough_votes]

# pandas creates the figure; the returned Axes is labelled directly.
year_ax = major_movies.plot.scatter(x="title_year", y="imdb_score", figsize=(12, 8), alpha=0.4)
year_ax.set_title("Year VS IMDB Score", fontsize=18)
year_ax.set_xlabel("Year", fontsize=14)
year_ax.set_ylabel("IMDB Score", fontsize=14)
plt.show()


In [None]:
# IMDB score (x) against movie budget (y), one diamond marker per row of df1.
fig, ax = plt.subplots(figsize=(12,8))
scores_x = df1["imdb_score"]
budgets_y = df1["budget"]
ax.scatter(scores_x, budgets_y, marker="D")
ax.ticklabel_format(style="plain")  # plain numbers rather than scientific notation on the axes
ax.set_title("IMDB Score VS Movie's Budget", fontsize=18)
ax.set_xlabel("IMDB Score", fontsize=14)
ax.set_ylabel("Movie's Budget", fontsize=14)
plt.show()

In [None]:
# df3 holds the five columns used as model features in the cells below.
feature_columns = ["title_year", "budget", "gross", "cast_total_facebook_likes", "num_user_for_reviews"]
df3 = df1[feature_columns]

In [None]:
# Setting X as our features (the df3 columns) and y as our target (IMDB score).
# The integer cast truncates the score (e.g. 7.9 -> 7), turning it into a class label.
# np.int64 is used instead of the platform-dependent `int` (32-bit on Windows with NumPy < 2),
# so large budget / gross values cannot overflow.
y = np.array(df1["imdb_score"], dtype=np.int64)
print("Minimum value:", np.min(y))
print("Median value:", np.median(y))
print("Maximum value:", np.max(y))

# NOTE(review): the cast assumes df3 has no NaN left; cast_total_facebook_likes is not filled
# in the missing-value cell above - confirm it has no missing values.
X = np.array(df3.values, dtype=np.int64)

In [None]:
# First of all, we developed a k-NN classification model for the data:

# Imports and setup
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Split dataset into train (80%) and test (20%) data; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create k-NN classifier with n_neighbors=3.
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the data.
knn.fit(X_train, y_train)

# Check accuracy of our model on the test data.
print("Accuracy with k=3 is:", knn.score(X_test, y_test))

# Secondly, we checked which odd k from 1 to 15 (range(1, 17, 2) stops before 17) is the best
# choice of n_neighbors. Each k is fitted and predicted once; the accuracy and the error rate
# are both derived from that single prediction instead of refitting the same model in a second loop.
# NOTE(review): k is compared on the test split here; the grid-search cell uses cross-validation instead.
k_values = range(1, 17, 2)
error = []
for k in k_values:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    pred_k = knn_k.predict(X_test)
    # Fraction of correct test predictions (the same quantity classifier.score reports)
    acc = np.mean(pred_k == y_test)
    acc_tr = knn_k.score(X_train, y_train)
    print("knn (k={}) Accuracy: {}, train accuracy: {}".format(k, acc, acc_tr))
    # Thirdly, the error for this k: fraction of wrong test predictions.
    error.append(np.mean(pred_k != y_test))

# We used a plot to describe the error from above.
plt.figure(figsize=(10,6))
plt.title("Error Rate K Value", fontsize=18)  
plt.xlabel("K Value", fontsize=14)  
plt.ylabel("Mean Error", fontsize=14)
plt.plot(k_values, error, color="red", linestyle="dashed", marker="o", markerfacecolor="blue", markersize=10);

# As we can infer from the output below (text and plot), the best odd k value between 1 and 15 is 15.
# Its accuracy value is the highest and its error is the lowest.

In [None]:
# Cross-validated grid search (5 folds, accuracy) over the odd k values 1, 3, ..., 29.

# Candidate values for n_neighbors, keyed by the parameter name GridSearchCV should vary.
k_range = list(range(1, 31, 2))
param_grid = {"n_neighbors": k_range}

# The k=3 classifier from the previous cell serves as the base estimator.
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring="accuracy", cv=5, return_train_score=True)
grid.fit(X_train, y_train)

# Best parameter combination found by the search
print("The best parameter set found on development set is:", grid.best_params_)

# Mean cross-validated test score of every candidate, in the same order as cv_results_["params"]
means = grid.cv_results_["mean_test_score"]

print("\nGrid scores on development set:")
for position, params in enumerate(grid.cv_results_["params"]):
    print("mean:", means[position], "   parameters:", params)

In [None]:
# In the next 2 cells, we used decision tree classifier

# Imports and setup
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
import pydotplus
from IPython.display import Image

# Hacky solution of writing to files and reading again. necessary due to library bugs.
def renderTree(my_tree, features):
    """Render a fitted decision tree classifier as a PNG image inside the notebook.

    The tree is exported as Graphviz DOT text to ``temp.dot``, read back, converted to
    ``temp.png`` with pydotplus and displayed. Both files are written to the current
    working directory and overwritten on every call.

    Parameters
    ----------
    my_tree : fitted tree classifier accepted by ``tree.export_graphviz``
        Must expose ``classes_`` (set by ``fit``).
    features : sequence of str
        Feature names in the column order used for fitting, e.g. ``DataFrame.columns``.
    """
    # One label per class the tree actually learned, in the ascending order export_graphviz
    # expects. A fixed "Rating 1".."Rating 9" list mislabels nodes whenever a rating is absent
    # from the training data.
    class_names = ["Rating {}".format(label) for label in my_tree.classes_]

    filename = "temp.dot"
    with open(filename, 'w') as dot_file:
        tree.export_graphviz(my_tree,
                             out_file=dot_file,
                             # plain list: some scikit-learn releases reject a pandas Index here
                             feature_names=list(features),
                             class_names=class_names,
                             filled=True,
                             rounded=True,
                             special_characters=True)
    with open(filename, 'r') as dot_file:
        dot_data = dot_file.read()
    graph = pydotplus.graph_from_dot_data(dot_data)
    image_name = "temp.png"
    graph.write_png(image_name)
    display(Image(filename=image_name))


In [None]:
pip install graphviz

In [None]:
# Decision tree limited to depth 3, with at least 2 samples required in every leaf.
decisionTree = tree.DecisionTreeClassifier(min_samples_leaf=2, max_depth=3)

# 80/20 train/test split with a pinned random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

# Fit on the training rows only.
decisionTree = decisionTree.fit(X_train, y_train)

# Accuracy on the data the tree was trained on.
y_pred_train = decisionTree.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
print("The accuracy on training data is:", train_accuracy)

# Accuracy on the held-out rows.
y_pred = decisionTree.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("The accuracy on testing data is:", test_accuracy)

# Draw the fitted tree, labelling the split features with the df3 column names.
renderTree(decisionTree, df3.columns)

In [None]:
from flask import Flask,request,jsonify
from flask_cors import CORS
import recommendation

In [None]:
# In the next cells we use k-means to cluster the movies (two clusters in this cell).

# imports and setup 
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

# Storing all the df3 values into X
X = df3.values

# Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Training the model; random_state makes the single initialisation (n_init=1) reproducible.
y_pred = KMeans(n_clusters=2, n_init=1, max_iter=600, random_state=10).fit_predict(X_scaled)

# Look the plotted columns up by name. df3 has 5 columns (positions 0-4), so the former
# X[:, 5] raised IndexError.
# NOTE(review): the old axis comment named director_facebook_likes, which is not in df3;
# cast_total_facebook_likes is used as the closest available column - confirm.
likes_col = df3.columns.get_loc("cast_total_facebook_likes")
gross_col = df3.columns.get_loc("gross")

# View in a scatter, coloured by cluster
plt.figure(figsize=(12, 8))
plt.scatter(X[:, likes_col], X[:, gross_col], c=y_pred, marker="o", s=50)
plt.xlabel("cast_total_facebook_likes", fontsize=14)
plt.ylabel("gross", fontsize=14);

# X axis = cast_total_facebook_likes
# Y axis = movie's gross
print(X.shape)

In [None]:
# Silhouette analysis: cluster X with k-means for k = 2..6 and compare the average silhouette scores.
# NOTE(review): this clusters the raw X, not the X_scaled computed in the previous cell - confirm that is intended.

# imports and setup 
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

# Cluster counts to evaluate
range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:

    # One figure per k: silhouette bands on the left (ax1), clustered points on the right (ax2)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

    # The x axis is limited to [-0.1, 1]; the y axis reserves 10 blank rows before, between and
    # after the per-cluster bands so they are visibly separated.
    ax1.set_xlim(-0.1, 1)
    ax1.set_ylim(0, len(X) + (n_clusters + 1) * 10)

    # Fixed random_state so the clustering is reproducible
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Average silhouette over all samples
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters, "The silhouette_score is :", silhouette_avg)

    # Per-sample silhouette values, drawn below as one sorted band per cluster
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for cluster_id in range(n_clusters):
        # Sorted silhouette values of the samples assigned to this cluster
        band = np.sort(sample_silhouette_values[cluster_labels == cluster_id])
        band_height = band.shape[0]
        y_upper = y_lower + band_height

        band_color = cm.nipy_spectral(float(cluster_id) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, band,
                          facecolor=band_color, edgecolor=band_color, alpha=0.7)

        # Cluster number at the vertical middle of its band
        ax1.text(-0.05, y_lower + 0.5 * band_height, str(cluster_id))

        # Next band starts 10 rows above this one
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Dashed red line at the average silhouette value
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    # No y ticks; fixed x ticks
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: the first two columns of X, coloured by assigned cluster
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k")

    # Cluster centres: a white disc with the cluster number drawn on top
    centers = clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker="o", c="white", alpha=1, s=200, edgecolor="k")
    for center_id, center in enumerate(centers):
        ax2.scatter(center[0], center[1], marker='$%d$' % center_id, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters), fontsize=14, fontweight="bold")
    plt.show()


# As we can see, the best silhouette score is 0.71 for k=2, meaning, 2 different clusters. 

In [None]:
# Elbow search: fit k-means for k = 1..9 (range(1, 10) stops before 10) and record, for each k,
# the negated KMeans.score on X, plotted below as the total intra-cluster distance.
ks = range(1, 10)
scores = [-KMeans(n_clusters=k).fit(X).score(X) for k in ks]

# Line plot of the recorded value against k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(ks, scores)
ax.set_ylabel("Total intra-cluster distance", fontsize=14)
ax.set_xlabel("Value of K", fontsize=14)
plt.show()

# As we can see, the total intra-cluster distance is large for k=1 and decreases as we increase the k value, until k=2,
# after which it tapers off and gets only marginally smaller. This indicates that k=2 is a good choice.


In [None]:
pip install -U flask-cors

In [None]:
pip install recommendation

In [None]:
# Small Flask service exposing the movie recommender; CORS is enabled for every route.
app = Flask(__name__)
CORS(app) 
        
@app.route('/movie', methods=['GET'])
def recommend_movies():
    """Return the recommendations for the movie named by the ``title`` query parameter.

    Responds with the JSON-encoded result of ``recommendation.results(title)``, or with
    HTTP 400 and an error message when ``title`` is missing or empty.
    """
    title = request.args.get('title')
    if not title:
        # request.args.get returns None for an absent parameter; reject instead of passing None on.
        return jsonify({"error": "missing required query parameter 'title'"}), 400
    res = recommendation.results(title)
    return jsonify(res)

if __name__=='__main__':
    # debug=True would start the Werkzeug reloader, which tries to re-execute the process and
    # fails inside a notebook kernel, and would expose the interactive debugger. Both stay off.
    # This call blocks the cell until the server is stopped.
    app.run(port = 5000, debug = False, use_reloader = False)