In [None]:
# Movie recommendation ML model (For Netflix, prime etc)

# 1. First we build the model and feed it sample data based on the existing users.
# 2. Our model will learn the patterns in our data so that we can ask it to make predictions
# 3. When a new user signs up, we ask our model: "Hey, we have a new user with this profile. What kind of movies will this user be interested in?"
# 4. Our model might then recommend "Action movies", "Documentaries", etc.

# The process to be followed:
# 1. Import the data
# 2. Clean the data
# 3. Split the data into training/ test sets
# 4. Create a model
# 5. Train the model
# 6. Make predictions
# 7. Evaluate and Improve


# A common rule of thumb for measuring accuracy: hold out 20% of the dataset for testing and use the remaining 80% for training.

In [36]:
#1. Import the data

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# The train_test_split will help us split the dataset into two sets, one for training and one for testing

# Fixed seed so the train/test split, the fitted tree and the reported accuracy
# are the same on every run (both steps below are randomised otherwise).
RANDOM_SEED = 42

movies_data = pd.read_csv("movies.csv")
X = movies_data.drop(columns = ['genre'])  # X is the input dataset: every column except 'genre'
y = movies_data['genre']  # y is the output dataset: the genre we want to predict

# train_test_split returns a tuple of four datasets, which we unpack here.
# test_size=0.2 is the keyword argument that allocates 20% of the rows to the test set;
# random_state pins the shuffle so the same rows land in each set every time.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# DecisionTreeClassifier is the scikit-learn algorithm that makes the predictions.
model = DecisionTreeClassifier(random_state=RANDOM_SEED)
model.fit(x_train.values, y_train.values)  # fit takes two datasets: the inputs and the expected outputs

# predict takes a two-dimensional array: one row of input values per sample.
predictions = model.predict(x_test.values)
score = accuracy_score(y_test, predictions)
score

# To get the accuracy of the model , we will compare the predictions given by the model with the actual values of y_test


1.0

In [42]:
# Model Persistence
# 1.As soon as we train our model, we save it to a file.
# 2.Next time we want to make predictions, we simply load the model from the file and ask it to make predictions
# 3. We can do that because the model is already trained, so we do not have to re-train it again.
# 4. The file acts like a snapshot of a skilled professional -- once they have mastered a task, you can call on them anytime to perform it without
# needing to retrain or reteach them every time.

#1. Import the data

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib # The package has methods for saving and loading models

# Load the persisted model and ask it for a prediction.
# Nothing else in this notebook writes the model file, so it is created here on the
# first run; every later run skips training and only loads the saved model.
from pathlib import Path  # local import: only this cell needs it

MODEL_PATH = Path('movie-recommender.joblib')

if not MODEL_PATH.exists():
    movies_data = pd.read_csv("movies.csv")
    X = movies_data.drop(columns = ['genre'])  # X is the input dataset: every column except 'genre'
    y = movies_data['genre']  # y is the output dataset

    model = DecisionTreeClassifier()
    model.fit(X.values, y.values)  # fit takes two datasets: the inputs and the expected outputs
    joblib.dump(model, MODEL_PATH)  # save the trained model to disk

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
model = joblib.load(MODEL_PATH)
predictions = model.predict([[29,0]])  # a two-dimensional array holding a single sample
predictions


# What is Joblib?
# 1. To put it simply, joblib is a library for saving and loading Python objects; the .joblib file it writes is a binary file.
# 2. It saves the model in an efficient binary format (optionally compressed), which makes it faster to store and load, especially for larger models
# or datasets.
# 3. It is great because it is fast and handles large models well.

# predictions = model.predict(x_test.values)# This takes a two dimensional array, 21 years and 1 -male





array(['Documentary'], dtype=object)

In [43]:
# Visualising decision trees

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Load the dataset and separate the label column from the inputs.
movies_data = pd.read_csv("movies.csv")
y = movies_data['genre']  # the label we predict
X = movies_data.drop(columns=['genre'])  # every column except the label

# Fit on the full dataset: this cell only visualises the tree, nothing is held out.
model = DecisionTreeClassifier()
model.fit(X.values, y.values)

# Write the fitted tree to a Graphviz .dot file.
tree.export_graphviz(
    model,
    out_file='movies-recommender.dot',
    feature_names=['age', 'gender'],
    class_names=sorted(y.unique()),
    label='all',
    rounded=True,
    filled=True,
)

# 1. feature_names:
# It tells the exporter what each input column represents. In our case, we have two inputs - 'age' and 'gender'.
# By passing these as feature_names, we ensure the decision tree uses these labels when showing how it splits and makes decisions.

# 2.class_names :
# class_names tells the exporter the name of each possible prediction. In our case the model predicts movie genres, like 'Action', 'Sci-Fi', 'Drama', etc.
# By passing these as class_names, we can see these labels in the decision tree, showing which genre the model predicts at each step.

# filled - fills the node with color
# rounded - nodes to have rounded corners
# label - to label all the nodes in the decision trees.