# Machine Learning 
## Final project

 Team members:
+ Oscar Ochoa
+ Victor Manuel Romo
+ Luis Cabello
+ Ana Sepúlveda
+ Enrique Villar

### First ML Model

In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as skl;
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report, plot_confusion_matrix


%matplotlib inline

import os, sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Import library from my src folder    
from src.helpers import identify_highly_correlated_features, one_hot, identify_low_variance_features, identify_missing_data, feature_importance_plot
from src.learning_curve_plot import learning_curve_plot

In [76]:
songs_data = pd.read_pickle("../data/songs_data_final.pkl")

In [77]:
target = songs_data["genre"]
features = songs_data.drop(["genre"], axis = 1)

In [78]:
X_train, X_test, y_train_dt, y_test_dt = train_test_split(features, target, test_size = 0.2, random_state = 8888)

In [79]:
clf_tree = DecisionTreeClassifier()
clf_tree

In [80]:
clf_tree.fit(X_train, y_train_dt)

ValueError: Input X contains infinity or a value too large for dtype('float32').

In [None]:
from sklearn import tree

fig = plt.figure(figsize=(25 , 20))
_ = tree.plot_tree(clf_tree, 
                   feature_names = list(X_train.columns),  
                   max_depth = 3,
                   filled = True)

In [None]:
y_pred_dt = clf_tree.predict(X_test)
y_pred_dt

In [None]:
print("first five predicted values:", y_pred_dt[0:5])
print("first five actual values:", list(y_test_dt[0:5]))

In [None]:
print(y_pred_dt.shape)
print(y_test_dt.shape)


In [None]:
print("accuracy:", round(accuracy_score(y_test_dt, y_pred_dt), 2))
print("recall:" , round(skl.metrics.recall_score(y_test_dt, y_pred_dt, average = 'weighted'), 2))
print("precision:", round(precision_score(y_test_dt, y_pred_dt, average = 'weighted'), 2))
print("f1-score:", round(f1_score(y_test_dt, y_pred_dt,  average = 'weighted'), 2))


In [None]:
# plot confusion matrix

plot_confusion_matrix(clf_tree, X_test, y_test_dt, cmap = plt.cm.Blues);

In [None]:
print(classification_report(y_test_dt, y_pred_dt))

In [None]:
feat_importances = pd.Series(clf_tree.feature_importances_,
                            index = X_train.columns)

feat_importances

In [None]:
feature_importance_plot(clf_tree, X_train, n=5)

# Decision Tree 1.2

In [None]:
songs_data_Top10 = list[songs_data["genre"], songs_data["tempo"], songs_data["speechiness"], songs_data["danceability"], songs_data["energy"], songs_data["valence"]]

In [None]:
target = songs_data["genre"]
features = songs_data.drop(["genre"], axis = 1)

In [None]:
X_train, X_test, y_train_dt, y_test_dt = train_test_split(features, target, test_size = 0.2, random_state = 8888)

In [None]:
clf_tree = DecisionTreeClassifier()
clf_tree

In [None]:
clf_tree.fit(X_train, y_train_dt)

In [None]:
from sklearn import tree

fig = plt.figure(figsize=(25 , 20))
_ = tree.plot_tree(clf_tree, 
                   feature_names = list(X_train.columns),  
                   max_depth = 3,
                   filled = True)

In [None]:
y_pred_dt = clf_tree.predict(X_test)
y_pred_dt

In [None]:
print("first five predicted values:", y_pred_dt[0:5])
print("first five actual values:", list(y_test_dt[0:5]))

In [None]:
# plot confusion matrix

plot_confusion_matrix(clf_tree, X_test, y_test_dt, cmap = plt.cm.Blues);

In [None]:
print(classification_report(y_test_dt, y_pred_dt))

In [None]:
learning_curve_plot(clf_tree, X_train, y_train_dt, scoring = 'r2')