# MongoDB to Pandas Test

## Imports

In [3]:
import requests
import json
import pymongo
import time
import pandas as pd
import numpy as np
from pymongo import MongoClient

### Connect to MongoDB and create a Pandas DataFrame

In [24]:
conn = MongoClient("mongodb://localhost:27017/")
db = conn['gym-music-database']
content_col = db['test_songlist']
cursor = content_col.find({})
df =  pd.DataFrame(list(cursor))

### The data frame

In [27]:
df.tail()

Unnamed: 0,_id,track,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,song_type
0,5e63fc9905d930e697c5f069,Jackie Chan - Keanu Silva Remix,4DFPLQ1A5VX3XJ2EdX4RzU,0.00211,0.885,155324,0.869,8e-06,6,0.0325,-4.939,1,0.0552,126.005,4,0.702,1
1,5e63fc9905d930e697c5f06a,Piece Of Your Heart - Alok Remix,6iW38RGqdDGOofmz2HeXLW,0.0312,0.797,166452,0.86,0.0137,10,0.334,-4.38,0,0.0413,124.033,4,0.192,1
2,5e63fc9905d930e697c5f06b,Heat - Project 98 Remix,0q8XroUPVTaBIkHpyfnyRj,0.0781,0.704,173500,0.822,0.167,0,0.146,-7.12,1,0.0604,129.993,4,0.539,1
3,5e63fc9905d930e697c5f06c,"No Service In The Hills (feat. Trippie Redd, B...",0CmoisdB4maBgLV5MajnL4,0.00445,0.542,153764,0.739,0.0,7,0.306,-4.857,1,0.0395,156.074,4,0.338,1
4,5e63fc9905d930e697c5f06d,All Of My Life - Tigerlily Remix,7DYDXX5MLKwk5HuyMHNEGR,0.00218,0.802,230420,0.893,0.415,8,0.0514,-4.467,1,0.0482,125.011,4,0.397,1


#### Set list of features to learn

In [31]:
feature_cols = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'mode', 'speechiness', 'tempo', 'time_signature']


#### Create feature matrix 'x' 

In [32]:
X = df.loc[:, feature_cols]

In [33]:
X.shape

(6427, 11)

#### Create response vector (what I want to predict)


In [34]:
y = df.song_type

In [35]:
y.shape

(6427,)

## Create Sci-Kit Learn Prediction Model (Logistic Regression)

In [37]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear')
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

### Create a test matrix for the model to predict if gym or not

In [38]:
test_col = db['prediction_test_songs']
cursor2 = test_col.find({})
test =  pd.DataFrame(list(cursor2))

In [40]:
test.head()

Unnamed: 0,_id,track,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,5e692811ab4feaca56abdc47,Kids,1jJci4qxiYcOHhQR247rEU,0.00076,0.451,302840,0.931,0.0049,9,0.361,-3.871,1,0.0719,122.961,4,0.172
1,5e692811ab4feaca56abdc48,T-Shirt Weather,1uNH7kknB8MVkmh6FfDb6W,0.00126,0.415,194288,0.95,3.9e-05,4,0.133,-4.676,1,0.0621,160.017,4,0.552
2,5e692811ab4feaca56abdc49,Backseat Freestyle,1BR5vhAlaoUiijQ28p6jlN,0.000739,0.546,212653,0.651,0.0,1,0.235,-7.601,1,0.3,77.878,4,0.646
3,5e692811ab4feaca56abdc4a,The Less I Know The Better,4g3Ax56IslQkI6XVfYKVc5,0.0138,0.64,216319,0.755,0.0208,1,0.12,-4.077,0,0.0287,116.883,4,0.744
4,5e692811ab4feaca56abdc4b,Kush on the Yacht,6ESJ3geFCgreZCZeSnaLOS,0.0246,0.778,220453,0.653,0.0,9,0.139,-5.749,1,0.0703,124.981,4,0.345


In [41]:
X_test = test.loc[:, feature_cols]

In [42]:
X_test.shape

(100, 11)

### Predict the songs as new_pred_class

In [43]:
new_pred_class = lr.predict(X_test)

In [44]:
new_pred_class

array(['1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1',
       '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0',
       '1', '1', '1', '1', '1', '0', '1', '1', '0', '1', '1', '1', '1',
       '1', '0', '0', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1',
       '1', '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1',
       '1', '1', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '0', '1', '1', '1', '1', '1', '0', '1'], dtype=object)

In [48]:
pd.set_option('display.max_rows', 500)
pd.DataFrame({'Track Name': test.track, 'Prediction': new_pred_class})

Unnamed: 0,Track Name,Prediction
0,Kids,1
1,T-Shirt Weather,1
2,Backseat Freestyle,1
3,The Less I Know The Better,1
4,Kush on the Yacht,1
5,Santorini Greece,0
6,God's Plan,1
7,LAMBORGHINI TRUCK (ATLANTA SHIT),1
8,Grief,1
9,Hive (feat. Vince Staples & Casey Veggies),1
