# Predicting the Geographic Origin of Music

### The size of the dataset  
The dataset contains 1059 rows, each representing a music track, and 70 columns. Of the 70 columns, 68 represent features of the music, and 2 represent the latitude and longitude of the track's geographical origin. 
       
The size of the dataset is 678KB

The memory footprint is 601KB


### The types of the data 
  
   - float with 6 digits of precision for all music features and 
   - float with 2 digits of precision for latitude and longitude.

### Read data from csv file ../data/default_features_1059_tracks.txt

In [28]:
# Load the raw track table. header = FALSE: the first line of the file is
# read as data, not as column names.
df <- utils::read.csv(
  file = "../data/default_features_1059_tracks.txt",
  header = FALSE
)

In [42]:
# Name the 68 feature columns "Feature 1" ... "Feature 68", followed by the
# two coordinate columns. paste() already joins its arguments with a single
# space, so no extra "" argument is passed: that would leave a trailing
# space on every feature name ("Feature 1 ").
colnames(df) <- c(paste("Feature", seq_len(68)), "Latitude", "Longitude")

head(df)

Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,⋯,Feature 61,Feature 62,Feature 63,Feature 64,Feature 65,Feature 66,Feature 67,Feature 68,Latitude,Longitude
7.161286,7.835325,2.911583,0.984049,-1.499546,-2.094097,0.576,-1.205671,1.849122,-0.425598,⋯,-1.504263,0.351267,-1.018726,-0.174878,-1.089543,-0.66884,-0.914772,-0.83625,-15.75,-47.95
0.225763,-0.094169,-0.603646,0.497745,0.874036,0.29028,-0.077659,-0.887385,0.432062,-0.093963,⋯,-0.495712,-0.465077,-0.157861,-0.157189,0.380951,1.088478,-0.123595,1.391141,14.91,-23.51
-0.692525,-0.517801,-0.788035,1.214351,-0.907214,0.880213,0.406899,-0.694895,-0.901869,-1.701574,⋯,-0.637167,0.14726,0.217914,2.718442,0.972919,2.081069,1.375763,1.063847,12.65,-8.0
-0.735562,-0.684055,2.058215,0.716328,-0.011393,0.805396,1.497982,0.114752,0.692847,0.052377,⋯,-0.178325,-0.065059,-0.724247,-1.020687,-0.75138,-0.385005,-0.012326,-0.392197,9.03,38.74
0.570272,0.273157,-0.279214,0.083456,1.049331,-0.869295,-0.265858,-0.401676,-0.872639,1.147483,⋯,-0.919463,-0.667912,-0.820172,-0.190488,0.306974,0.119658,0.271838,1.289783,34.03,-6.85
0.059217,0.034537,-0.703441,0.188652,-0.270353,1.420526,0.467766,0.209594,-0.496505,1.239908,⋯,0.501017,-0.354263,0.445184,0.060328,0.075293,-0.170387,-0.506187,-0.00847,12.65,-8.0



### Throw away missing data records

In [49]:
# Keep only the records that have no missing value in any column.
df <- stats::na.omit(object = df)

### Generate Countries from Latitude, Longitude so we can do a Classification by Countries

In [64]:
# Pull out the coordinate columns by name instead of by position (69:70),
# so the selection stays correct if columns are ever added or reordered.
# Column 1 is Latitude and column 2 is Longitude, as before.
location <- df[, c("Latitude", "Longitude")]
# Attach httr once here rather than on every call of getCountry().
library(httr)

#' Look up the country code for a coordinate via the GeoNames web service.
#'
#' @param lat Latitude of the point.
#' @param lon Longitude of the point.
#' @param username GeoNames account name sent with the request. Defaults to
#'   the account previously hard-coded in the URL.
#' @return The response body as a single string with every carriage return
#'   and line feed removed. When the service cannot resolve the point, the
#'   body is an error text such as "ERR:15:no country code found" rather
#'   than a country code; callers are expected to filter those out.
getCountry <- function(lat, lon, username = "sangeethathai") {
  address <- paste0(
    "http://api.geonames.org/countryCode?lat=", lat,
    "&lng=", lon,
    "&username=", username
  )
  # NOTE(review): `type = "JSON"` is handed to httr as request config, not as
  # a query parameter, and the body is read as plain text below -- confirm
  # whether this argument is needed at all.
  resp <- httr::GET(url = address, config = list(type = "JSON"))
  country <- httr::content(resp, "text", encoding = "UTF-8")
  # Strip all line terminators, not just the first occurrence of each.
  gsub("[\r\n]", "", country)
}

In [65]:
# Call getCountry() once per record, pairing each latitude (first column of
# `location`) with its longitude (second column).
Countries <- mapply(FUN = getCountry, location[[1]], location[[2]])



### Add Countries as a new column

In [68]:
# Store the looked-up values in a new "Country" column of the data frame.
df[["Country"]] <- Countries

In [70]:
# Preview the first six rows, now including the Country column.
head(x = df, n = 6L)

Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,⋯,Feature 62,Feature 63,Feature 64,Feature 65,Feature 66,Feature 67,Feature 68,Latitude,Longitude,Country
7.161286,7.835325,2.911583,0.984049,-1.499546,-2.094097,0.576,-1.205671,1.849122,-0.425598,⋯,0.351267,-1.018726,-0.174878,-1.089543,-0.66884,-0.914772,-0.83625,-15.75,-47.95,BR
0.225763,-0.094169,-0.603646,0.497745,0.874036,0.29028,-0.077659,-0.887385,0.432062,-0.093963,⋯,-0.465077,-0.157861,-0.157189,0.380951,1.088478,-0.123595,1.391141,14.91,-23.51,CV
-0.692525,-0.517801,-0.788035,1.214351,-0.907214,0.880213,0.406899,-0.694895,-0.901869,-1.701574,⋯,0.14726,0.217914,2.718442,0.972919,2.081069,1.375763,1.063847,12.65,-8.0,ML
-0.735562,-0.684055,2.058215,0.716328,-0.011393,0.805396,1.497982,0.114752,0.692847,0.052377,⋯,-0.065059,-0.724247,-1.020687,-0.75138,-0.385005,-0.012326,-0.392197,9.03,38.74,ET
0.570272,0.273157,-0.279214,0.083456,1.049331,-0.869295,-0.265858,-0.401676,-0.872639,1.147483,⋯,-0.667912,-0.820172,-0.190488,0.306974,0.119658,0.271838,1.289783,34.03,-6.85,ERR:15:no country code found
0.059217,0.034537,-0.703441,0.188652,-0.270353,1.420526,0.467766,0.209594,-0.496505,1.239908,⋯,-0.354263,0.445184,0.060328,0.075293,-0.170387,-0.506187,-0.00847,12.65,-8.0,ML


### Filter out records where Country code was not found

In [80]:
# Drop every record whose lookup returned an error text instead of a country
# code. Matching on the "ERR:" prefix removes "ERR:15:no country code found"
# as before, and also any other "ERR:..." response, which would otherwise
# survive as a bogus country label.
df <- df[!grepl("^ERR:", df$Country), ]

### Throw away Latitude/Longitude as we no longer need them 

In [83]:
# The coordinates were only needed to derive Country; keep every column
# except those two, preserving the original column order.
df <- df[, setdiff(names(df), c("Latitude", "Longitude")), drop = FALSE]

dim(df)
head(df)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,⋯,Feature 60,Feature 61,Feature 62,Feature 63,Feature 64,Feature 65,Feature 66,Feature 67,Feature 68,Country
1,7.161286,7.835325,2.911583,0.984049,-1.499546,-2.094097,0.576,-1.205671,1.849122,-0.425598,⋯,-0.04361,-1.504263,0.351267,-1.018726,-0.174878,-1.089543,-0.66884,-0.914772,-0.83625,BR
2,0.225763,-0.094169,-0.603646,0.497745,0.874036,0.29028,-0.077659,-0.887385,0.432062,-0.093963,⋯,-0.947933,-0.495712,-0.465077,-0.157861,-0.157189,0.380951,1.088478,-0.123595,1.391141,CV
3,-0.692525,-0.517801,-0.788035,1.214351,-0.907214,0.880213,0.406899,-0.694895,-0.901869,-1.701574,⋯,-0.556109,-0.637167,0.14726,0.217914,2.718442,0.972919,2.081069,1.375763,1.063847,ML
4,-0.735562,-0.684055,2.058215,0.716328,-0.011393,0.805396,1.497982,0.114752,0.692847,0.052377,⋯,0.166616,-0.178325,-0.065059,-0.724247,-1.020687,-0.75138,-0.385005,-0.012326,-0.392197,ET
6,0.059217,0.034537,-0.703441,0.188652,-0.270353,1.420526,0.467766,0.209594,-0.496505,1.239908,⋯,1.428818,0.501017,-0.354263,0.445184,0.060328,0.075293,-0.170387,-0.506187,-0.00847,ML
7,-0.280628,-0.310082,-0.756816,0.839033,-0.19667,1.153469,-0.473974,1.491603,-0.105328,1.365001,⋯,1.34057,0.473937,-0.433843,0.813016,1.101685,-0.936123,-0.280096,-0.995073,-0.872726,ML


### Summary data

In [82]:
# Per-column summary of the cleaned data frame.
summary(object = df)

   Feature 1          Feature 2          Feature 3          Feature 4      
 Min.   :-1.52946   Min.   :-1.47656   Min.   :-1.13367   Min.   :-3.2227  
 1st Qu.:-0.60651   1st Qu.:-0.62063   1st Qu.:-0.61414   1st Qu.:-0.6643  
 Median :-0.18448   Median :-0.21071   Median :-0.31077   Median :-0.1572  
 Mean   :-0.01958   Mean   :-0.02381   Mean   :-0.04888   Mean   :-0.0528  
 3rd Qu.: 0.30545   3rd Qu.: 0.30634   3rd Qu.: 0.15656   3rd Qu.: 0.4587  
 Max.   : 9.43950   Max.   :10.71930   Max.   : 6.15785   Max.   : 5.9269  
   Feature 5          Feature 6           Feature 7          Feature 8        
 Min.   :-3.38639   Min.   :-3.565809   Min.   :-4.29193   Min.   :-5.287408  
 1st Qu.:-0.60139   1st Qu.:-0.622794   1st Qu.:-0.60976   1st Qu.:-0.557304  
 Median : 0.15088   Median : 0.014802   Median : 0.05224   Median : 0.038592  
 Mean   : 0.01536   Mean   : 0.004931   Mean   :-0.01515   Mean   :-0.003774  
 3rd Qu.: 0.69349   3rd Qu.: 0.661854   3rd Qu.: 0.64781   3rd Qu.: 0.547