# Summary of This Notebook

In this notebook, our main goal is to prepare our trainning and testing datasets by combining all the the information from Last.fm, Seatgeek and City.Data. In order to keep trainning and testing separated, we didn't do any normalization here. Nonetheless, we unified some string format and drop all the NAs.

# 1. Connect to Database

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import MySQLdb as mdb
import pandas as pd
%matplotlib inline

In [2]:
con = mdb.connect(host = 'your IP', 
                  user = 'root',
                  database = 'My_Project',
                  passwd = '*****', 
                  charset='utf8', use_unicode=True);

# 1.1 Import data from seatgeek_artist

In [3]:
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute('''SELECT * FROM seatgeek_artists 
            ORDER BY popularity DESC 
            ''')
artists = cur.fetchall()
cur.close()

artists = pd.DataFrame.from_records(artists)
print(artists.shape)
artists.head()

(5925, 3)


Unnamed: 0,artist,genres,popularity
0,taylor-swift,country/pop/rock/folk/,0.89
1,charli-xcx,pop/rock/,0.88
2,ed-sheeran,pop/rock/folk/,0.88
3,snow-patrol,pop/rock/alternative/classic-rock/,0.88
4,eric-clapton,pop/rock/blues/classic-rock/,0.87


In [4]:
artists.columns = ["artist","genres", "concert_popularity"]
print(artists.shape)
artists.head()

(5925, 3)


Unnamed: 0,artist,genres,concert_popularity
0,taylor-swift,country/pop/rock/folk/,0.89
1,charli-xcx,pop/rock/,0.88
2,ed-sheeran,pop/rock/folk/,0.88
3,snow-patrol,pop/rock/alternative/classic-rock/,0.88
4,eric-clapton,pop/rock/blues/classic-rock/,0.87


# 1.2 Import data from lastfm_cleaned 

Since the formatting of artists' names in Last.fm is different from the formatting of Seatgeek, we unified the format by getting the lowercase of names and replace space with "-".

In [5]:
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute('''SELECT * FROM lastfm
            
            ''')
lastfm = cur.fetchall()
cur.close()

In [6]:
lastfm = pd.DataFrame.from_records(lastfm).drop(columns = ["image_url"])

In [7]:
print(lastfm.shape)
lastfm.head()

(10563, 2)


Unnamed: 0,artist,playcount
0,!!!,9821243
1,"""Weird Al"" Yankovic",16580264
2,$uicideboy$,13202244
3,'Til Tuesday,832931
4,(G)I-DLE,893650


In [8]:
lastfm["artist"] = lastfm["artist"].apply(lambda x: x.replace(" ", "-"))
lastfm["artist"] = lastfm["artist"].apply(lambda x: x.lower())
artist_name_list = lastfm["artist"]
artist_name_list[:5]

0                    !!!
1    "weird-al"-yankovic
2            $uicideboy$
3           'til-tuesday
4               (g)i-dle
Name: artist, dtype: object

In [9]:
lastfm.sort_values(by = "playcount",ascending=False).head(10)

Unnamed: 0,artist,playcount
8896,the-beatles,505428022
7438,radiohead,490269805
1937,coldplay,354878559
6477,muse,338136080
594,arctic-monkeys,323595920
7219,pink-floyd,306576636
7531,red-hot-chili-peppers,288558091
5445,linkin-park,288314785
5197,lady-gaga,273827943
6134,metallica,273444318


In [10]:
lastfm.sort_values(by = "playcount",ascending=True).head(10)

Unnamed: 0,artist,playcount
5900,mark-souzek,3190
5379,life-below-elephants,3350
9320,the-sandbox,3363
10135,will-butler-vs-the-knocks,3614
4187,international-special,3679
5430,limestone-quarry,3866
7552,réka-ioescu,3875
8584,subtrailss,3889
10396,zephyros,3953
8759,taz-&-meeks,4212


# 1.3 Import data from seatgeek_concerts

In [11]:
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute('''SELECT * FROM seatgeek_concerts
            
            ''')
concerts = cur.fetchall()
cur.close()

In [12]:
concerts = pd.DataFrame.from_records(concerts)
concerts.head()

Unnamed: 0,address,artist,average_price,concert_date,good_deals,highest_price,latitude,longitude,lowest_price,update_time,venue
0,"1151 North High Street, Columbus, OH 43201",10-years,,2018-11-28,,,40,-83,,2018-11-06 04:39:04,skully's-music-diner
1,"1151 North High Street, Columbus, OH 43201",10-years,,2018-12-08,,,36,-84,,2018-11-06 04:39:04,skully's-music-diner
2,"1151 North High Street, Columbus, OH 43201",10-years,,2018-12-15,,,30,-86,,2018-11-06 04:39:04,skully's-music-diner
3,"1151 North High Street, Columbus, OH 43201",10-years,179.0,2018-12-28,,584.0,40,-83,114.0,2018-11-06 04:39:04,skully's-music-diner
4,"1151 North High Street, Columbus, OH 43201",10-years,124.0,2018-12-29,,584.0,41,-85,58.0,2018-11-06 04:39:04,skully's-music-diner


In [13]:
concerts["city"] = concerts['address'].str.split(',').str[1]
concerts["city"] = concerts["city"].apply(lambda x: x.replace(" ", ""))
print(concerts.shape)
concerts.head()

(16981, 12)


Unnamed: 0,address,artist,average_price,concert_date,good_deals,highest_price,latitude,longitude,lowest_price,update_time,venue,city
0,"1151 North High Street, Columbus, OH 43201",10-years,,2018-11-28,,,40,-83,,2018-11-06 04:39:04,skully's-music-diner,Columbus
1,"1151 North High Street, Columbus, OH 43201",10-years,,2018-12-08,,,36,-84,,2018-11-06 04:39:04,skully's-music-diner,Columbus
2,"1151 North High Street, Columbus, OH 43201",10-years,,2018-12-15,,,30,-86,,2018-11-06 04:39:04,skully's-music-diner,Columbus
3,"1151 North High Street, Columbus, OH 43201",10-years,179.0,2018-12-28,,584.0,40,-83,114.0,2018-11-06 04:39:04,skully's-music-diner,Columbus
4,"1151 North High Street, Columbus, OH 43201",10-years,124.0,2018-12-29,,584.0,41,-85,58.0,2018-11-06 04:39:04,skully's-music-diner,Columbus


In [14]:
concerts.drop(columns = ["update_time"],inplace = True)
print(concerts.shape)
total_concerts = concerts
total_concerts.head()

(16981, 11)


Unnamed: 0,address,artist,average_price,concert_date,good_deals,highest_price,latitude,longitude,lowest_price,venue,city
0,"1151 North High Street, Columbus, OH 43201",10-years,,2018-11-28,,,40,-83,,skully's-music-diner,Columbus
1,"1151 North High Street, Columbus, OH 43201",10-years,,2018-12-08,,,36,-84,,skully's-music-diner,Columbus
2,"1151 North High Street, Columbus, OH 43201",10-years,,2018-12-15,,,30,-86,,skully's-music-diner,Columbus
3,"1151 North High Street, Columbus, OH 43201",10-years,179.0,2018-12-28,,584.0,40,-83,114.0,skully's-music-diner,Columbus
4,"1151 North High Street, Columbus, OH 43201",10-years,124.0,2018-12-29,,584.0,41,-85,58.0,skully's-music-diner,Columbus


# 1.4 Import data from City

In [15]:
#Population Data
city = pd.read_csv("us_city_population.csv")
city["City"] = city['City'].str.split(',').str[0]
city["City"] = city['City'].str.split(' ').str[:-1]

def turn_string(row):
    city_string = ''.join(row["City"])
    return city_string
    

city["city"] = city.apply(lambda row: turn_string(row), axis=1)
city = city.drop(columns = ["City", "Index"])
print(city.shape)
city.head()

(769, 2)


Unnamed: 0,Population_Estimate_2017,city
0,8622698,NewYork
1,3999759,LosAngeles
2,2716450,Chicago
3,2312717,Houston
4,1626078,Phoenix


# 2. Aggregate data

# 2.1 Merge Last.fm Artists and Seatgeek Artists

In [16]:
lastfm_seatgeek = pd.merge(artists, lastfm, on='artist')
#lastfm_seatgeek = lastfm_seatgeek.drop(columns = ['artist_name'],axis = 1)

print(lastfm_seatgeek.shape)
lastfm_seatgeek.head()


(5924, 4)


Unnamed: 0,artist,genres,concert_popularity,playcount
0,taylor-swift,country/pop/rock/folk/,0.89,160372057
1,charli-xcx,pop/rock/,0.88,26468108
2,ed-sheeran,pop/rock/folk/,0.88,81588932
3,snow-patrol,pop/rock/alternative/classic-rock/,0.88,88180099
4,eric-clapton,pop/rock/blues/classic-rock/,0.87,50222194


# 2.2 Merge Artist Info and Seatgeek Concert Info

In [17]:
dataset = pd.merge(total_concerts, lastfm_seatgeek, on='artist')
dataset.sort_values("concert_popularity", ascending=False,inplace=True)
dataset = dataset.drop(columns = ["good_deals"])
print(dataset.shape)
dataset.head()

(15592, 13)


Unnamed: 0,address,artist,average_price,concert_date,highest_price,latitude,longitude,lowest_price,venue,city,genres,concert_popularity,playcount
12493,"2 Beasley Avenue, Auckland, New Zealand",taylor-swift,1491.0,2018-11-19,2103.0,36,140,968.0,mt-smart-stadium,Auckland,country/pop/rock/folk/,0.89,160372057
12492,"2 Beasley Avenue, Auckland, New Zealand",taylor-swift,656.0,2018-11-08,950.0,-37,175,185.0,mt-smart-stadium,Auckland,country/pop/rock/folk/,0.89,160372057
4001,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,196.0,2018-11-08,1327.0,28,-83,49.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932
11944,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,120.0,2018-11-01,608.0,30,-90,40.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099
11953,"Hoffnigstrasse 1, Dubendorf, Switzerland",snow-patrol,201.0,2019-04-25,584.0,34,-84,100.0,samsung-hall,Dubendorf,pop/rock/alternative/classic-rock/,0.88,88180099


# 2.3 Merge Concert Info and City Info

In [18]:
dataset = pd.merge(dataset, city, on='city')
print(dataset.shape)
dataset.head()

(12934, 14)


Unnamed: 0,address,artist,average_price,concert_date,highest_price,latitude,longitude,lowest_price,venue,city,genres,concert_popularity,playcount,Population_Estimate_2017
0,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,196.0,2018-11-08,1327.0,28,-83,49.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292
1,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,120.0,2018-11-01,608.0,30,-90,40.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099,393292
2,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,346.0,2018-11-04,1750.0,30,-95,113.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099,393292
3,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,119.0,2018-11-01,950.0,30,-90,36.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292
4,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,83510.0,2018-11-04,12148781.0,30,-95,139.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292


In [19]:
market_heat = dataset.groupby(by = "city", as_index = False)["artist"].count().reset_index()
market_heat = market_heat.drop(columns = "index")
market_heat.columns = ["city", "market_heat"]
print(market_heat.shape)
market_heat.head()

(296, 2)


Unnamed: 0,city,market_heat
0,Akron,28
1,Albany,57
2,Albuquerque,38
3,Alexandria,39
4,Allen,8


In [20]:
dataset = pd.merge(dataset, market_heat, on='city')
print(dataset.shape)
dataset.head()

(12934, 15)


Unnamed: 0,address,artist,average_price,concert_date,highest_price,latitude,longitude,lowest_price,venue,city,genres,concert_popularity,playcount,Population_Estimate_2017,market_heat
0,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,196.0,2018-11-08,1327.0,28,-83,49.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292,125
1,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,120.0,2018-11-01,608.0,30,-90,40.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099,393292,125
2,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,346.0,2018-11-04,1750.0,30,-95,113.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099,393292,125
3,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,119.0,2018-11-01,950.0,30,-90,36.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292,125
4,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,83510.0,2018-11-04,12148781.0,30,-95,139.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292,125


# 2.4 Drop NAs

In [21]:
dataset = dataset.dropna()
print(dataset.shape)
dataset.head()

(9594, 15)


Unnamed: 0,address,artist,average_price,concert_date,highest_price,latitude,longitude,lowest_price,venue,city,genres,concert_popularity,playcount,Population_Estimate_2017,market_heat
0,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,196.0,2018-11-08,1327.0,28,-83,49.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292,125
1,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,120.0,2018-11-01,608.0,30,-90,40.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099,393292,125
2,"1500 Sugar Bowl Drive, New Orleans, LA 70112",snow-patrol,346.0,2018-11-04,1750.0,30,-95,113.0,mercedes-benz-superdome,NewOrleans,pop/rock/alternative/classic-rock/,0.88,88180099,393292,125
3,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,119.0,2018-11-01,950.0,30,-90,36.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292,125
4,"1500 Sugar Bowl Drive, New Orleans, LA 70112",ed-sheeran,83510.0,2018-11-04,12148781.0,30,-95,139.0,mercedes-benz-superdome,NewOrleans,pop/rock/folk/,0.88,81588932,393292,125


# 3. Separate Data into Test and Train
Here we used train_test_split to split the dataset into two samples, namely 80% training and 20% testing.

In [22]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, shuffle = True)

In [23]:
train = train.dropna()
print(train.shape)
train.head()

(7675, 15)


Unnamed: 0,address,artist,average_price,concert_date,highest_price,latitude,longitude,lowest_price,venue,city,genres,concert_popularity,playcount,Population_Estimate_2017,market_heat
2481,"2135 Queens Chapel Road Northeast, Washington,...",snails,120.0,2018-12-09,584.0,45,-93,60.0,echostage,Washington,electronic/techno/,0.48,250992,693972,379
3029,"722 East Burnside Street, Portland, OR 97214",whitechapel,131.0,2018-12-13,584.0,30,-98,46.0,the-bossanova-ballroom,Portland,pop/rock/alternative/hard-rock/,0.44,11648404,66882,478
4775,"3790 Wilshire Blvd., Los Angeles, CA 90010",jorja-smith,96.0,2018-12-12,584.0,39,-77,50.0,the-wiltern,LosAngeles,soul/rnb/,0.65,2977461,3999759,596
10136,"6161 Delmar Blvd, St. Louis, MO 63112",julien-baker,101.0,2018-11-28,580.0,38,-122,51.0,the-pageant,St.Louis,rock/alternative/,0.48,3156749,308626,83
12335,"1875 Newport Blvd., Costa Mesa, CA 92627",slushii,21.0,2018-12-23,24.0,33,-117,18.0,time-nightclub,CostaMesa,electronic/techno/,0.46,524225,113825,13


In [24]:
train.describe()

Unnamed: 0,average_price,highest_price,latitude,longitude,lowest_price,playcount,Population_Estimate_2017,market_heat
count,7675.0,7675.0,7675.0,7675.0,7675.0,7675.0,7675.0,7675.0
mean,234.175505,4189.519,38.183062,-94.927948,95.963127,12423930.0,979085.8,212.547101
std,1089.807231,196914.6,5.310159,17.622408,158.951415,25708900.0,1613363.0,177.800802
min,13.0,13.0,10.0,-158.0,6.0,13851.0,47929.0,1.0
25%,108.0,562.0,34.0,-115.0,49.0,1191805.0,185038.0,58.0
50%,148.0,584.0,39.0,-90.0,66.0,3643560.0,486290.0,157.0
75%,232.0,629.0,42.0,-80.0,98.0,12305640.0,879170.0,350.0
max,83510.0,12250020.0,61.0,9.0,5913.0,273827900.0,8622698.0,596.0


In [25]:
test.describe()

Unnamed: 0,average_price,highest_price,latitude,longitude,lowest_price,playcount,Population_Estimate_2017,market_heat
count,1919.0,1919.0,1919.0,1919.0,1919.0,1919.0,1919.0,1919.0
mean,221.936946,1091.900469,38.083898,-94.517978,98.337676,11428330.0,942128.8,213.569046
std,486.94587,4609.240653,5.311245,17.453037,321.897903,24463980.0,1549252.0,177.89192
min,13.0,13.0,18.0,-158.0,9.0,29119.0,47929.0,1.0
25%,106.0,559.0,34.0,-115.0,48.0,1112298.0,194058.0,56.5
50%,147.0,584.0,39.0,-88.0,64.0,3187672.0,486290.0,157.0
75%,229.0,619.0,42.0,-80.0,97.0,10955870.0,724745.0,350.0
max,12314.0,179913.0,55.0,-67.0,12314.0,273827900.0,8622698.0,596.0


In [26]:
path = "train_final.csv"
train.to_csv(path_or_buf = path)

In [27]:
test = test.dropna()
print(test.shape)
test.head()

(1919, 15)


Unnamed: 0,address,artist,average_price,concert_date,highest_price,latitude,longitude,lowest_price,venue,city,genres,concert_popularity,playcount,Population_Estimate_2017,market_heat
2085,"287 Tampa Avenue South, Orlando, FL 32805",gryffin,471.0,2018-12-01,14413.0,34,-118,73.0,tinker-field,Orlando,electronic/techno/,0.57,1183814,280257,164
8875,"68 Commerce, Grand Rapids, MI 49503",mewithoutyou,108.0,2018-11-07,584.0,46,-123,44.0,the-pyramid-scheme,GrandRapids,pop/rock/alternative/,0.44,16658388,198829,76
11647,"125 West Jefferson Blvd, Fort Wayne, IN 46802",joe-bonamassa,257.0,2018-11-08,889.0,40,-83,106.0,embassy-theatre,FortWayne,rock/blues/jazz/classic-rock/,0.57,9520940,265904,24
12760,"104 North Commercial Street, Bellingham, WA 98225",john-hiatt,198.0,2019-02-24,755.0,40,-75,98.0,mount-baker-theatre,Bellingham,country/pop/rock/folk/,0.44,2992062,89045,7
5855,"3200 Commerce Street, Dallas, TX 75226",clozee,135.0,2018-12-29,584.0,54,-114,57.0,deep-ellum-art-co.,Dallas,electronic/techno/,0.49,387276,1341075,279


In [28]:
path = "test_final.csv"
test.to_csv(path_or_buf = path)