Install dependencies

In [None]:
!pip install pandas
!pip install numpy
!pip install jupyter-dash
!pip install matplotlib
!pip install psutil
!pip install sklearn
!pip install seaborn

Import requirements

In [None]:
import pandas as pd
import os.path
from datetime import date
import seaborn as sns
# sns.set() resets the palette to its default, so a separate
# set_palette("muted") call placed before it has no lasting effect.
# Passing the palette here applies style, palette and color codes together.
sns.set(style="ticks", palette="muted", color_codes=True)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

Suppress all Python warnings (this silences every library, not only seaborn):

In [None]:
import warnings
# Silence every warning from every library for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn/sklearn
# -- confirm that a global filter (rather than a targeted one) is intended.
warnings.filterwarnings("ignore")

Define ANSI escape codes for colored and styled text output:

In [None]:
class color:
    """ANSI escape sequences for colouring and styling printed text.

    Concatenate an attribute before the text and ``END`` after it, e.g.
    ``color.BOLD + "title" + color.END``.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset: turns every colour/attribute above back off
    END = '\033[0m'

# Import chart data

Load the track features (joined to the chart data on `track_id` further down)

In [None]:
# Track-level feature table; merged with the chart rows on 'track_id' further down.
featureData = pd.read_csv('data/track_features.csv')

Function to import data by region group

In [None]:
def load_charts_for_codes(codes):
    """Load and stack the chart CSVs for every country code in ``codes``.

    For each value in ``codes["code"]`` the file ``data/chart_<code>.csv``
    (code lower-cased) is read if it exists; codes without a file are skipped.

    Args:
        codes: DataFrame (or mapping) with a ``"code"`` entry holding
            country-code strings.

    Returns:
        A DataFrame containing the rows of all chart files found, in the
        order of ``codes``, each keeping its own row index. An empty
        DataFrame if no chart file exists for any code.
    """
    chart_frames = []
    for code in codes["code"]:
        chart_path = "data/chart_{}.csv".format(code.lower())
        if os.path.isfile(chart_path):
            chart_frames.append(pd.read_csv(chart_path))

    # pd.concat raises on an empty list, so keep returning an empty frame.
    if not chart_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(chart_frames)

Load regional chart data

In [None]:
# Country-code list for each continent, in the order the charts are loaded.
regionCodeFiles = {
    "Europe": "data/EuropeCode.csv",
    "Asia": "data/AsiaCode.csv",
    "NorthAmerica": "data/NorthAmericaCode.csv",
    "SouthAmerica": "data/SouthAmericaCode.csv",
    "Africa": "data/AfricaCode.csv",
    "Oceania": "data/OcianiaCode.csv",
}

# Read each code list and stack the chart files of its countries.
regionCharts = {}
for regionName, codeFile in regionCodeFiles.items():
    codes = pd.read_csv(codeFile)
    regionCharts[regionName] = load_charts_for_codes(codes)

# Expose one frame per continent under the names the later cells use.
chartDataEurope = regionCharts["Europe"]
chartDataAsia = regionCharts["Asia"]
chartDataNorthAmerica = regionCharts["NorthAmerica"]
chartDataSouthAmerica = regionCharts["SouthAmerica"]
chartDataAfrica = regionCharts["Africa"]
chartDataOceania = regionCharts["Oceania"]

# Merging

## Prepare data per continent

Function to join chart tracks and track features

In [None]:
def join_chart_features_continents(chart, features=None):
    """Attach track features to chart rows and average them per date.

    Args:
        chart: DataFrame of chart rows with at least ``track_id`` and
            ``date`` columns.
        features: DataFrame of track features with a ``track_id`` column.
            Defaults to the notebook-level ``featureData`` frame.

    Returns:
        A DataFrame with one row per ``date`` (kept as a regular column)
        holding the mean of every numeric column of the merged data.
        Non-numeric columns are left out.
    """
    if features is None:
        features = featureData

    # Inner join: chart rows whose track has no feature entry are dropped.
    merged = chart.merge(features, on='track_id')

    # numeric_only=True is required on pandas >= 2.0, where averaging a frame
    # that still contains string columns raises instead of skipping them.
    return merged.groupby('date', as_index=False).mean(numeric_only=True)

Join features and tracks per region and continent

In [None]:
# Per-date mean frame for each continent, built with the same join helper.
# (No Africa frame is built here even though chartDataAfrica is loaded above.)
(
    meanEurope,
    meanAsia,
    meanNorthAmerica,
    meanSouthAmerica,
    meanOceania,
) = map(
    join_chart_features_continents,
    (
        chartDataEurope,
        chartDataAsia,
        chartDataNorthAmerica,
        chartDataSouthAmerica,
        chartDataOceania,
    ),
)

Function to label a frame with its region name and drop the `Position` column

In [None]:
def join_region_data(data, region):
    """Return a copy of ``data`` tagged with ``region`` and without ``Position``.

    Args:
        data: DataFrame containing a ``Position`` column.
        region: Value written to every row of the new ``region`` column.

    Returns:
        A new DataFrame with a ``region`` column added and ``Position``
        removed. ``data`` itself is left unmodified. The ``date`` column is
        kept because later cells filter on it.

    Raises:
        KeyError: If ``data`` has no ``Position`` column.
    """
    # Copy first: assigning the column on the argument itself would silently
    # add a "region" column to the caller's frame.
    tagged = data.copy()
    tagged["region"] = region
    return tagged.drop("Position", axis=1)

In [None]:
# Region label -> per-date mean frame, in the order the rows are stacked.
regionMeans = {
    "Europe": meanEurope,
    "Asia": meanAsia,
    "North America": meanNorthAmerica,
    "South America": meanSouthAmerica,
    "Oceania": meanOceania,
}

# Tag every frame with its region label, then stack them into one table.
frames = [
    join_region_data(regionMean, regionLabel)
    for regionLabel, regionMean in regionMeans.items()
]
regionDf = pd.concat(frames)

print(regionDf)

In [None]:
# Seed for the row shuffle so the row order is the same on every run.
SHUFFLE_SEED = 42

# Encode each region name as an integer code, in order of first appearance.
uniqueRegions = regionDf['region'].unique()
regionsDictionary = {regionName: regionCode for regionCode, regionName in enumerate(uniqueRegions)}

# Work on a copy: a plain assignment would alias regionDf, so the region
# mapping and the column drop below would rewrite regionDf as well.
modelDataDf = regionDf.copy()
modelDataDf['region'] = modelDataDf['region'].map(regionsDictionary).astype(int)

# Remove any leftover "Unnamed: N" index columns.
modelDataDf = modelDataDf.drop(columns=modelDataDf.filter(regex="Unname").columns)

# Keep only rows dated after 2020-08-30, then drop the date column.
modelDataDf = modelDataDf[pd.to_datetime(modelDataDf['date']) > pd.Timestamp(2020, 8, 30)]
modelDataDf = modelDataDf.drop("date", axis=1)

# Shuffle the rows (frac=1 keeps every row).
modelDataDf = modelDataDf.sample(frac=1, random_state=SHUFFLE_SEED)

print(modelDataDf)

Train linear regression model

In [None]:
# Seed for the train/test split. The split is positional, so the scores are
# only fully reproducible when the row order of modelDataDf is itself fixed.
SPLIT_SEED = 42

# Features: every column except the target; target: the Streams column.
xi = modelDataDf.drop(labels = ['Streams'], axis = 1)
yi = modelDataDf['Streams']

# Hold out 40% of the rows for evaluation.
xiTrain, xiTest, yiTrain, yiTest = train_test_split(xi, yi, test_size = 0.40, random_state = SPLIT_SEED)

modeliLinear = LinearRegression()
modeliLinear.fit(xiTrain, yiTrain)
modeliLinearPrediction = modeliLinear.predict(xiTest)

print('Linear Regression:')
# NOTE(review): this mean is taken over all of regionDf (every date), whereas the
# predictions cover only the date-filtered test rows -- confirm that is intended.
print('Original data mean  = \t' + str(regionDf['Streams'].mean()))
print('Integer model mean = \t' + str(modeliLinearPrediction.mean()))
print('Integer model std = \t' + str(modeliLinearPrediction.std()))
print('R^2 Score = \t' + str(r2_score(yiTest, modeliLinearPrediction)))

Plot linear regression graph

In [None]:
# Predicted vs. actual streams for the held-out rows, with a fitted trend line.
plt.figure(figsize=(12,12))
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional data
# arguments, and 0.11 already warned about them.
sns.regplot(x=modeliLinearPrediction, y=yiTest, label = 'Numeric', marker = '+')
plt.title('Linear Regression')
plt.ylabel('Streams')
plt.xlabel('Predictions')
plt.legend()
plt.show()

In [None]:
# modeliLinearPrediction = modeliLinear.predict(xiTest)