# Cleaning data

In [295]:
#Packages
import pandas as pd
import numpy as np
from datetime import datetime as dt

In [296]:
#Load csv fil og lav til panda dataframe
df_org = pd.read_csv("GoMore_Data_42443.csv", sep=',', low_memory=False)
df_org.head(4)
#Should be True
len(df_org) == 42443

True

In [297]:
#Without duplicates
df_drop = df_org[df_org.duplicated(['id'], keep=False)]
#If this length = 0 we have 0 duplicates on the id variable, as we expect.
len(df_drop)

0

In [298]:
#Make dummy variable of whether departure in Denmark or not
df_org['Denmark'] = (df_org['latitude_dep'] > 54.564167) & (df_org['latitude_dep'] < 57.748611) & (df_org['longitude_dep'] > 8.075000) & (df_org['longitude_dep'] < 12.790278).astype(int)
df_org.keys()

Index(['id', 'depart_at', 'seats', 'prefs', 'notes', 'free_seats', 'user_id',
       'handle_fee_rate', 'handle_fee_ceiling_factor', 'handle_fee_maximum',
       'accept_cash', 'accept_online_payment', 'quick_booking', 'flex_booking',
       'price', 'car_id', 'currency_id', 'distance', 'detour_preference',
       'overview_polyline', 'duration', 'waypoints_attributes', 'chat', 'kids',
       'music', 'animals', 'comfort', 'smoking', 'id_arr', 'latitude_arr',
       'longitude_arr', 'name_arr', 'id_dep', 'latitude_dep', 'longitude_dep',
       'name_dep', 'Denmark'],
      dtype='object')

The 40.000 observations is reduced to around 13.000 observations. We limit
the dataset to Denmark (and a bit of Sweden and Germany). We keep the observations where departure and arrival is in Denmark, measured by the longitude and latitude.

In [299]:
denmark_df = df_org[((df_org['latitude_dep'] > 54.564167) & (df_org['latitude_dep'] < 57.748611)) & ((df_org['longitude_dep'] > 8.075000) & (df_org['longitude_dep'] < 12.790278))] 
denmark_df = denmark_df[((denmark_df['latitude_dep'] > 54.564167) & (denmark_df['latitude_dep'] < 57.748611)) & ((denmark_df['longitude_dep'] > 8.075000) & (denmark_df['longitude_dep'] < 12.790278)) & ((denmark_df['latitude_arr'] > 54.564167) & (denmark_df['latitude_arr'] < 57.748611)) & ((denmark_df['longitude_arr'] > 8.075000) & (denmark_df['longitude_arr'] < 12.790278))]
df = denmark_df
#Should be True
len(df) == 13446

True

In [300]:
#Reset index
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,depart_at,seats,prefs,notes,free_seats,user_id,handle_fee_rate,handle_fee_ceiling_factor,handle_fee_maximum,...,smoking,id_arr,latitude_arr,longitude_arr,name_arr,id_dep,latitude_dep,longitude_dep,name_dep,Denmark
0,7385002,2018-08-07T15:30:00+02:00,2,"{'chat': '0', 'kids': '1', 'music': '1', 'anim...",Opsamling Rødovre station. Bilen er lille. Ikk...,0,269889,12.5,100,,...,0,25834238,56.025167,9.896857,Smukfest,25834237,55.673514,12.465726,Rødovre,True
1,7385006,2018-08-07T14:00:00+02:00,3,"{'chat': '0', 'kids': '0', 'music': '0', 'anim...",,3,1399528,12.5,100,,...,0,25829883,55.676097,12.568337,København,25829881,56.460584,10.036539,Randers,True
2,7385007,2018-08-09T08:30:00+02:00,1,"{'chat': '0', 'kids': '0', 'music': '0', 'anim...",Afgang fra Vesterbro Brandstation. Tidspunkt k...,0,1438456,12.5,100,,...,0,25829865,55.844405,9.237394,Give,25829864,55.657333,12.538118,Enghavevej 168,True
3,7385010,2018-08-07T06:45:00+02:00,3,"{'chat': '0', 'kids': '1', 'music': '1', 'anim...",,3,906186,12.5,100,,...,0,25829874,56.959168,8.703492,Thisted,25829871,56.48493,8.589933,Struer,True
4,7385011,2018-08-10T16:15:00+02:00,2,"{'chat': '0', 'kids': '1', 'music': '1', 'anim...",,1,2318925,12.5,100,,...,0,25829876,55.713441,9.528237,Nyboesgade,25829875,56.188259,10.184893,Olof Palmes Allé,True


In [301]:
#Drop the variables 'prefs', 'notes' 'overview_polyline' and 'waypoint_Attributes', since thay are already
#unpackes, or consists of private informations.
print(df.keys())
df = df.drop(['prefs', 'notes', 'overview_polyline', 'waypoints_attributes'], 1)
print(df.keys())

Index(['id', 'depart_at', 'seats', 'prefs', 'notes', 'free_seats', 'user_id',
       'handle_fee_rate', 'handle_fee_ceiling_factor', 'handle_fee_maximum',
       'accept_cash', 'accept_online_payment', 'quick_booking', 'flex_booking',
       'price', 'car_id', 'currency_id', 'distance', 'detour_preference',
       'overview_polyline', 'duration', 'waypoints_attributes', 'chat', 'kids',
       'music', 'animals', 'comfort', 'smoking', 'id_arr', 'latitude_arr',
       'longitude_arr', 'name_arr', 'id_dep', 'latitude_dep', 'longitude_dep',
       'name_dep', 'Denmark'],
      dtype='object')
Index(['id', 'depart_at', 'seats', 'free_seats', 'user_id', 'handle_fee_rate',
       'handle_fee_ceiling_factor', 'handle_fee_maximum', 'accept_cash',
       'accept_online_payment', 'quick_booking', 'flex_booking', 'price',
       'car_id', 'currency_id', 'distance', 'detour_preference', 'duration',
       'chat', 'kids', 'music', 'animals', 'comfort', 'smoking', 'id_arr',
       'latitude_arr', 'lo

In [302]:
#Rename datetime column
df = df.rename(columns={'depart_at': 'depart_date_time'})
df.head(3)

Unnamed: 0,id,depart_date_time,seats,free_seats,user_id,handle_fee_rate,handle_fee_ceiling_factor,handle_fee_maximum,accept_cash,accept_online_payment,...,smoking,id_arr,latitude_arr,longitude_arr,name_arr,id_dep,latitude_dep,longitude_dep,name_dep,Denmark
0,7385002,2018-08-07T15:30:00+02:00,2,0,269889,12.5,100,,False,True,...,0,25834238,56.025167,9.896857,Smukfest,25834237,55.673514,12.465726,Rødovre,True
1,7385006,2018-08-07T14:00:00+02:00,3,3,1399528,12.5,100,,False,True,...,0,25829883,55.676097,12.568337,København,25829881,56.460584,10.036539,Randers,True
2,7385007,2018-08-09T08:30:00+02:00,1,0,1438456,12.5,100,,False,True,...,0,25829865,55.844405,9.237394,Give,25829864,55.657333,12.538118,Enghavevej 168,True


In [303]:
#Creating the variable 'time' with the correct timezone
time = []
for i in range(0, len(df),1):
    time_ = dt.strptime(df['depart_date_time'].iloc[i][:19], "%Y-%m-%dT%H:%M:%S").time()
    time.append(time_)

df['time'] = time
df.keys()

Index(['id', 'depart_date_time', 'seats', 'free_seats', 'user_id',
       'handle_fee_rate', 'handle_fee_ceiling_factor', 'handle_fee_maximum',
       'accept_cash', 'accept_online_payment', 'quick_booking', 'flex_booking',
       'price', 'car_id', 'currency_id', 'distance', 'detour_preference',
       'duration', 'chat', 'kids', 'music', 'animals', 'comfort', 'smoking',
       'id_arr', 'latitude_arr', 'longitude_arr', 'name_arr', 'id_dep',
       'latitude_dep', 'longitude_dep', 'name_dep', 'Denmark', 'time'],
      dtype='object')

In [304]:
#creating variables date and weekday, month and year
df['date']= pd.to_datetime(pd.Series(df['depart_date_time']).apply(str)).dt.date
df['weekday']= pd.to_datetime(pd.Series(df['depart_date_time']).apply(str)).dt.dayofweek

df['year'] = [str(i)[:4] for i in df['date']]
df['year'] = [int(float(i)) for i in df['year']]
df['year']
df['month'] = [str(i)[5:7] for i in df['date']]
df['month'] = [int(float(i)) for i in df['month']]
df['month']
df.keys()

Index(['id', 'depart_date_time', 'seats', 'free_seats', 'user_id',
       'handle_fee_rate', 'handle_fee_ceiling_factor', 'handle_fee_maximum',
       'accept_cash', 'accept_online_payment', 'quick_booking', 'flex_booking',
       'price', 'car_id', 'currency_id', 'distance', 'detour_preference',
       'duration', 'chat', 'kids', 'music', 'animals', 'comfort', 'smoking',
       'id_arr', 'latitude_arr', 'longitude_arr', 'name_arr', 'id_dep',
       'latitude_dep', 'longitude_dep', 'name_dep', 'Denmark', 'time', 'date',
       'weekday', 'year', 'month'],
      dtype='object')

In [305]:
#Make dummy variable telling if the departure is from Sjælland (1) or not (0)
df['SJ_dep'] = (df['latitude_dep'] > 54.564167) & (df['latitude_dep'] < 57.748611) & (df['longitude_dep'] > 10.958935) & (df['longitude_dep'] < 12.790278)

#Make dummy variable of whether the arrival is at Sjælland (1) or not (0)
df['SJ_arr'] = (df['latitude_arr'] > 54.564167) & (df['latitude_arr'] < 57.748611) & (df['longitude_arr'] > 10.958935) & (df['longitude_arr'] < 12.790278)


#Function taking in True og False, returning 1 or 0
def to_bool(s):
    return 1 if s == True else 0

#Converts the boolean for SJ_dep from True/False to 1/0
SJ_dep_int = []
for i in range(0,len(df),1):
    SJ_dep_int.append(to_bool(df.iloc[i]['SJ_dep']))
    
df['SJ_dep_int'] = SJ_dep_int

#Converts the boolean for SJ_arr from True/False to 1/0
SJ_arr_int = []
for i in range(0,len(df),1):
    SJ_arr_int.append(to_bool(df.iloc[i]['SJ_arr']))
    
df['SJ_arr_int'] = SJ_arr_int

df.head()

Unnamed: 0,id,depart_date_time,seats,free_seats,user_id,handle_fee_rate,handle_fee_ceiling_factor,handle_fee_maximum,accept_cash,accept_online_payment,...,Denmark,time,date,weekday,year,month,SJ_dep,SJ_arr,SJ_dep_int,SJ_arr_int
0,7385002,2018-08-07T15:30:00+02:00,2,0,269889,12.5,100,,False,True,...,True,15:30:00,2018-08-07,1,2018,8,True,False,1,0
1,7385006,2018-08-07T14:00:00+02:00,3,3,1399528,12.5,100,,False,True,...,True,14:00:00,2018-08-07,1,2018,8,False,True,0,1
2,7385007,2018-08-09T08:30:00+02:00,1,0,1438456,12.5,100,,False,True,...,True,08:30:00,2018-08-09,3,2018,8,True,False,1,0
3,7385010,2018-08-07T06:45:00+02:00,3,3,906186,12.5,100,,False,True,...,True,06:45:00,2018-08-07,1,2018,8,False,False,0,0
4,7385011,2018-08-10T16:15:00+02:00,2,1,2318925,12.5,100,,False,True,...,True,16:15:00,2018-08-10,4,2018,8,False,False,0,0


In [306]:
# accept_cash to boolean with integer
accept_cash = []
for i in range(0,len(df),1):
    accept_cash.append(to_bool(df.iloc[i]['accept_cash']))
    
df['accept_cash_int'] = accept_cash


In [307]:
#accept online payment to boolean with integer
accept_online_payment = []
for i in range(0,len(df),1):
    accept_online_payment.append(to_bool(df.iloc[i]['accept_online_payment']))
    
df['accept_online_payment_int'] = accept_online_payment

In [308]:
#quick_booking
quick_booking = []
for i in range(0,len(df),1):
    quick_booking.append(to_bool(df.iloc[i]['quick_booking']))
    
df['quick_booking_int'] = quick_booking

In [309]:
#flex_booking
flex_booking = []
for i in range(0,len(df),1):
    flex_booking.append(to_bool(df.iloc[i]['flex_booking']))
    
df['flex_booking_int'] = flex_booking

In [310]:
#Make the categorical variable, to dummies, which can be used i the model
dummies = pd.get_dummies(df.detour_preference)
df['det_15_minutes'] = dummies.iloc[:,0]
df['det_5_minutes'] = dummies.iloc[:,1]
df['det_flexible'] = dummies.iloc[:,2]
df['det_none'] = dummies.iloc[:,3]
df.head()

Unnamed: 0,id,depart_date_time,seats,free_seats,user_id,handle_fee_rate,handle_fee_ceiling_factor,handle_fee_maximum,accept_cash,accept_online_payment,...,SJ_dep_int,SJ_arr_int,accept_cash_int,accept_online_payment_int,quick_booking_int,flex_booking_int,det_15_minutes,det_5_minutes,det_flexible,det_none
0,7385002,2018-08-07T15:30:00+02:00,2,0,269889,12.5,100,,False,True,...,1,0,0,1,1,1,0,1,0,0
1,7385006,2018-08-07T14:00:00+02:00,3,3,1399528,12.5,100,,False,True,...,0,1,0,1,1,1,0,0,1,0
2,7385007,2018-08-09T08:30:00+02:00,1,0,1438456,12.5,100,,False,True,...,1,0,0,1,1,1,0,1,0,0
3,7385010,2018-08-07T06:45:00+02:00,3,3,906186,12.5,100,,False,True,...,0,0,0,1,1,1,0,1,0,0
4,7385011,2018-08-10T16:15:00+02:00,2,1,2318925,12.5,100,,False,True,...,0,0,0,1,0,1,0,1,0,0


In [311]:
#Convert price to DKK
df['price_DKK'] = df['price']/100
#Comvert distance to KM
df['distance_KM'] = df['distance']/1000
#New variable - Price/distance
df['DKK_per_KM'] = df['price_DKK']*1/df['distance_KM']*1
#New variable - taken_seats
df['taken_seats'] = df['seats'] - df['free_seats']
df.head()

Unnamed: 0,id,depart_date_time,seats,free_seats,user_id,handle_fee_rate,handle_fee_ceiling_factor,handle_fee_maximum,accept_cash,accept_online_payment,...,quick_booking_int,flex_booking_int,det_15_minutes,det_5_minutes,det_flexible,det_none,price_DKK,distance_KM,DKK_per_KM,taken_seats
0,7385002,2018-08-07T15:30:00+02:00,2,0,269889,12.5,100,,False,True,...,1,1,0,1,0,0,185.0,271.775,0.68071,2
1,7385006,2018-08-07T14:00:00+02:00,3,3,1399528,12.5,100,,False,True,...,1,1,0,0,1,0,180.0,343.083,0.524654,0
2,7385007,2018-08-09T08:30:00+02:00,1,0,1438456,12.5,100,,False,True,...,1,1,0,1,0,0,150.0,268.189,0.559307,1
3,7385010,2018-08-07T06:45:00+02:00,3,3,906186,12.5,100,,False,True,...,1,1,0,1,0,0,35.0,73.159,0.47841,0
4,7385011,2018-08-10T16:15:00+02:00,2,1,2318925,12.5,100,,False,True,...,0,1,0,1,0,0,55.0,74.306,0.740182,1


In [312]:
#Creating a dummy variable, telling if the tour crosses Storebælt (1) (toll_road) or not (0)
df['toll_road'] = (df['SJ_dep_int']) + (df['SJ_arr_int'])

toll_road = []
for i in range(0, len(df)):
    if (df['toll_road'][i]) == 1:
        toll_road.append(1)
    else:
        toll_road.append(0)

df['toll_road_bool'] = toll_road
df.head(10)

Unnamed: 0,id,depart_date_time,seats,free_seats,user_id,handle_fee_rate,handle_fee_ceiling_factor,handle_fee_maximum,accept_cash,accept_online_payment,...,det_15_minutes,det_5_minutes,det_flexible,det_none,price_DKK,distance_KM,DKK_per_KM,taken_seats,toll_road,toll_road_bool
0,7385002,2018-08-07T15:30:00+02:00,2,0,269889,12.5,100,,False,True,...,0,1,0,0,185.0,271.775,0.68071,2,1,1
1,7385006,2018-08-07T14:00:00+02:00,3,3,1399528,12.5,100,,False,True,...,0,0,1,0,180.0,343.083,0.524654,0,1,1
2,7385007,2018-08-09T08:30:00+02:00,1,0,1438456,12.5,100,,False,True,...,0,1,0,0,150.0,268.189,0.559307,1,1,1
3,7385010,2018-08-07T06:45:00+02:00,3,3,906186,12.5,100,,False,True,...,0,1,0,0,35.0,73.159,0.47841,0,0,0
4,7385011,2018-08-10T16:15:00+02:00,2,1,2318925,12.5,100,,False,True,...,0,1,0,0,55.0,74.306,0.740182,1,0,0
5,7385035,2018-08-06T20:30:00+02:00,3,3,734681,12.5,100,,False,True,...,0,0,1,0,80.0,172.158,0.464689,0,0,0
6,7385039,2018-08-06T19:45:00+02:00,3,3,2068395,12.5,100,,False,True,...,1,0,0,0,80.0,66.984,1.194315,0,0,0
7,7385040,2018-08-07T09:15:00+02:00,3,3,2115457,12.5,100,,False,True,...,0,1,0,0,50.0,94.245,0.530532,0,2,0
8,7385041,2018-08-08T09:15:00+02:00,3,3,2115457,12.5,100,,False,True,...,0,1,0,0,50.0,94.245,0.530532,0,2,0
9,7385042,2018-08-09T09:15:00+02:00,3,3,2115457,12.5,100,,False,True,...,0,1,0,0,50.0,94.245,0.530532,0,2,0


In [313]:
#New variable - making time into a catgorical variable
df['time_hour'] = pd.to_datetime(pd.Series(df['time']).apply(str)).dt.hour
df['time_minute'] = pd.to_datetime(pd.Series(df['time']).apply(str)).dt.minute
df['time_minute_hour'] = df['time_hour']*1 + df['time_minute']/60

#Creating new variable. Split the time into categorical values
df['time_cat'] = pd.cut(df.time_minute_hour, [0,6,9,12,15,18,21,24] , 
                        labels=["Night", "Morning", "Forenoon", "Noon", "Afternoon","Evening", "Late_evening"], 
                        include_lowest=True)

#Creating dummies, so the categorical variables can be included in the model
dummies = pd.get_dummies(df.time_cat)
df = pd.concat([df, dummies], axis=1)


In [314]:
#Remove outliers with unrealistic high or unrealistic low distance (removing 6 observations)

#See the outliers here:
print(df\
    .groupby('distance_KM')\
    ['id']\
    .count())

#Define the outliers:
outlier = (df['distance'] > 1500000) | (df['distance'] ==0.0)
print(df[outlier]['name_dep'])
print(df[outlier]['name_arr'])

#Drop outliers
df = df.drop(df[outlier == True].index)

#Should be true
len(df) == 13440

distance_KM
0.000        4
4.941        1
5.817        1
6.336        1
7.493        1
8.314        1
8.335        2
8.972       45
9.324        1
10.346       1
10.409       1
10.592       1
11.565       2
11.881       1
12.241       1
12.361       7
12.687       6
13.690       1
13.890       1
14.255       1
15.076       1
15.660       7
15.666       1
15.981       1
16.220       1
17.378       1
17.860       1
18.035       1
18.045       1
18.302       3
            ..
481.881      1
482.416      1
483.776      1
486.512      1
491.393      1
491.949      1
493.539      1
494.282      1
494.794      1
504.639      1
505.197      1
508.013      1
512.422      1
521.592      1
529.942      1
536.813      1
536.891      1
541.234      1
611.000      1
615.715      1
616.554      1
617.714      1
661.827      1
661.898      1
692.025      1
779.928      1
850.099      1
924.285      1
1989.218     1
2375.380     1
Name: id, Length: 6830, dtype: int64
3980                    Skibby
4322 

True

In [315]:
#Checking handle_fee_ceiling_factor - only 7 observations does not have the value 100
df_handle = df['handle_fee_ceiling_factor']!=100
df[df_handle]

print(df\
    .groupby('handle_fee_ceiling_factor')\
    ['id']\
    .count())

#Checking handle_fee_rate - only 7 observations does not have the value 12.5 ( the same 7 observations as before)
df_handle_fee = df['handle_fee_rate']!=12.5
df[df_handle_fee] == df[df_handle]

print(df\
    .groupby('handle_fee_rate')\
    ['id']\
    .count())


handle_fee_ceiling_factor
10         7
100    13433
Name: id, dtype: int64
handle_fee_rate
0.0         4
10.0        3
12.5    13433
Name: id, dtype: int64


**Variables to be removed and why:**

In [316]:
#Handle_fee_maximum consists of 13441 missing observations, so we delete this variable
print(df\
    .groupby('handle_fee_maximum')\
    ['id']\
    .count())

handle_fee_maximum
100.0    3
Name: id, dtype: int64


In [317]:
df = df.drop(['handle_fee_maximum'], 1)

In [318]:
#'Chat' is 0 for all observations, so we delete this variable
print(df\
    .groupby('chat')\
    ['id']\
    .count())

chat
0    13440
Name: id, dtype: int64


In [319]:
df = df.drop(['chat'], 1)

**Inspecting the different values of the variables:**

In [320]:
#The drivers offers between 1 and 10 seats during a ride
print(df\
    .groupby('seats')\
    ['id']\
    .count())

seats
1      926
2     2500
3     8588
4     1189
5      168
6       17
7       36
8       13
9        2
10       1
Name: id, dtype: int64


In [321]:
#38 out of 13440 rides accept cash:
print(df\
    .groupby('accept_cash')\
    ['id']\
    .count() )

# 13402 rides out of 13440 accept online payment, that i 38 rides where online payment isn't accepted:
print(df\
    .groupby('accept_online_payment')\
    ['id']\
    .count() )
#Investigate if these 38 are the same (id)

accept_cash
False    13402
True        38
Name: id, dtype: int64
accept_online_payment
False       38
True     13402
Name: id, dtype: int64


In [322]:
df.keys()

Index(['id', 'depart_date_time', 'seats', 'free_seats', 'user_id',
       'handle_fee_rate', 'handle_fee_ceiling_factor', 'accept_cash',
       'accept_online_payment', 'quick_booking', 'flex_booking', 'price',
       'car_id', 'currency_id', 'distance', 'detour_preference', 'duration',
       'kids', 'music', 'animals', 'comfort', 'smoking', 'id_arr',
       'latitude_arr', 'longitude_arr', 'name_arr', 'id_dep', 'latitude_dep',
       'longitude_dep', 'name_dep', 'Denmark', 'time', 'date', 'weekday',
       'year', 'month', 'SJ_dep', 'SJ_arr', 'SJ_dep_int', 'SJ_arr_int',
       'accept_cash_int', 'accept_online_payment_int', 'quick_booking_int',
       'flex_booking_int', 'det_15_minutes', 'det_5_minutes', 'det_flexible',
       'det_none', 'price_DKK', 'distance_KM', 'DKK_per_KM', 'taken_seats',
       'toll_road', 'toll_road_bool', 'time_hour', 'time_minute',
       'time_minute_hour', 'time_cat', 'Night', 'Morning', 'Forenoon', 'Noon',
       'Afternoon', 'Evening', 'Late_evening']

In [323]:
#Drop all unecessary variables: 'distance', 'price', 'Denmark'
df = df.drop(['distance', 'price', 'Denmark'], 1)

In [324]:
#Since we have 2809 missing observations in car_id, we delete this variable
#The 39 missing observations in the variable flex_booking, force us to delete 39 out of 13440 observations - doesn't affect the data much
df.isnull().sum()

id                              0
depart_date_time                0
seats                           0
free_seats                      0
user_id                         0
handle_fee_rate                 0
handle_fee_ceiling_factor       0
accept_cash                     0
accept_online_payment           0
quick_booking                   0
flex_booking                   39
car_id                       2809
currency_id                     0
detour_preference               0
duration                        0
kids                            0
music                           0
animals                         0
comfort                         0
smoking                         0
id_arr                          0
latitude_arr                    0
longitude_arr                   0
name_arr                        0
id_dep                          0
latitude_dep                    0
longitude_dep                   0
name_dep                        0
time                            0
date          

In [325]:
#Drop the variable 'car_id'
df = df.drop(['car_id'], 1)

In [326]:
#Drop observations with Nan
df = df.dropna(0)

In [327]:
#Know we have 0 Nan
df.isnull().sum().sum()

0

In [329]:
#Save the file, and delete the old file in Github before uploading this one (same name)
df.to_csv('GoMore_Data_DK_270818.csv', index=False)