In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import cufflinks as cf
from IPython.display import HTML
import seaborn as sns

import colorlover as cl
from IPython.display import HTML

In [None]:
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

In [None]:
# Initialise plotly's notebook mode so figures render inside the notebook.
init_notebook_mode(connected=True)
# Put cufflinks in offline mode; the .iplot() calls below rely on it.
cf.go_offline()

In [None]:
%matplotlib inline
def cmocean_to_plotly(cmap, pl_entries):
    """Sample a colormap callable into a Plotly-style colorscale list.

    Parameters
    ----------
    cmap : callable
        Called as ``cmap(x)`` with ``x`` running from 0 to 1. The first three
        items of its return value are used as red, green and blue and are
        scaled by 255, so they are expected to be floats in [0, 1].
    pl_entries : int
        Number of evenly spaced samples. Must be at least 2; a value of 1
        raises ``ZeroDivisionError`` because the step is ``1/(pl_entries-1)``.

    Returns
    -------
    list
        ``[[position, 'rgb(r, g, b)'], ...]`` with one entry per sample.
    """
    h = 1.0/(pl_entries-1)
    pl_colorscale = []

    for k in range(pl_entries):
        # Scale to 0-255 and truncate to uint8. Materialise the channels as a
        # list of plain ints: map() is a one-shot iterator in Python 3 and
        # cannot be indexed, and plain ints keep the 'rgb(...)' text free of
        # NumPy scalar reprs.
        C = [int(c) for c in (np.array(cmap(k*h)[:3])*255).astype(np.uint8)]
        pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])

    return pl_colorscale

def cost_of_service_means(df_in):
    """Return the mean actual and advertised speeds for each price tier.

    Groups ``df_in`` by its ``cost_of_service`` column and averages the four
    speed columns. The result is indexed by price tier, with the columns
    ``actual_download``, ``actual_upload``, ``advertised_download`` and
    ``advertised_upload``.
    """
    # Select columns with a list: tuple-style selection on a groupby
    # (``gb['a', 'b']``) is no longer accepted by current pandas.
    speed_columns = ['actual_download', 'actual_upload', 'advertised_download', 'advertised_upload']
    return df_in.groupby('cost_of_service')[speed_columns].mean()

# Here's What "Up to 1000MBPS" Actually Means in Seattle

In [None]:
# Load the raw speed-test results from a path relative to the notebook's working directory.
speedReport = pd.read_csv('../input/seattle-broadband-speed-test/broadband-speed-test.csv')

In [None]:
# Working frame for the analysis. read_csv already returns a DataFrame, so this wrap performs no conversion.
speed_df = pd.DataFrame(speedReport)

In [None]:
# Preview the first ten rows.
speed_df.head(10)

### The Dataset
So we've got 
- **id**: unique identifier
- **actual_download**: recorded download speed
- **actual_upload**: recorded upload speed
- **advertised_download**: user reported download speed they are paying for
- **advertised_upload**: user reported upload speed they are paying for
- **connection_type**: Type of connection to internet (wired/wireless and single user/multiple users)
- **cost_of_service**: user reported cost of isp service
- **date_pretty**: formatted date and time of test
- **isp**: isp company
- **isp_user**: Uncertain
- **min_rtt**: latency from user to test server
- **timestamp**: Unix timestamp
- **seattle_blkgrpce10**: No idea what this is
    
in this dataset. 
## Let's start exploring

In [None]:
# Per-column maximum.
speed_df.max()

In [None]:
# Per-column minimum.
speed_df.min()

In [None]:
# Column dtypes and non-null counts; used to spot missing values.
speed_df.info()

In [None]:
# Summary statistics for the frame.
speed_df.describe()

In [None]:
# Rows ordered by measured download speed, fastest first, to eyeball outliers.
speed_df.sort_values('actual_download', ascending=False)

In [None]:
# Rows ordered by measured upload speed, fastest first.
speed_df.sort_values('actual_upload', ascending=False)

In [None]:
# Rows ordered by self-reported advertised download speed, largest first.
speed_df.sort_values('advertised_download', ascending=False)

### At a glance, there appear to be a couple of outliers skewing the data and some null values. We'll likely have to clean this data at a later point. But for now, the dataset is clean enough to at least answer a few questions.

## A. What is the average user paying?

Let's start by getting a count of all the users in each price tier:

In [None]:
# Non-null value count of every column within each price tier.
speed_df.groupby('cost_of_service').count()

In [None]:
def __reindex__(df_in):
    """Return ``df_in`` with its rows arranged in the notebook's fixed price-tier order.

    The order runs from the most expensive tier down to the cheapest, with
    'dont_know' last. Tiers absent from ``df_in`` appear as rows of missing
    values, and index labels outside this list are dropped (standard
    ``reindex`` behaviour).
    """
    tier_order = ['100_or_above','75_100','50_75','25_50','less_than_25','dont_know']
    return df_in.reindex(index=tier_order)

# Number of test records (non-null 'id' values) in each price tier.
customerDistribution = speed_df.groupby('cost_of_service')['id'].count()
# Arrange the tiers in the fixed display order before plotting.
customerDistribution = __reindex__(customerDistribution)
customerDistribution.iplot(kind='bar', xTitle='Monthly cost of Internet Service', yTitle='Number of customers', title='Distribution of internet service subscribers in Seattle')

> __The majority of internet users in Seattle pay 50-75 USD a month for internet service.__
    
It appears as though *over half of the data points are missing advertised upload/download speed values*. Let's try and fill these values in by inferring what we can from the available information.
    
### First, let's limit our search to just the top 5 most popular ISPs.

In [None]:
# Non-null counts per ISP, ordered by the count of the 'id' column (largest first).
speed_df.groupby('isp').count().sort_values('id', ascending=False)

If we assume that these datapoints are representative of isp usage in Seattle, then it would appear that the most popular ISPs in Seattle are 
1. __Comcast__ with 1967 users in this study
2. __CenturyLink__ with 920 users in this study
3. __Spectrum__  with 232 users in this study

Since I'm mostly interested in how larger ISPs are doing (since they affect the most people), let's just omit the smaller ISPs.
    
Also let's remove the 0 and null values for actual_download as these people are likely having connection issues which may be outside the control of the ISP. 
    
    Let's create a frame for each isp so we can compare them later on.

In [None]:
# Keep only tests that recorded a download speed and where that speed is non-zero.
has_download = speed_df['actual_download'].notnull() & (speed_df['actual_download'] != 0)
speed_df = speed_df[has_download]

In [None]:
# One frame per ISP. Each is an explicit copy because some of these frames are
# later modified with .loc assignments; without .copy() pandas cannot tell
# whether such writes are meant to reach speed_df (SettingWithCopyWarning).
comcast_users = speed_df.query('isp == "comcast"').copy()
centurylink_users = speed_df.query('isp == "centurylink"').copy()
# NOTE(review): the Spectrum frame is selected by an ASN-style label rather than a brand name — confirm this label really is Spectrum in the data.
spectrum_users = speed_df.query('isp == "AS11404 vanoppen.biz LLC"').copy()
wave_users = speed_df.query('isp == "wave"').copy()
frontier_users = speed_df.query('isp == "frontier"').copy()

    Now let's make a DF that includes all of the relevant companies

In [None]:
# Stack the five per-ISP frames row-wise into a single frame.
top5Isp_providers = pd.concat([comcast_users,centurylink_users,spectrum_users,wave_users,frontier_users])

In [None]:
# Rows ordered by self-reported advertised upload speed, largest first.
top5Isp_providers.sort_values(by=['advertised_upload'],ascending=False)

In [None]:
# Per-tier mean actual/advertised speeds, computed before the advertised columns are cleaned.
cost_of_service_means(top5Isp_providers)

Since we're trying to compare advertised speeds to actual speeds, we can ignore all the columns except 
- "actual_download" 
- "actual_upload" 
- "advertised_download" 
- "advertised_upload".

Hmm, these don't look right. Let's check our data set for outliers to verify that the 100 or above advertised_download speeds actually are 100x greater than the speeds advertised in the 25-50 category and not the result of a couple outliers skewing the means. 

The less than 25 should also be the lowest value in advertised_download.
I also suspect the null values are messing up the means, so I'm going to have to fill in null values first.

In [None]:
# Minimum of every column within each price tier.
top5Isp_providers.groupby('cost_of_service').min()

    Yep, there are definitely null and/or 0 values in the advertised_download column, so we're going to have to remove them from our set for now.
        0 Values indicate connection issues and null values are not really very useful in this study as we're trying to compare advertised to actual speeds.
## Cleaning the data

In [None]:
# Interpret the 'timestamp' column as seconds since the Unix epoch and store it as datetimes.
top5Isp_providers['timestamp'] = pd.to_datetime(top5Isp_providers.timestamp, unit='s')

First let's convert the timestamp into a datetime object for ease of use later on.

Next let's take care of the null values and 0 values by just dropping them from the study.

In [None]:
# Keep only rows that have an advertised download speed.
advertisedOnly = top5Isp_providers[pd.notnull(top5Isp_providers['advertised_download'])]

In [None]:
# Also drop non-positive advertised download speeds. This rebinds top5Isp_providers to the cleaned subset.
top5Isp_providers = advertisedOnly[advertisedOnly.advertised_download > 0]

Okay now let's try taking the averages

In [None]:
# Recompute the per-tier means on the cleaned rows.
cost_of_service_means(top5Isp_providers)

    It looks like a few of the values changed but not enough to make a big difference. If anything it's skewed the "less_than_25" bracket a little more than before. Let's deal with that outlier we found earlier.

In [None]:
# Rows in the cheapest tier, highest advertised download first, to locate the outlier.
top5Isp_providers[top5Isp_providers['cost_of_service'] == 'less_than_25'].sort_values('advertised_download', ascending=False)

    There it is. id 3637 is claiming comcast is offering him 1GBPS, which is clearly one of the most expensive speed packages you can get, especially with comcast. Perhaps he was using the wrong units. I'll just drop it from our observations.

In [None]:
# Remove the outlier row by its index label. Rebinding instead of inplace=True
# avoids modifying a filtered slice in place, and errors='ignore' lets the cell
# be re-run after the row is already gone.
# NOTE(review): 3354 is an index label, not a value of the 'id' column — confirm it is the intended row.
top5Isp_providers = top5Isp_providers.drop(index=[3354], errors='ignore')

In [None]:
# Per-tier means after removing the outlier, arranged in the fixed tier order.
speedsByCost = cost_of_service_means(top5Isp_providers)
speedsByCost = __reindex__(speedsByCost)
# Display the resulting table.
speedsByCost

Okay so the less_than_25 category is looking a lot better now. 
    
## Now that our data's a bit cleaner, let's take another look

In [None]:
# Bar chart of the four mean-speed columns for each price tier.
speedsByCost.iplot(kind='bar',yTitle='Speed in mbps',xTitle='Price',title='Average measured speed vs Avg reported advertised speed (all ISPs)')

In [None]:
# One marker per test: measured download speed against the price tier.
top5Isp_providers.iplot(kind='scatter', mode='markers',yTitle='Speed in mbps', x='cost_of_service', xTitle='Monthly Price of Service', y='actual_download', title='Actual Download Speeds vs Monthly Price of Service')
# Display the frame itself below the chart.
top5Isp_providers

Ok these are some pretty disparate numbers here. At a glance, it appears that on average people get **only about 30-50% of the max advertised speed** (at the time of the speed test).
However I've noticed that there are a lot of inaccurate reports in the advertised speeds category, so to address this, let's find actual pricing information, the mean speed in every price category, and use a combination of ISP and price paid to determine an accurate "advertised_download" and "advertised_upload" value.
## Self reporting is not reliable, let's try to normalize what we can.

In [None]:
# Load the ISP pricing table from a second input file.
pricingInfo = pd.read_csv('../input/isp-pricing/PricingInfo.csv')
# Give the 'Unnamed: 0' column a meaningful name; index=str converts the row labels to strings.
pricingInfo = pricingInfo.rename(index=str, columns={'Unnamed: 0': 'Price Category'})
pricingInfo.head(5)

*As of 7/15/2018, this is the current pricing info for each ISP, categorized via the ranges used in this study*
    
Some of these values were assigned via the assumptions listed below:
    
- Comcast - no internet package in the 75-100 bracket, values in this category likely include other services, so we'll use 50-75 category speed values
- CenturyLink - no internet package was listed in the 100+ bracket, all values above likely include other services, so will just use the 75-100 category speed value.
- Spectrum - No internet package in the 50-75 bracket, will just use the 25_50 value as values in the 50-75 bracket will likely include other services.
    
    **Wave and Frontier internet pricing information was not available for my neighborhood (north seattle) so I've decided to just drop them from the study.**


In [None]:
def __normalizePricing__(df_in, t1price, t2price, t3price, t4price, t5price):
    """Return a copy of ``df_in`` with ``advertised_download`` set per price tier.

    Despite the ``*price`` parameter names, the values are written into the
    ``advertised_download`` column, i.e. they are the download speeds assigned
    to each ``cost_of_service`` tier:

    - ``t1price``: 'less_than_25'
    - ``t2price``: '25_50'
    - ``t3price``: '50_75'
    - ``t4price``: '75_100'
    - ``t5price``: '100_or_above'

    Rows in any other tier (for example 'dont_know' or missing) keep their
    existing value, and no other column is changed. The input frame is left
    untouched; use the returned frame.
    """
    tier_speeds = {
        'less_than_25': t1price,
        '25_50': t2price,
        '50_75': t3price,
        '75_100': t4price,
        '100_or_above': t5price,
    }
    # Work on a copy so the caller's frame (often a filtered slice of a larger
    # frame) is never written to.
    df_out = df_in.copy()
    for tier, speed in tier_speeds.items():
        df_out.loc[df_out.cost_of_service == tier, 'advertised_download'] = speed
    return df_out

In [None]:
# The five numbers after each frame are the advertised download speeds assigned to the tiers
# less_than_25, 25_50, 50_75, 75_100 and 100_or_above, in that order.
# NOTE(review): presumably Mbps taken from PricingInfo.csv — confirm.
comcast_users = __normalizePricing__(comcast_users,0,37.5,200,700,1500)
centurylink_users = __normalizePricing__(centurylink_users,0,20,90,1000,1000)
spectrum_users = __normalizePricing__(spectrum_users,0,60,60,100,100)

In [None]:
# Combine the three normalised ISP frames into one.
top3Isp_providers = pd.concat([comcast_users,centurylink_users,spectrum_users])
# Interpret 'timestamp' as seconds since the Unix epoch and store it as datetimes.
top3Isp_providers['timestamp'] = pd.to_datetime(top3Isp_providers.timestamp, unit='s')
top3Isp_providers.info()

## Let's analyze the data again now that the advertised speeds are normalized

In [None]:
# Per-tier means for the three ISPs combined, in the fixed tier order.
top3Isp_providersMeanCosts = cost_of_service_means(top3Isp_providers)
top3Isp_providersMeanCosts = __reindex__(top3Isp_providersMeanCosts)
# Keep only tiers with a positive mean download; this also removes tiers that reindexing filled with NaN.
top3Isp_providersMeanCosts = top3Isp_providersMeanCosts[top3Isp_providersMeanCosts.actual_download > 0]

top3Isp_providersMeanCosts.iplot(kind='bar',title='Avg. Measured Speed vs Max Speed Advertised (all ISPs)',yTitle='Speed in mbps',xTitle='Price',barmode='group')

In [None]:
# Per-tier mean speeds for each ISP, with rows in the fixed tier order.
comcast_advertisedVsActual = __reindex__(cost_of_service_means(comcast_users))
centurylink_advertisedVsActual = __reindex__(cost_of_service_means(centurylink_users))
spectrum_advertisedVsActual = __reindex__(cost_of_service_means(spectrum_users))

In [None]:
# Fastest measured download among the Comcast rows.
comcast_users['actual_download'].max()

In [None]:
# Grouped bar chart of the Comcast per-tier means.
comcast_advertisedVsActual.iplot(kind='bar',colorscale='YlGnBu',title='Average measured speed vs Max advertised speed (Comcast)',yTitle='speed in mbps',xTitle='Price',barmode='group')

In [None]:
# Fastest measured download among the CenturyLink rows.
centurylink_users['actual_download'].max()

In [None]:
# Grouped bar chart of the CenturyLink per-tier means.
centurylink_advertisedVsActual.iplot(kind='bar',colorscale='set2',title='Average measured speed vs Max advertised speed (CenturyLink)',yTitle='Speed in mbps',xTitle='Price',barmode='group')

In [None]:
# Fastest measured download among the Spectrum rows.
spectrum_users['actual_download'].max()

In [None]:
# Spectrum rows ordered by self-reported advertised upload speed, largest first.
spectrum_users.sort_values(by=['advertised_upload'],ascending=False)

In [None]:
# Grouped bar chart of the Spectrum per-tier means.
spectrum_advertisedVsActual.iplot(kind='bar', colorscale='accent',title='Average measured speed vs Max Advertised speed (Spectrum)',yTitle='Speed in mbps',xTitle='Price',barmode='group')

Among the top 3 ISPs in Seattle, the company with the highest AVG. speed appears to be CenturyLink (**178mbps**, 100+USD per mo) 
with Spectrum in second place (**90mbps** 75-100 per mo) and comcast in last (**47mbps**, 100+ per mo). 
    
The highest recorded speed came from CenturyLink (**886mbps**).
    
Spectrum's average speeds were closest to the advertised maximum speed (90mbps out of 100mbps). However it is worth noting that those who paid more than $100 for Spectrum internet got a lower speed overall.

# Impact of connection type on speeds

Wireless connections can be unreliable due to a number of factors outside of the control of internet service providers. Since the data specifies the type of connection the user had at the time of the test, we can deduce the magnitude of impact of connection type on speed at time of test. 

Since we're just trying to determine impact of connection type on test download speeds we'll be using our entire dataset.

In [None]:
# Mean measured download/upload speed per connection type.
# The columns are selected with a list: tuple-style selection on a groupby
# (``gb['a', 'b']``) is no longer accepted by current pandas.
# NOTE(review): the column name starts with a space (' connection_type') — presumably that is how the CSV header reads; confirm.
speeds_by_connection = speed_df.groupby(' connection_type')[['actual_download','actual_upload']].mean()
speeds_by_connection.iplot(kind='bar',title='Average Measured speeds by Connection type',yTitle='Speed in mbps',xTitle='Connection Type')

In [None]:
# One marker per test: measured download speed against connection type (top 3 ISPs).
top3Isp_providers.iplot(kind='scatter', mode='markers',yTitle='Speed in mbps', x=' connection_type', xTitle='Connection Type', y='actual_download', title='Actual Download Speeds vs Connection Type')

So it does seem that connection type plays a significant role in getting the most out of your ISP, but it still doesn't guarantee fast connection speeds. Let's again adjust our study to filter out non-wired connections. This way our results will include the least amount of user interference.

In [None]:
# Restrict to tests whose connection type is exactly 'wired', then average per
# price tier and arrange the tiers in the fixed display order.
wired_only = top3Isp_providers[' connection_type'] == 'wired'
wired_Top3Isp_providers = __reindex__(cost_of_service_means(top3Isp_providers[wired_only]))

wired_Top3Isp_providers.iplot(kind='bar',colorscale='YlGn',title='Average actual speed vs Advertised speed of Top 3 ISPs (Wired Connections)',yTitle='Speed in mbps',xTitle='Price',barmode='group')

### Unsurprisingly, connection type plays a major role in predicting speeds. 
We saw an increase of 
1. **45mbps** in the 100_or_above category 
2. **23mbps** in the 75_100 category 
3. **10mbps** in the 50_75 category 
4. **5mbps** in the 25_50 
5. **15mbps** in the dont_know category. 

## Now let's see what time of day these tests were being conducted
Intuitively I'd expect some of these tests were conducted because users were facing unusually slow internet speeds. To confirm my suspicion, let's check the data to see if lower speeds correlate with peak usage hours (7pm-11pm).

In [None]:
# Frame that will be narrowed to the peak-hours window.
# NOTE(review): pd.DataFrame(frame) wraps an existing frame without an explicit copy; whether later
# in-place edits are shared between such wrappers may depend on the pandas version — confirm.
peakHours = pd.DataFrame(top3Isp_providers)

In [None]:
# Frame that will be narrowed to the off-peak window (wrapped from top3Isp_providers without an explicit copy).
offpeakHours = pd.DataFrame(top3Isp_providers)

In [None]:
# Another wrapper around the same top-3 ISP data (no explicit copy); it is only displayed later.
top3Isp_byHour = pd.DataFrame(top3Isp_providers)

In [None]:
# Histogram of the hour of day at which each test ran.
# .apply(pd.Series) turns the hour Series into a one-column DataFrame before plotting.
(top3Isp_providers['timestamp'].dt.hour).apply(pd.Series).iplot(kind='hist',
                                                                title='Number of Tests by Hour of Day',
                                                                xTitle='Hour of Day',
                                                                yTitle='Number of Tests')

In [None]:
# Move 'timestamp' into the index so that between_time() can be used on peakHours.
# This is in place: running the cell a second time fails because the column no longer exists.
# NOTE(review): other frames wrapped from the same data may or may not pick up this index change, depending on the pandas version — confirm.
peakHours.set_index('timestamp', inplace=True)

In [None]:
# Peak window is 19:00-23:00; off-peak is the wrap-around remainder (23:00-19:00).
peakHours = peakHours.between_time('19:00','23:00')
# between_time() requires a DatetimeIndex. Only peakHours had set_index() called
# on it, so index offpeakHours explicitly rather than relying on that in-place
# call having leaked into this frame through shared data.
if not isinstance(offpeakHours.index, pd.DatetimeIndex):
    offpeakHours = offpeakHours.set_index('timestamp')
offpeakHours = offpeakHours.between_time('23:00','19:00')

# Columns are selected with a list: tuple-style selection on a groupby is no
# longer accepted by current pandas.
# NOTE(review): the chart titles mention advertised speed, but only the two measured columns are plotted.
peakHours_Means = __reindex__(peakHours.groupby('cost_of_service')[['actual_download','actual_upload']].mean())
peakHours_Means.iplot(kind='bar',title='Average measured speed vs Max advertised speed during Peak Hours (Top 3 ISPs)',yTitle='Speed in mbps',xTitle='Price',barmode='group')

offpeakHours_Means = __reindex__(offpeakHours.groupby('cost_of_service')[['actual_download','actual_upload']].mean())
offpeakHours_Means.iplot(kind='bar',title='Average measured speed vs Max advertised speed during Off Peak Hours (Top 3 ISPs)',yTitle='Speed in mbps',xTitle='Price',barmode='group')

In [None]:
# Display the frame.
top3Isp_byHour

In [None]:
# Label every row with its two-digit hour of day ('00'-'23') for the scatter plot.
# The test time is read from the 'timestamp' column when it exists and from the
# index otherwise, so the cell does not depend on an earlier in-place
# set_index() having reached this frame.
if 'timestamp' in top3Isp_providers.columns:
    test_times = pd.DatetimeIndex(top3Isp_providers['timestamp'])
else:
    test_times = top3Isp_providers.index
# If the index is no longer datetime-like, the hour labels are already in place
# (cell re-run), so leave it untouched.
if isinstance(test_times, pd.DatetimeIndex):
    top3Isp_providers.index = test_times.strftime('%H')

In [None]:
# Measured download speeds as a one-column DataFrame (column label 0); the row labels come from top3Isp_providers' index.
top3Isp_speedsByHour = top3Isp_providers['actual_download'].apply(pd.Series)

In [None]:
# Take column 0 and expand it into a one-column DataFrame again for plotting.
# NOTE(review): this looks redundant with the previous cell, which already produced a one-column frame — confirm before simplifying.
df = top3Isp_speedsByHour[0].apply(pd.Series)

In [None]:
# One marker per test: measured download speed against the frame's index labels.
df.iplot(kind='scatter',mode='markers',xTitle='Hour of Day',yTitle='Speeds in Mbps',title='Actual Speeds Recorded by Hour of Day')