In [1]:
# watch this: https://www.youtube.com/watch?v=iFTWM7HV2UI
# your cleaned data after you complete this analysis: 
# upload to: https://docs.google.com/forms/d/e/1FAIpQLScVuy4aSiSSpLfUOxbmutWtcUidfkzPDc-C0J2gy6Y8TXqeHQ/viewform

import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned browsing history. The path is relative, so the CSV has to be
# in the notebook's working directory or read_csv raises FileNotFoundError.
history_path = 'history_cleaned.csv'
df = pd.read_csv(history_path)
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'history_cleaned.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that has a missing value in any column. Rebinding df (instead
# of inplace=True) matches how the later filtering cell updates the frame.
df = df.dropna()

In [None]:
# Inspect the column dtypes before any conversion.
df.dtypes

In [None]:
# Time is still a string column and every value starts with the year (YYYY...),
# so a plain string comparison against '1900' counts the rows dated before 1900.
(df['Time'] < '1900').sum()

In [None]:
# Keep only the rows whose (string) timestamp sorts after '1900', which drops
# the pre-1900 records. Taking .copy() makes df a new, independent DataFrame
# rather than a slice of the old one.
df = df.loc[df['Time'] > '1900'].copy()

In [None]:
# Parse the string timestamps into datetimes; the later cells rely on the .dt accessor.
df['Time'] = pd.to_datetime(df['Time'])

In [None]:
# df['domain'] = df['domain'].astype(str)

In [None]:
# Re-check the dtypes after converting the Time column.
df.dtypes

In [None]:
# Preview the first rows again after the conversion.
df.head()

In [None]:
# The 50 most frequently visited domains with their visit counts.
df['domain'].value_counts().head(50)

In [None]:
# Top 10 sites that I visited, as a bar chart
df['domain'].value_counts().head(10).plot.bar()

In [None]:
# Number of history rows per hour of day, indexed by the hour taken from Time.
hourly_counts = df.groupby(df['Time'].dt.hour)['domain'].size()
hourly_counts

In [None]:
# Bar chart of the per-hour visit counts computed above.
hourly_counts.plot.bar()

In [None]:
# Derive the weekday name and the weekday number from each timestamp using the
# vectorised .dt accessor instead of a Python-level loop over every row.
df['Day'] = df['Time'].dt.day_name()
df['DayIndex'] = df['Time'].dt.dayofweek
df.head()

In [None]:
# Number of visits for each weekday name.
df['Day'].value_counts()

In [None]:
# Visits per weekday number, ordered by that number rather than by count.
days_sorted = df['DayIndex'].value_counts().sort_index()
days_sorted

In [None]:
# value_counts() leaves out weekdays that have no visits, so pad the series to
# all seven day numbers before plotting; otherwise the seven fixed labels below
# would not line up with the bars.
day_labels = ['M', 'T', 'W', 'Th', 'F', 'Sa', 'Su']
days_sorted.reindex(range(len(day_labels)), fill_value=0).plot.bar()
plt.xticks(range(len(day_labels)), day_labels, rotation=0)

plt.show()

In [None]:
def is_weekend(day):
    """Return True when ``day`` is the day name 'Saturday' or 'Sunday'."""
    weekend_names = ('Saturday', 'Sunday')
    return day in weekend_names


# Flag every row as weekend / not weekend from its day name.
df['isWeekend'] = df['Day'].map(is_weekend)
df.head()

In [None]:
# Split the history in two using the boolean isWeekend flag.
weekend = df[df['isWeekend']]
weekday = df[~df['isWeekend']]


In [None]:
# Preview the weekend subset.
weekend.head()

In [None]:
# Five most visited domains on weekdays.
weekday['domain'].value_counts()[:5]

In [None]:
# Five most visited domains on weekends.
weekend['domain'].value_counts()[:5]

In [None]:
def show_stats(df, color, alpha=1, title='', fig=None, y_lim=(0, 1300)):
    """Draw a bar chart of the number of visits per hour of day.

    Parameters
    ----------
    df : DataFrame
        Must have a datetime ``Time`` column and a ``domain`` column.
    color, alpha :
        Passed to the bar plot.
    title : str
        Title set on the current axes.
    fig : int or None
        When given, the chart is drawn in panel ``fig`` of a 1x2 subplot grid
        of the current figure; when None, the current axes are used.
    y_lim : sequence or None
        y-axis limits. Defaults to the previously hard-coded 0..1300 range;
        pass None to keep matplotlib's automatic limits.
    """
    hourly_counts = df.groupby(df.Time.dt.hour).domain.size()

    # Hours without any visit are absent from the groupby result. Adding them
    # as 0 keeps all 24 bars, in order, so separate charts line up.
    hourly_counts = hourly_counts.reindex(range(24), fill_value=0)

    if fig is not None:
        plt.subplot(1, 2, fig)

    hourly_counts.plot.bar(color=color, alpha=alpha)
    plt.title(title)
    if y_lim is not None:
        plt.ylim(y_lim)
    

# Side-by-side hourly profiles: weekend in panel 1, weekdays in panel 2.
plt.figure(figsize=(12, 5))
show_stats(weekend, 'red', alpha=1, title='Weekend', fig=1)
show_stats(weekday, 'blue', alpha=0.5, title='Weekday', fig=2)
plt.show()


In [None]:
# Be skeptical about your data!
# Am I really on the computer a lot more on weekdays,
# or do weekdays just show more visits because there are more of them?

In [None]:
def show_stats(df, color, alpha=1, title='', fig=None, y_lim = None):
    """Draw a bar chart of visits per hour of day, normalised by days covered.

    Each hourly count is multiplied by 100 and divided by the number of
    distinct calendar dates in ``df``, so subsets that span a different
    number of days can be compared on the same scale.

    Parameters
    ----------
    df : DataFrame
        Must have a datetime ``Time`` column and a ``domain`` column.
    color, alpha, title :
        Passed to the bar plot.
    fig : int or None
        When given, the chart is drawn in panel ``fig`` of a 1x2 subplot grid
        of the current figure; when None, the current axes are used.
    y_lim : sequence or None
        y-axis limits; None keeps matplotlib's automatic limits.
    """
    hourly_counts = df.groupby(df.Time.dt.hour).domain.size()

    # Number of distinct calendar dates present in this subset.
    days_counts = df.Time.dt.date.nunique()

    # Scale the whole series at once: this yields a float Series directly
    # instead of writing floats one by one into an integer Series. Hours with
    # no visits are then filled in as 0 so all 24 bars are present, in order.
    hourly_counts = (hourly_counts * 100.0 / days_counts).reindex(range(24), fill_value=0.0)

    if fig is not None:
        plt.subplot(1, 2, fig)

    hourly_counts.plot.bar(color=color, alpha=alpha, title=title)

    # Identity check so that array-like limits are not compared element-wise.
    if y_lim is not None:
        plt.ylim(y_lim)


# Weekend vs. weekday hourly profiles, drawn with one shared y-range so the
# two panels can be compared bar for bar.
y_lim = [0, 1800]

plt.figure(figsize=(12, 5))
show_stats(weekend, 'red', alpha=1, title='Weekend', fig=1, y_lim=y_lim)
show_stats(weekday, 'blue', alpha=0.5, title='Weekday', fig=2, y_lim=y_lim)
plt.show()

In [None]:
# Ten most visited domains, for picking sites to look at individually.
df['domain'].value_counts()[:10]

In [None]:
def get_data_for_domain(val):
    """Return the rows of the global ``df`` whose domain matches ``val``.

    ``val`` is a regular expression that is searched for anywhere in the
    ``domain`` column, ignoring case (for example ``'netflix|disney'``).
    """
    # case=False ignores case without lower-casing the pattern itself, which
    # would change escapes such as \D or \S into \d or \s. na=False treats
    # rows with a missing domain as non-matching instead of yielding a NaN mask.
    return df[df['domain'].str.contains(val, case=False, regex=True, na=False)]

# get_data_for_domain('youtube')

In [None]:
def show_domain_stats(domain, color='blue', alpha=1):
    """Plot the hourly visit profile for the domains matching ``domain``.

    ``domain`` is handed to ``get_data_for_domain`` to select the rows, the
    selection is drawn with ``show_stats`` using ``color`` and ``alpha``, and
    the selected rows are returned.
    """
    matching_rows = get_data_for_domain(domain)
    show_stats(matching_rows, color, alpha=alpha)
    return matching_rows

In [None]:
# Hourly profile for domains matching 'redfin'; the returned rows are discarded.
_= show_domain_stats('redfin', 'blue')

In [None]:
# Same chart for any of redfin, zillow or homesnap (the pattern is a regex alternation).
_= show_domain_stats('redfin|zillow|homesnap', 'blue')

In [None]:
# Hourly profile for domains containing 'stackover'.
_ = show_domain_stats('stackover', 'magenta')


In [None]:
# Hourly profile for domains containing 'disney'.
_ = show_domain_stats('disney', 'red')

In [None]:
# Hourly profile for domains containing 'netflix'.
_ = show_domain_stats('netflix', 'red')

In [None]:
# Combined hourly profile for domains containing 'netflix', 'disney' or 'prime'.
_ = show_domain_stats('netflix|disney|prime', 'red')

In [None]:
# Hourly profile for domains containing 'amazon'.
_=show_domain_stats('amazon', 'black')

### Based on the data, what can we tell about this person?

1. Is this a work computer/personal computer?
1. Is he employed?
1. What are his interests?
1. Anything interesting? 

answer here