In [8]:
import pandas as pd
import urllib
from urllib.parse import urlparse
import httplib2 as http
import json

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

%load_ext autoreload
%autoreload 2

from my_functions import DataGathering
from my_functions import EDA
from my_functions import Model

# Display options: show every column of a dataframe and render floats
# with two decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Getting Data

In [None]:
# Placeholder dataframe to store the data: rows and columns are both
# labelled 1-3 and, since no data is passed, every cell holds NaN.
df = pd.DataFrame(data=None, index=[1, 2, 3], columns=[1, 2, 3])

## APIs

In [14]:
# Placeholder settings for the API request; replace the values before use.
# NOTE(review): api_file_path presumably points at a file holding the API
# key - confirm against the code that reads it.
api_file_path = 'api_key.txt'
uri = 'https://api/link/here'
path = '/extra/link/here'  # optional field


In [None]:
# Preview the first rows of the data retrieved from the API.
# NOTE(review): no earlier cell visible in this notebook defines
# `useful_object_name`; confirm it is created before this cell runs.
useful_object_name.df.head()

In [None]:
# Always export the data to a local file when possible, so that it can be
# reloaded if the kernel crashes.

# file name
csv_file_name = 'useful_file_name.csv'

# Export the data to CSV.
# NOTE(review): assuming `.df` is a pandas DataFrame, to_csv with default
# arguments also writes the row index as the first column of the file.
useful_object_name.df.to_csv(csv_file_name)

## Web Scraping

# Importing Data

## CSV Files

In [None]:
# Path of the CSV file to load.
# NOTE(review): `csv` is also the name of Python's standard-library csv
# module; a more specific variable name would avoid confusion.
csv = 'local/data/useful_file_name.csv'

# read the CSV file into a dataframe
df = pd.read_csv(csv)

# wrap the dataframe in a DataGathering object
useful_object_name = DataGathering(df)

## GeoJSON Files

In [None]:
# Load the GeoJSON file into a Python object.
# The encoding is passed explicitly because JSON text is UTF-8, whereas
# open() would otherwise fall back to the platform's default encoding.
filepath = 'local/data/geosojon_file.json'
with open(filepath, encoding='utf-8') as f:
    data = json.load(f)

# The calls below assume the top level of the file is a JSON object (a dict).

# get an overview of the keys
print(f'The keys of the data are {data.keys()}')

# get an overview of the values
print(f'The values of the data are {data.values()}')

# show each key next to its value
for key, value in data.items():
    print(key, value)

In [None]:
# Build a dataframe for the coordinates, wrap it in a DataGathering object,
# then fill each coordinate column from the entry of the same name in the
# loaded data.
coordinate_columns = ['latitude', 'longitude']
df = pd.DataFrame(columns=coordinate_columns)
useful_object_name = DataGathering(df)
for column in coordinate_columns:
    df[column] = data[column]

# EDA & Feature Engineering

In [None]:
# report the size of the dataframe as a (rows, columns) tuple
print(f'The dataframe has a shape of {df.shape}')

In [None]:
# Look at the first 5 rows of the dataframe (5 is the default for head()).
df.head()

In [None]:
# Summarise the dataframe: dtype and non-null count of every column, plus
# the overall memory usage.
df.info()

In [None]:
# count the null (missing) values in each column
df.isnull().sum()

In [None]:
# Count the rows that duplicate an earlier row across all columns; the
# first occurrence of each row is not counted.
df.duplicated().sum()

In [None]:
# count the distinct values in each column (missing values are not counted)
df.nunique()

In [None]:
# Drop columns that are not needed for the analysis.
# NOTE(review): the drop modifies df in place, so running this cell a second
# time raises a KeyError because the column is already gone.
df.drop(columns = ['useless_column'], inplace = True)

## Correcting Column Types

In [None]:
# Convert a column to the correct data type.
# These lines are template alternatives: keep the conversion that matches
# the column and replace 'column' with the real column name.
df['column'] = df['column'].astype('int')
df['column'] = df['column'].astype('float')  # 'float' is the valid dtype name
df['column'] = df['column'].astype('str')

# Convert a column to datetime. pd.to_datetime returns a new object rather
# than modifying its input, so the result is assigned back to the column.
df['column'] = pd.to_datetime(df['column'])

In [None]:
# check that the column dtypes have been corrected
df.info()

## EDA Continued

In [None]:
# Summary statistics for the dataframe's columns (by default only the
# numeric columns are summarised when the frame contains any).
df.describe()

### Scatter Plot

In [None]:
# Plot longitude (x) against latitude (y) to visualise where the points lie.
# The column names are lowercase to match the 'latitude' / 'longitude'
# columns of the dataframe built from the GeoJSON data; the capitalised
# names are only used for the axis labels.
plt.scatter(df['longitude'], df['latitude'])
plt.title('Bus Stops in Singapore')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

### Pairplot

### Correlation Matrix

### Boxplot

### Histogram

# Modeling

In [None]:
# One-hot encode the categorical column. drop_first=True omits the dummy
# column for the first category level, leaving k-1 columns for k levels.
# NOTE(review): df is overwritten with the encoded frame, so re-running this
# cell fails because 'column_name' no longer exists in df.
df = pd.get_dummies(df, columns = ['column_name'], drop_first = True)