In [1]:
#importing pandas and the os module (os is used to list the files in the data directory)
import pandas as pd
import os

In [2]:
#defining the path of our sales data directory
path = './SalesAnalysis/Sales_Data'
#listing all files in the directory containing sales data files and storing them in a variable
#(os.listdir returns the names in arbitrary order, so positions in this list are not stable)
files = os.listdir(path)
#importing only the data of April; the file is selected by name instead of by its
#position in 'files', so the right month is loaded regardless of the listing order
df_april_sales = pd.read_csv(os.path.join(path, 'Sales_April_2019.csv'))
df_april_sales

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
1,,,,,,
2,176559,Bose SoundSport Headphones,1,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560,Google Phone,1,600,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560,Wired Headphones,1,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
18378,194090,Google Phone,1,600,04/08/19 17:11,"177 Jackson St, Los Angeles, CA 90001"
18379,194091,AA Batteries (4-pack),1,3.84,04/15/19 16:02,"311 Forest St, Austin, TX 73301"
18380,194092,AAA Batteries (4-pack),2,2.99,04/28/19 14:36,"347 Sunset St, San Francisco, CA 94016"
18381,194093,AA Batteries (4-pack),1,3.84,04/14/19 15:09,"835 Lake St, Portland, OR 97035"


# Cleaning the Data

In [3]:
#converting the numeric columns to a numeric type (they are read in as strings);
#errors='coerce' turns every value that cannot be parsed into NaN
df_april_sales['Quantity Ordered'] = pd.to_numeric(df_april_sales['Quantity Ordered'], errors='coerce')
df_april_sales['Price Each'] = pd.to_numeric(df_april_sales['Price Each'], errors='coerce')
#converting the 'Order Date' column to the datetime type; the format is given explicitly
#(month/day/two-digit year hour:minute, e.g. '04/19/19 08:46') so parsing does not depend
#on format inference, and values that do not match become NaT
df_april_sales['Order Date'] = pd.to_datetime(
    df_april_sales['Order Date'], format='%m/%d/%y %H:%M', errors='coerce'
)

In [4]:
#checking the data type of every column after the conversions above
df_april_sales.dtypes

Order ID                    object
Product                     object
Quantity Ordered           float64
Price Each                 float64
Order Date          datetime64[ns]
Purchase Address            object
dtype: object

In [5]:
#keep only the rows whose 'Quantity Ordered' value is present (i.e. not NaN)
df_april_sales = df_april_sales.loc[df_april_sales['Quantity Ordered'].notna()]
df_april_sales

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2.0,11.95,2019-04-19 08:46:00,"917 1st St, Dallas, TX 75001"
2,176559,Bose SoundSport Headphones,1.0,99.99,2019-04-07 22:30:00,"682 Chestnut St, Boston, MA 02215"
3,176560,Google Phone,1.0,600.00,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
4,176560,Wired Headphones,1.0,11.99,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
5,176561,Wired Headphones,1.0,11.99,2019-04-30 09:27:00,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
18378,194090,Google Phone,1.0,600.00,2019-04-08 17:11:00,"177 Jackson St, Los Angeles, CA 90001"
18379,194091,AA Batteries (4-pack),1.0,3.84,2019-04-15 16:02:00,"311 Forest St, Austin, TX 73301"
18380,194092,AAA Batteries (4-pack),2.0,2.99,2019-04-28 14:36:00,"347 Sunset St, San Francisco, CA 94016"
18381,194093,AA Batteries (4-pack),1.0,3.84,2019-04-14 15:09:00,"835 Lake St, Portland, OR 97035"


In [6]:
#list every remaining row that has a null value in at least one column
has_missing = df_april_sales.isnull().any(axis='columns')
df_april_sales.loc[has_missing]

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [7]:
#reset the index to 0..n-1 after the dropped rows; drop=True (a boolean, not the
#string 'True') discards the old index instead of adding it as a column
df_april_sales = df_april_sales.reset_index(drop=True)

In [8]:
#viewing the dataframe after the index has been reset
df_april_sales

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2.0,11.95,2019-04-19 08:46:00,"917 1st St, Dallas, TX 75001"
1,176559,Bose SoundSport Headphones,1.0,99.99,2019-04-07 22:30:00,"682 Chestnut St, Boston, MA 02215"
2,176560,Google Phone,1.0,600.00,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
3,176560,Wired Headphones,1.0,11.99,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
4,176561,Wired Headphones,1.0,11.99,2019-04-30 09:27:00,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
18284,194090,Google Phone,1.0,600.00,2019-04-08 17:11:00,"177 Jackson St, Los Angeles, CA 90001"
18285,194091,AA Batteries (4-pack),1.0,3.84,2019-04-15 16:02:00,"311 Forest St, Austin, TX 73301"
18286,194092,AAA Batteries (4-pack),2.0,2.99,2019-04-28 14:36:00,"347 Sunset St, San Francisco, CA 94016"
18287,194093,AA Batteries (4-pack),1.0,3.84,2019-04-14 15:09:00,"835 Lake St, Portland, OR 97035"


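The data is now clean: the numeric and date columns have proper types, rows without a 'Quantity Ordered' value have been dropped, no null values remain, and the index has been reset.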

In [9]:
#adding a 'Revenue' column ('Quantity Ordered' multiplied by 'Price Each')
revenue = df_april_sales['Quantity Ordered'] * df_april_sales['Price Each']
if 'Revenue' in df_april_sales.columns:
    #the cell has already been run: refresh the values, since insert() raises on an existing column
    df_april_sales['Revenue'] = revenue
else:
    #place the new column directly after 'Price Each' instead of at a hard-coded position
    df_april_sales.insert(df_april_sales.columns.get_loc('Price Each') + 1, 'Revenue', revenue)

In [10]:
#viewing the dataframe with the new 'Revenue' column
df_april_sales

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Revenue,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2.0,11.95,23.90,2019-04-19 08:46:00,"917 1st St, Dallas, TX 75001"
1,176559,Bose SoundSport Headphones,1.0,99.99,99.99,2019-04-07 22:30:00,"682 Chestnut St, Boston, MA 02215"
2,176560,Google Phone,1.0,600.00,600.00,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
3,176560,Wired Headphones,1.0,11.99,11.99,2019-04-12 14:38:00,"669 Spruce St, Los Angeles, CA 90001"
4,176561,Wired Headphones,1.0,11.99,11.99,2019-04-30 09:27:00,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...,...
18284,194090,Google Phone,1.0,600.00,600.00,2019-04-08 17:11:00,"177 Jackson St, Los Angeles, CA 90001"
18285,194091,AA Batteries (4-pack),1.0,3.84,3.84,2019-04-15 16:02:00,"311 Forest St, Austin, TX 73301"
18286,194092,AAA Batteries (4-pack),2.0,2.99,5.98,2019-04-28 14:36:00,"347 Sunset St, San Francisco, CA 94016"
18287,194093,AA Batteries (4-pack),1.0,3.84,3.84,2019-04-14 15:09:00,"835 Lake St, Portland, OR 97035"


# Using Geopy to get location data

In [11]:
#define a function to extract city and state names from text address using geopy
from geopy.geocoders import Nominatim
#function to get city and state name from string address using geopy
def get_city_state(address_str):
    """Return the city and state of a free-text address as a single string.

    The address is first geocoded with Nominatim (the geocoding service of
    OpenStreetMap) to obtain its coordinates; the coordinates are then
    reverse-geocoded to obtain a structured address, from which the city and
    the state are read.

    Parameters
    ----------
    address_str : str
        The address to look up, e.g. '682 Chestnut St, Boston, MA 02215'.

    Returns
    -------
    str or None
        The city and state separated by a space (e.g. 'Boston Massachusetts'),
        or None when the address cannot be geocoded or the structured address
        contains no city-like entry or no state.
    """
    #user_agent is the name that identifies this application to Nominatim (it is not an API key)
    locator = Nominatim(user_agent="myGeocoder")
    #geocoding the string address to get the latitude and longitude
    location = locator.geocode(address_str)
    #geocode() returns None when no match is found for the address
    if location is None:
        return None
    #location.raw is a dictionary that holds 'lat' and 'lon' as strings, among other fields
    latlong = location.raw['lat'] + ", " + location.raw['lon']
    #using the reverse method to obtain the structured address for those coordinates
    reverse_hit = locator.reverse(latlong, language='en')
    if reverse_hit is None:
        return None
    #the 'address' entry is a dictionary of address components
    address_in = reverse_hit.raw.get('address') or {}
    #smaller places are reported under 'town', 'village', ... rather than 'city'
    city = next(
        (address_in[key]
         for key in ('city', 'town', 'village', 'municipality', 'hamlet')
         if address_in.get(key)),
        None,
    )
    state = address_in.get('state')
    if not city or not state:
        return None
    #concatenating the city and state into one string
    return city + " " + state

In [13]:
#demonstration of location.raw, the dictionary attached to a geocoding result
#create the Nominatim geocoder, passing 'myGeocoder' as its user_agent
locator = Nominatim(user_agent='myGeocoder')
#geocode one sample address string
location = locator.geocode('917 1st St, Dallas, TX 75001')
#show the raw dictionary; it includes the 'lat' and 'lon' entries (see the output below)
location.raw

{'place_id': 100258207,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 9961101,
 'boundingbox': ['32.760958', '32.7609612', '-96.815612', '-96.8128167'],
 'lat': '32.760958',
 'lon': '-96.8141655',
 'display_name': '1st Street, Dallas, Dallas County, Texas, 75203, United States',
 'class': 'highway',
 'type': 'residential',
 'importance': 0.41000000000000003}

In [22]:
#look up the city and state for the 'Purchase Address' of the row labelled 1
get_city_state(df_april_sales.loc[1, 'Purchase Address'])

'Boston Massachusetts'

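### Exploratory scratch work (not part of the main analysis)

Note: the two cells below carry earlier execution counts (`In [8]`, `In [9]`), and the product count still lists the repeated header value `Product`, so their outputs were produced before the cleaning steps above. Re-run the notebook from a fresh kernel to refresh them.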

In [8]:
#number of rows recorded for each product
df_april_sales.groupby('Product')['Product'].count()

Product
20in Monitor                   390
27in 4K Gaming Monitor         563
27in FHD Monitor               734
34in Ultrawide Monitor         650
AA Batteries (4-pack)         2063
AAA Batteries (4-pack)        1989
Apple Airpods Headphones      1515
Bose SoundSport Headphones    1280
Flatscreen TV                  458
Google Phone                   581
LG Dryer                        77
LG Washing Machine              61
Lightning Charging Cable      2201
Macbook Pro Laptop             453
Product                         35
ThinkPad Laptop                392
USB-C Charging Cable          2079
Vareebadd Phone                220
Wired Headphones              1890
iPhone                         693
Name: Product, dtype: int64

In [9]:
#how many distinct products appear in the 'Product' column (missing values are not counted)
df_april_sales['Product'].nunique(dropna=True)

20