In [15]:
# Required libraries
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Data visualization
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go


In [2]:
# Read both source tables from CSV files located in the ./data sub-directory
# (paths are relative to the notebook's working directory).
distribution_centers, users = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/distribution_centers.csv", "./data/users.csv")
)

## Data cleaning

#### Users

In [3]:
# Preview the first rows of the users table to inspect its columns and values
users.head()

Unnamed: 0,id,first_name,last_name,email,age,gender,state,street_address,postal_code,city,country,latitude,longitude,traffic_source,created_at
0,457,Timothy,Bush,timothybush@example.net,65,M,Acre,87620 Johnson Hills,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2022-07-19 13:51:00+00:00
1,6578,Elizabeth,Martinez,elizabethmartinez@example.com,34,F,Acre,1705 Nielsen Land,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2023-11-08 18:49:00+00:00
2,36280,Christopher,Mendoza,christophermendoza@example.net,13,M,Acre,125 Turner Isle Apt. 264,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Email,2019-08-24 06:10:00+00:00
3,60193,Jimmy,Conner,jimmyconner@example.com,64,M,Acre,0966 Jose Branch Apt. 008,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2020-02-15 11:26:00+00:00
4,64231,Natasha,Wilson,natashawilson@example.net,25,F,Acre,20798 Phillip Trail Apt. 392,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2020-03-13 06:45:00+00:00


In [4]:
# Tally the null entries found in each column of `users`
missing_values = users.isna().sum()

# Show the per-column tally underneath a heading line
print("Missing Values:", missing_values, sep="\n")

Missing Values:
id                  0
first_name          0
last_name           0
email               0
age                 0
gender              0
state               0
street_address      0
postal_code         0
city              958
country             0
latitude            0
longitude           0
traffic_source      0
created_at          0
dtype: int64


In [6]:
# Substitute the placeholder 'Unknown' for every missing value in 'city'
users = users.fillna({'city': 'Unknown'})
# Re-count nulls per column to confirm that none remain
users.isna().sum()

id                0
first_name        0
last_name         0
email             0
age               0
gender            0
state             0
street_address    0
postal_code       0
city              0
country           0
latitude          0
longitude         0
traffic_source    0
created_at        0
dtype: int64

In [7]:
# Summarize the users table: column dtypes, non-null counts and memory usage
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              100000 non-null  int64  
 1   first_name      100000 non-null  object 
 2   last_name       100000 non-null  object 
 3   email           100000 non-null  object 
 4   age             100000 non-null  int64  
 5   gender          100000 non-null  object 
 6   state           100000 non-null  object 
 7   street_address  100000 non-null  object 
 8   postal_code     100000 non-null  object 
 9   city            100000 non-null  object 
 10  country         100000 non-null  object 
 11  latitude        100000 non-null  float64
 12  longitude       100000 non-null  float64
 13  traffic_source  100000 non-null  object 
 14  created_at      100000 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 11.4+ MB


In [None]:
# Boolean mask: True for each row that repeats an earlier row in every column
# (subset=None and keep='first' are the pandas defaults, spelled out here)
duplicate_rows = users.duplicated(subset=None, keep='first')

# Summing the mask counts the True entries, i.e. the duplicated rows
num_duplicate_rows = duplicate_rows.sum()

# Report the total
print("Number of Duplicate Rows:", num_duplicate_rows)


Number of Duplicate Rows: 0


#### distribution_centers

In [8]:
# Preview the first rows of the distribution centers table
distribution_centers.head()

Unnamed: 0,id,name,latitude,longitude
0,1,Memphis TN,35.1174,-89.9711
1,2,Chicago IL,41.8369,-87.6847
2,3,Houston TX,29.7604,-95.3698
3,4,Los Angeles CA,34.05,-118.25
4,5,New Orleans LA,29.95,-90.0667


##### The preview shows that the distribution_centers data is clean: there are no missing values in the displayed rows.

## Location Analysis

In [9]:
# Parse the ISO 8601 strings in 'created_at' into pandas datetime values,
# replacing the original text column, then preview the result
users = users.assign(
    created_at=pd.to_datetime(users['created_at'], format='ISO8601')
)
users.head()

Unnamed: 0,id,first_name,last_name,email,age,gender,state,street_address,postal_code,city,country,latitude,longitude,traffic_source,created_at
0,457,Timothy,Bush,timothybush@example.net,65,M,Acre,87620 Johnson Hills,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2022-07-19 13:51:00+00:00
1,6578,Elizabeth,Martinez,elizabethmartinez@example.com,34,F,Acre,1705 Nielsen Land,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2023-11-08 18:49:00+00:00
2,36280,Christopher,Mendoza,christophermendoza@example.net,13,M,Acre,125 Turner Isle Apt. 264,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Email,2019-08-24 06:10:00+00:00
3,60193,Jimmy,Conner,jimmyconner@example.com,64,M,Acre,0966 Jose Branch Apt. 008,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2020-02-15 11:26:00+00:00
4,64231,Natasha,Wilson,natashawilson@example.net,25,F,Acre,20798 Phillip Trail Apt. 392,69917-400,Rio Branco,Brasil,-9.945568,-67.83561,Search,2020-03-13 06:45:00+00:00


In [10]:
# Derive the calendar year, the date and the time of day from 'created_at'
# and store each one in its own column
users = users.assign(
    year_of_creation=lambda frame: frame['created_at'].dt.year,
    date_of_creation=lambda frame: frame['created_at'].dt.date,
    time_of_creation=lambda frame: frame['created_at'].dt.time,
)

In [11]:
# Uses `px` (plotly.express), imported with the other libraries in the
# notebook's first cell so that every plotting cell can rely on it.

# Order the rows by 'year_of_creation' so the animation frames are in ascending order
users_sorted = users.sort_values(by='year_of_creation')

# Animated world map: one marker per user at its latitude/longitude,
# coloured by country, with one animation frame per creation year
fig = px.scatter_geo(
    users_sorted,
    lat="latitude",
    lon="longitude",
    color="country",  # any categorical column can be used here
    animation_frame="year_of_creation",
    projection="natural earth",
    title="User Distribution Over Years",
)
fig.show()


In [12]:
# Count users per state, per city and per country
state_counts, city_counts, country_counts = (
    users[column].value_counts()
    for column in ('state', 'city', 'country')
)

# Print the first five entries of each tally under its own heading
print("Number of Users by State:", state_counts.head(), sep="\n")
print("\nNumber of Users by City:", city_counts.head(), sep="\n")
print("\nNumber of Users by Country:", country_counts.head(), sep="\n")

Number of Users by State:
state
Guangdong     5380
England       4034
California    3704
Shanghai      2499
Texas         2468
Name: count, dtype: int64

Number of Users by City:
city
Shanghai    2525
Beijing     2175
Seoul       1483
Shenzhen    1320
Unknown      958
Name: count, dtype: int64

Number of Users by Country:
country
China            34150
United States    22522
Brasil           14507
South Korea       5316
France            4700
Name: count, dtype: int64


In [13]:
# Turn each count Series into a two-column frame: the location label and
# its 'Number of Users'
state_counts_df = state_counts.rename_axis('State').reset_index(name='Number of Users')
city_counts_df = city_counts.rename_axis('City').reset_index(name='Number of Users')
country_counts_df = country_counts.rename_axis('Country').reset_index(name='Number of Users')

# State chart: every state is plotted
fig_state = px.bar(state_counts_df, x='State', y='Number of Users', title='Number of Users by State')
fig_state.show()

# City chart: only the first 10 rows of the city frame are plotted
fig_city = px.bar(city_counts_df.head(10), x='City', y='Number of Users', title='Number of Users by City')
fig_city.show()

# Country chart: only the first 10 rows of the country frame are plotted
fig_country = px.bar(country_counts_df.head(10), x='Country', y='Number of Users', title='Number of Users by Country')
fig_country.show()

### Distribution Centers Dataset EDA:

In [16]:
# Plot the distribution centers as fixed-size markers on a USA-scoped map,
# attaching each center's name as the marker text

fig = go.Figure(
    data=go.Scattergeo(
        lat=distribution_centers['latitude'],
        lon=distribution_centers['longitude'],
        text=distribution_centers['name'],
        mode='markers',
        marker={'size': 20},
    ),
    layout={
        'title': {'text': 'Locations of All Distribution Centers'},
        'geo': {'scope': 'usa'},
        'height': 650,
    },
)

fig.show()