# Statistical Analysis and EDA of GA Dataset

This notebook contains:
1. Data Loading and Initial Exploration
2. Exploratory Data Analysis (EDA)
3. Statistical Analysis
4. Visualization with Plotly

In [10]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [11]:
# Load the dataset
df = pd.read_csv('data\Ga.csv')

# Display first few rows
print("First few rows of the dataset:")
display(df.head())

# Display basic information about the dataset
print("\nDataset Information:")
df.info()

First few rows of the dataset:


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ride_id             1000 non-null   object 
 1   rideable_type       1000 non-null   object 
 2   started_at          1000 non-null   object 
 3   ended_at            1000 non-null   object 
 4   start_station_name  997 non-null    object 
 5   start_station_id    997 non-null    object 
 6   end_station_name    998 non-null    object 
 7   end_station_id      998 non-null    object 
 8   start_lat           1000 non-null   float64
 9   start_lng           1000 non-null   float64
 10  end_lat             1000 non-null   float64
 11  end_lng             1000 non-null   float64
 12  member_casual       1000 non-null   object 
dtypes: float64(4), object(9)
memory usage: 101.7+ KB


## 1. Data Overview and Basic Statistics
Let's examine the basic statistics and properties of our dataset.

In [12]:
# Get basic statistics
print("Basic Statistics:")
display(df.describe())

# Check for missing values
print("\nMissing Values:")
display(df.isnull().sum())

# Get data types
print("\nData Types:")
display(df.dtypes)

Basic Statistics:


Unnamed: 0,start_lat,start_lng,end_lat,end_lng
count,1000.0,1000.0,1000.0,1000.0
mean,41.896105,-87.645848,41.897251,-87.645785
std,0.044966,0.025849,0.045431,0.026276
min,41.69,-87.76,41.69,-87.76
25%,41.87785,-87.666611,41.880394,-87.66402
50%,41.894345,-87.641823,41.894666,-87.643724
75%,41.917095,-87.627834,41.920196,-87.629912
max,42.057044,-87.565688,42.057044,-87.565688



Missing Values:


ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    3
start_station_id      3
end_station_name      2
end_station_id        2
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64


Data Types:


ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

## 2. Exploratory Data Analysis (EDA)
Let's analyze the distribution of variables and their relationships.

In [13]:
# Function to plot distribution
def plot_distribution(df, column):
    fig = make_subplots(rows=1, cols=2,
                       subplot_titles=('Distribution Plot', 'Box Plot'))
    
    # Histogram
    fig.add_trace(
        go.Histogram(x=df[column], name='Distribution'),
        row=1, col=1
    )
    
    # Box plot
    fig.add_trace(
        go.Box(y=df[column], name='Box Plot'),
        row=1, col=2
    )
    
    fig.update_layout(
        title=f'Distribution Analysis of {column}',
        showlegend=False,
        height=400
    )
    fig.show()

# Plot distributions for numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols[:3]:  # First 3 columns as example
    plot_distribution(df, col)

## 3. Correlation Analysis
Let's analyze the relationships between variables.

In [14]:
# Calculate correlation matrix
correlation_matrix = df.select_dtypes(include=['int64', 'float64']).corr()

# Create heatmap using plotly
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    zmin=-1,
    zmax=1
))

fig.update_layout(
    title='Correlation Heatmap',
    height=600,
    width=800
)
fig.show()

## 4. Time Series Analysis (if applicable)
If there are any date/time columns, let's analyze trends over time.

In [15]:
# Check if there are any datetime columns
date_cols = df.select_dtypes(include=['datetime64']).columns
if len(date_cols) > 0:
    # Plot time series
    for col in numerical_cols[:2]:  # First 2 numerical columns
        fig = px.line(df, x=date_cols[0], y=col, title=f'{col} Over Time')
        fig.show()

## 5. Statistical Tests
Performing basic statistical tests on the data.

In [16]:
# Function for normality test
def test_normality(data, column):
    stat, p_value = stats.normaltest(data[column].dropna())
    return {
        'Column': column,
        'Statistic': stat,
        'P-value': p_value,
        'Is Normal': p_value > 0.05
    }

# Perform normality tests
normality_results = []
for col in numerical_cols:
    result = test_normality(df, col)
    normality_results.append(result)

# Display results
display(pd.DataFrame(normality_results))

Unnamed: 0,Column,Statistic,P-value,Is Normal
0,start_lat,32.156646,1.040574e-07,False
1,start_lng,19.296017,6.455401e-05,False
2,end_lat,34.227012,3.695718e-08,False
3,end_lng,13.538664,0.001148462,False


## 6. Advanced Visualizations
Creating advanced visualizations to better understand the data patterns.

In [17]:
# Create scatter matrix for numerical columns
fig = px.scatter_matrix(
    df,
    dimensions=numerical_cols[:4],  # First 4 numerical columns
    title='Scatter Matrix of Numerical Variables'
)
fig.update_layout(height=800, width=800)
fig.show()

# Create box plots for numerical columns
fig = go.Figure()
for col in numerical_cols[:5]:  # First 5 numerical columns
    fig.add_trace(go.Box(y=df[col], name=col))
fig.update_layout(title='Box Plots of Numerical Variables')
fig.show()

## 7. Summary Statistics by Category (if applicable)
If categorical variables are present, let's analyze numerical variables across categories.

In [18]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

if len(categorical_cols) > 0:
    # For each categorical column, create summary statistics
    for cat_col in categorical_cols:
        # Create summary statistics
        summary = df.groupby(cat_col)[numerical_cols[:3]].agg(['mean', 'std', 'count'])
        print(f"\nSummary Statistics by {cat_col}:")
        display(summary)
        
        # Create box plots
        fig = px.box(df, x=cat_col, y=numerical_cols[0],
                    title=f'Distribution of {numerical_cols[0]} by {cat_col}')
        fig.show()


Summary Statistics by ride_id:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
ride_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
00597F0DED9117DA,41.911378,,1,-87.638656,,1,41.903486,,1
0067EBB4F707DB87,41.894722,,1,-87.634362,,1,41.898969,,1
006893A5766A2FF3,41.883984,,1,-87.624684,,1,41.882830,,1
006A7326570740AE,41.886021,,1,-87.630876,,1,41.897448,,1
006D3F363CF9600B,41.967962,,1,-87.650029,,1,41.994780,,1
...,...,...,...,...,...,...,...,...,...
FEDE668B9470DF20,41.989743,,1,-87.660141,,1,41.990860,,1
FF0769D37B5F2484,41.910213,,1,-87.643490,,1,41.888243,,1
FF0D73A7528C5D55,41.886021,,1,-87.630876,,1,41.896747,,1
FF94C6476E228A26,41.931902,,1,-87.701195,,1,41.910440,,1



Summary Statistics by rideable_type:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
rideable_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
classic_bike,41.892746,0.046802,675,-87.64271,0.025135,675,41.893546,0.047194,675
docked_bike,41.88414,0.039052,12,-87.637831,0.035183,12,41.88339,0.03729,12
electric_bike,41.903807,0.039987,313,-87.652921,0.025631,313,41.905772,0.040475,313



Summary Statistics by started_at:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
started_at,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2022-01-01 00:28:29,41.773458,,1,-87.585340,,1,41.769293,,1
2022-01-01 00:41:21,41.913863,,1,-87.648701,,1,41.952833,,1
2022-01-01 00:42:45,41.895634,,1,-87.672069,,1,41.920196,,1
2022-01-01 00:53:54,41.863764,,1,-87.623619,,1,41.845687,,1
2022-01-01 01:39:11,41.834734,,1,-87.625813,,1,41.836208,,1
...,...,...,...,...,...,...,...,...,...
2022-01-31 19:08:48,41.867227,,1,-87.625961,,1,41.892278,,1
2022-01-31 19:23:13,41.907655,,1,-87.672552,,1,41.912616,,1
2022-01-31 19:40:24,41.983560,,1,-87.669121,,1,42.020887,,1
2022-01-31 19:59:02,41.867118,,1,-87.641088,,1,41.878119,,1



Summary Statistics by ended_at:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
ended_at,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2022-01-01 00:42:13,41.773458,,1,-87.585340,,1,41.769293,,1
2022-01-01 00:57:30,41.913863,,1,-87.648701,,1,41.952833,,1
2022-01-01 00:57:44,41.895634,,1,-87.672069,,1,41.920196,,1
2022-01-01 01:01:03,41.863764,,1,-87.623619,,1,41.845687,,1
2022-01-01 01:45:41,41.834734,,1,-87.625813,,1,41.836208,,1
...,...,...,...,...,...,...,...,...,...
2022-01-31 19:28:11,41.907655,,1,-87.672552,,1,41.912616,,1
2022-01-31 19:30:07,41.867227,,1,-87.625961,,1,41.892278,,1
2022-01-31 19:56:48,41.983560,,1,-87.669121,,1,42.020887,,1
2022-01-31 20:05:53,41.867118,,1,-87.641088,,1,41.878119,,1



Summary Statistics by start_station_name:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
start_station_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2112 W Peterson Ave,41.991175,,1,-87.683632,,1,41.983593,,1
900 W Harrison St,41.874779,,1,-87.649842,,1,41.894666,,1
Aberdeen St & Jackson Blvd,41.877726,,1,-87.654787,,1,41.880419,,1
Aberdeen St & Monroe St,41.880418,0.000003,5,-87.655551,0.000071,5,41.888428,0.009641,5
Aberdeen St & Randolph St,41.884114,0.000000,4,-87.654264,0.000000,4,41.892287,0.004542,4
...,...,...,...,...,...,...,...,...,...
Wood St & 35th St,41.830141,0.000052,2,-87.670319,0.000029,2,41.834151,0.005723,2
Wood St & Chicago Ave,41.895646,0.000026,5,-87.672084,0.000034,5,41.894396,0.017420,5
Wood St & Hubbard St,41.889860,0.000052,4,-87.671540,0.000050,4,41.882192,0.028607,4
Wood St & Milwaukee Ave,41.907629,0.000040,5,-87.672491,0.000117,5,41.912664,0.024680,5



Summary Statistics by start_station_id:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
start_station_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
13001,41.883917,0.000163,6,-87.624523,0.000393,6,41.882049,0.002944,6
13006,41.882650,0.000101,5,-87.632387,0.000157,5,41.907764,0.017716,5
13008,41.881033,0.000004,8,-87.624078,0.000017,8,41.883029,0.008646,8
13016,41.894325,0.000097,17,-87.622785,0.000129,17,41.891244,0.007445,17
13021,41.885586,0.000115,15,-87.641824,0.000060,15,41.887138,0.010256,15
...,...,...,...,...,...,...,...,...,...
TA1309000059,41.928887,,1,-87.658971,,1,41.939743,,1
TA1309000063,41.793242,0.000000,5,-87.587782,0.000000,5,41.795306,0.002583,5
TA1309000064,41.871271,0.000029,10,-87.673684,0.000012,10,41.876739,0.009620,10
TA1309000067,41.803038,,1,-87.606615,,1,41.799568,,1



Summary Statistics by end_station_name:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
end_station_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
63rd St Beach,41.766409,,1,-87.565688,,1,41.780911,,1
Aberdeen St & Monroe St,41.883253,0.006248,20,-87.640035,0.009178,20,41.880419,0.0,20
Ada St & Washington Blvd,41.878897,0.007348,15,-87.654993,0.022509,15,41.882830,0.0,15
Ashland Ave & Belle Plaine Ave,41.949596,0.007655,5,-87.690363,0.033424,5,41.956057,0.0,5
Ashland Ave & Blackhawk St,41.902810,0.019708,3,-87.661255,0.004641,3,41.907066,0.0,3
...,...,...,...,...,...,...,...,...,...
Wolcott Ave & Polk St,41.861207,,1,-87.656600,,1,41.871262,,1
Wood St & 35th St,41.830441,0.000475,2,-87.651401,0.026725,2,41.830105,0.0,2
Wood St & Augusta Blvd,41.903295,0.016580,4,-87.665510,0.023314,4,41.899181,0.0,4
Wood St & Taylor St (Temp),41.861267,,1,-87.656625,,1,41.869265,,1



Summary Statistics by end_station_id:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
end_station_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
13001,41.888999,0.017347,8,-87.635855,0.015471,8,41.883984,0.0,8
13008,41.883953,0.004132,2,-87.625057,0.001376,2,41.881032,0.0,2
13016,41.893847,0.016792,11,-87.636923,0.015729,11,41.894345,0.0,11
13017,41.891609,0.007593,19,-87.633446,0.009686,19,41.896747,0.0,19
13021,41.888821,0.007954,14,-87.639839,0.010839,14,41.885637,0.0,14
...,...,...,...,...,...,...,...,...,...
TA1309000055,41.838862,0.018931,11,-87.637377,0.017116,11,41.838198,0.0,11
TA1309000063,41.795392,0.005643,17,-87.593335,0.008068,17,41.793242,0.0,17
TA1309000064,41.861207,,1,-87.656600,,1,41.871262,,1
WL-008,41.867816,,1,-87.622981,,1,41.867118,,1



Summary Statistics by member_casual:


Unnamed: 0_level_0,start_lat,start_lat,start_lat,start_lng,start_lng,start_lng,end_lat,end_lat,end_lat
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
member_casual,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
casual,41.898472,0.044038,258,-87.650231,0.028956,258,41.899627,0.045041,258
member,41.895282,0.045284,742,-87.644324,0.024514,742,41.896424,0.045567,742


## 8. Outlier Analysis
Detecting and visualizing outliers in the dataset.

In [19]:
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
    return len(outliers), lower_bound, upper_bound

# Analyze outliers for numerical columns
outlier_results = []
for col in numerical_cols:
    n_outliers, lower, upper = detect_outliers(df, col)
    outlier_results.append({
        'Column': col,
        'Number of Outliers': n_outliers,
        'Lower Bound': lower,
        'Upper Bound': upper
    })

display(pd.DataFrame(outlier_results))

Unnamed: 0,Column,Number of Outliers,Lower Bound,Upper Bound
0,start_lat,114,41.818983,41.975962
1,start_lng,4,-87.724776,-87.569669
2,end_lat,106,41.82069,41.979899
3,end_lng,13,-87.715182,-87.57875
