In [1]:


import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'uber-fares-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1834623%2F2994100%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240327%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240327T060704Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3a1a814e5f570f24e77346860934b87df0ba349a01ac309db515a6dd303672cd1d7ca1fa48f6cc48398f349337637728c40647633d1a82a9e73677fcecde8ea34c8d69d69e7409ed5e90df08f5b80ec8351f1ca0382d9188c358b430e9897b7cb7c698be15e2d70c50e4c0ce3108abef554bf1915d670432a243f4f8b1b3d82cf571bc957242acfd28342a471ded3173bd6136e826f4dc595f6a74f37d55dfaf9ff781b308f9d141fbb4242244a4347d88883ab86ed393e2769d2df6381fcefc119bf572504abe40f78a8bdc3969a301074d6802caa63c744b7258488c0ba0ea8682dba3d9b0d90b5d68dbfc0b15a819a641a8125e05a62ce4752b17547455b9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


The system cannot find the path specified.


OSError: [WinError 1314] A required privilege is not held by the client: '/kaggle/input' -> '..\\input'

In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/uber-fares-dataset/uber.csv


**Okay so here I am trying to detect outliers present in the data.**

In [7]:
data=pd.read_csv("/kaggle/input/uber-fares-dataset/uber.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


**In the below code we are just removing the unnecessary columns from the data to avoid complexity**

In [8]:
extracol=["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]
data= data.drop(columns=(extracol))
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,5


**Using the describe method can help us to understand the data more**

In [9]:
data.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,passenger_count
count,200000.0,200000.0,200000.0
mean,27712500.0,11.359955,1.684535
std,16013820.0,9.901776,1.385997
min,1.0,-52.0,0.0
25%,13825350.0,6.0,1.0
50%,27745500.0,8.5,1.0
75%,41555300.0,12.5,2.0
max,55423570.0,499.0,208.0


In [10]:
data.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,passenger_count
count,200000.0,200000.0,200000.0
mean,27712500.0,11.359955,1.684535
std,16013820.0,9.901776,1.385997
min,1.0,-52.0,0.0
25%,13825350.0,6.0,1.0
50%,27745500.0,8.5,1.0
75%,41555300.0,12.5,2.0
max,55423570.0,499.0,208.0


In [11]:
#CHEKING THE DISTRIBUTION
import plotly.express as px

fig=px.histogram(data,  x="fare_amount")
fig.show()


So we can see that the data does not follow the normal distribution . It is left skewed.

So now instead of Z SCORE we have to use Interqartile Range to define the outlier.

So for this purpose we can use **BOX PLOT** to see the max , min range of the data of a specific variable. here **FARE_AMOUNT**


In [12]:
fig2=px.box(data,y="fare_amount")
fig2.show()

**MULTIVARIATE ANALYSIS USING *scatter plot***

In [13]:
fig3 =px.scatter(x=data["passenger_count"],y=data["fare_amount"])
fig3.show()

Statistical way to find the outlier IQR

In [25]:
#creating a function to find outliers using IQR

def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [15]:
outliers= find_outliers_IQR(data["fare_amount"])
print("OUTLIERS SUMMARY OF FARE_AMOUNT")
print("Count of outliers: ",len(outliers))
print("Max value is: " ,max(outliers))
print("Min value is: ", min(outliers))
outliers

OUTLIERS SUMMARY OF FARE_AMOUNT
Count of outliers:  17167
Max value is:  499.0
Min value is:  -52.0


6         24.50
30        25.70
34        39.50
39        29.00
48        56.80
          ...  
199976    49.70
199977    43.50
199982    57.33
199985    24.00
199997    30.90
Name: fare_amount, Length: 17167, dtype: float64

In [16]:
outliers= find_outliers_IQR(data["passenger_count"])
print("OUTLIERS SUMMARY OF PASSENGER_COUNT")
print("Count of outliers: ",len(outliers))
print("Max value is: " ,max(outliers))
print("Min value is: ", min(outliers))
outliers

OUTLIERS SUMMARY OF PASSENGER_COUNT
Count of outliers:  22557
Max value is:  208
Min value is:  4


4         5
6         5
12        5
24        5
29        5
         ..
199958    5
199959    5
199962    4
199969    5
199985    5
Name: passenger_count, Length: 22557, dtype: int64

Techniques we are using  to handle outliers

* Cap the outliers
* Replace outliers using imputation as if they were missing values

**Since the number of the outliers is more so the Dropping off the outliers will affect the overall data**


***Cap the outliers***

*Capping the outliers means setting the max value and min values
so that anything beyond those points will be that points itself.*

**UPPER_LIMIT = df[column].mean()+3*df[column].std()**

**LOWER_LIMIT = df[column].mean()-3*df[column].std()**

In [17]:
upper_limit=data["fare_amount"].mean() + (3*data["fare_amount"].std())
lower_limit=data["fare_amount"].mean()-(3*data["fare_amount"].std())

print("Upper Limit: ", upper_limit)
print("Lower Limit: ",lower_limit)


Upper Limit:  41.0652839252097
Lower Limit:  -18.345373425209697


In [18]:
data["fare_amount"] = np.where(data["fare_amount"] > upper_limit,upper_limit,
                               np.where(data["fare_amount"] < lower_limit,lower_limit,data["fare_amount"] ))

**np.where**

Syntax :numpy.where(condition[, x, y])
Parameters:
condition : When True, yield x, otherwise yield y.
x, y : Values from which to choose. x, y and condition need to be broadcastable to some shape.

In [19]:
data["fare_amount"].describe()

count    200000.000000
mean         11.008988
std           8.088084
min         -18.345373
25%           6.000000
50%           8.500000
75%          12.500000
max          41.065284
Name: fare_amount, dtype: float64

In [20]:
upper_limit=data["passenger_count"].mean() + (3*data["passenger_count"].std())
lower_limit=data["passenger_count"].mean()-(3*data["passenger_count"].std())

print("Upper Limit: ", upper_limit)
print("Lower Limit: ",lower_limit)

Upper Limit:  5.842524652267636
Lower Limit:  -2.473454652267636


In [21]:
data["passenger_count"] = np.where(data["passenger_count"] > upper_limit,upper_limit,
                               np.where(data["passenger_count"] < lower_limit,lower_limit,data["passenger_count"] ))

In [22]:
data["passenger_count"].describe()

count    200000.000000
mean          1.680161
std           1.296043
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           5.842525
Name: passenger_count, dtype: float64

In [23]:
import plotly.express as px

fig=px.histogram(data,  x="fare_amount")
fig.show()


In [24]:
fig3 =px.scatter(x=data["passenger_count"],y=data["fare_amount"])
fig3.show()

So we can see that we have removed and reduced unevenly large or small outliers. The data is almost normally distrubuted and now can be further used for analysis.


Please leave a comment if you see any error or want some of the changes.

Thank you:-)