#                                    Sample Dataset

Creating Sample Data

In [11]:
import pandas as pd
import numpy as np


# Small sample table with three numeric columns. Each column contains some
# missing entries (NaN); the values also include ages of 120 and 150 and a
# salary of 500000, which the outlier sections below pick up.
data = dict(
    age=[
        23, 45, 31, np.nan, 29, 41, 54, np.nan, 38, 27,
        30, 120, 25, np.nan, 48, 38, 29, 150, 33, 40,
    ],
    salary=[
        50000, 60000, 52000, 48000, 55000, 62000, np.nan, 58000, 54000, 51000,
        500000, 62000, np.nan, 60000, 65000, 59000, 53000, 45000, np.nan, 61000,
    ],
    exp=[
        1, 10, 4, 2, 5, 11, 20, np.nan, 6, 3,
        2, 15, 1, np.nan, 9, 5, 3, 22, 4, 7,
    ],
)

# One row per list position, one column per key.
df = pd.DataFrame(data)
print(df)


      age    salary   exp
0    23.0   50000.0   1.0
1    45.0   60000.0  10.0
2    31.0   52000.0   4.0
3     NaN   48000.0   2.0
4    29.0   55000.0   5.0
5    41.0   62000.0  11.0
6    54.0       NaN  20.0
7     NaN   58000.0   NaN
8    38.0   54000.0   6.0
9    27.0   51000.0   3.0
10   30.0  500000.0   2.0
11  120.0   62000.0  15.0
12   25.0       NaN   1.0
13    NaN   60000.0   NaN
14   48.0   65000.0   9.0
15   38.0   59000.0   5.0
16   29.0   53000.0   3.0
17  150.0   45000.0  22.0
18   33.0       NaN   4.0
19   40.0   61000.0   7.0


# Outlier Analysis
**Definition:**

Outlier analysis involves identifying and treating data points that are significantly different from the rest of the dataset. Outliers can distort statistical analyses and models, so it's important to detect and address them.

**1. IQR (Interquartile Range) Method:**

Definition: The IQR method identifies outliers by measuring the spread of the middle 50% of the data. A data point is treated as an outlier when it lies more than 1.5 × IQR below the first quartile (Q1) or more than 1.5 × IQR above the third quartile (Q3).

Formulae:

IQR = Q3 - Q1

Lower bound = Q1 - 1.5 * IQR

Upper bound = Q3 + 1.5 * IQR

In [19]:
def detect_outliers_iqr(col, multiplier=1.5):
    """Return the entries of ``col`` that lie outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to inspect. Missing values are never reported,
        because both comparisons against NaN evaluate to False.
    multiplier : float, default 1.5
        Number of IQRs the fences are placed beyond Q1 and Q3. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The values of ``col`` strictly below the lower fence or strictly
        above the upper fence, keeping their original index labels.
    """
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    return col[(col < lower_bound) | (col > upper_bound)]

# Apply the IQR detector to each column of the sample frame, in column order.
outliers_age, outliers_salary, outliers_exp = (
    detect_outliers_iqr(df[column]) for column in ('age', 'salary', 'exp')
)

# Report the flagged values, one section per column.
print("Outliers detected using IQR:")
print(f"Age outliers:\n{outliers_age}\n")
print(f"Salary outliers:\n{outliers_salary}\n")
print(f"Experience outliers:\n{outliers_exp}\n")


Outliers detected using IQR:
Age outliers:
11    120.0
17    150.0
Name: age, dtype: float64

Salary outliers:
10    500000.0
Name: salary, dtype: float64

Experience outliers:
6     20.0
17    22.0
Name: exp, dtype: float64



**Explanation**: This code calculates the IQR and uses it to define lower and upper bounds. Data points outside these bounds are considered outliers.

**2. Z-Score Method**

**Definition:**
The Z-score method identifies outliers by measuring how many standard deviations a data point is from the mean: z = (x - mean) / standard deviation. A data point whose absolute Z-score exceeds a chosen threshold (commonly 3, i.e. more than 3 standard deviations above or below the mean) is considered an outlier.

In [14]:
from scipy import stats
def detect_outliers_zscore(col, threshold=3):
    """Return the entries of ``col`` whose absolute Z-score exceeds ``threshold``.

    Z-scores are computed from the non-missing values only; missing values
    are never reported as outliers.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to inspect.
    threshold : float, default 3
        Cut-off on the absolute Z-score. Values with ``|z| > threshold`` are
        returned.

    Returns
    -------
    pandas.Series
        The flagged values of ``col``, keeping their original index labels.
        Empty when ``col`` has no non-missing values.
    """
    # Positional mask over the full column; entries for missing values stay False.
    is_outlier = np.zeros(len(col), dtype=bool)
    present = col.notna().to_numpy()
    if present.any():
        # Hand SciPy a plain float array so the result is an ndarray on every
        # SciPy version, instead of depending on whether zscore happens to
        # hand back a pandas Series.
        z_scores = np.abs(stats.zscore(col[present].to_numpy(dtype=float)))
        is_outlier[present] = z_scores > threshold
    return col[is_outlier]

# Apply the Z-score detector to each column of the sample frame, in column order.
outliers_age_z, outliers_salary_z, outliers_exp_z = (
    detect_outliers_zscore(df[column]) for column in ('age', 'salary', 'exp')
)

# Report the flagged values, one section per column.
print("Outliers detected using Z-Score:")
print(f"Age outliers:\n{outliers_age_z}\n")
print(f"Salary outliers:\n{outliers_salary_z}\n")
print(f"Experience outliers:\n{outliers_exp_z}\n")


Outliers detected using Z-Score:
Age outliers:
17    150.0
Name: age, dtype: float64

Salary outliers:
10    500000.0
Name: salary, dtype: float64

Experience outliers:
Series([], Name: exp, dtype: float64)



**Explanation:**
This code computes the Z-scores for each data point in the specified column and identifies those that have a Z-score greater than the threshold as outliers.

# CSV Dataset

In [21]:
from google.colab import files
import io

# Opens the browser file picker; the result maps each uploaded file's name
# to its raw bytes.
uploaded = files.upload()

# Colab renames an upload whose name is already taken (the recorded output
# shows "Saving Toyota.csv to Toyota (2).csv"), so reading the fixed path
# 'Toyota.csv' would load an older copy rather than the file just chosen.
# Parse the uploaded bytes directly instead.
if uploaded:
    uploaded_name = next(iter(uploaded))
    df2 = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was selected: fall back to a copy already present on disk.
    df2 = pd.read_csv('Toyota.csv')

# NOTE(review): the printed frame shows '??' placeholders in the KM column;
# consider passing na_values to read_csv if those columns are analysed later.
print(df2)

Saving Toyota.csv to Toyota (2).csv
      Unnamed: 0  Price   Age     KM FuelType   HP  MetColor  Automatic    CC  \
0              0  13500  23.0  46986   Diesel   90       1.0          0  2000   
1              1  13750  23.0  72937   Diesel   90       1.0          0  2000   
2              2  13950  24.0  41711   Diesel   90       NaN          0  2000   
3              3  14950  26.0  48000   Diesel   90       0.0          0  2000   
4              4  13750  30.0  38500   Diesel   90       0.0          0  2000   
...          ...    ...   ...    ...      ...  ...       ...        ...   ...   
1431        1431   7500   NaN  20544   Petrol   86       1.0          0  1300   
1432        1432  10845  72.0     ??   Petrol   86       0.0          0  1300   
1433        1433   8500   NaN  17016   Petrol   86       0.0          0  1300   
1434        1434   7250  70.0     ??      NaN   86       1.0          0  1300   
1435        1435   6950  76.0      1   Petrol  110       0.0          0  

**1. IQR**

In [24]:
import pandas as pd

# Load dataset
# Read the Toyota listing from disk.
df2 = pd.read_csv('Toyota.csv')

# First and third quartiles (25th and 75th percentiles) of the Price column.
Q1, Q3 = (df2['Price'].quantile(q) for q in (0.25, 0.75))

# Interquartile range: the spread of the middle half of the prices.
IQR = Q3 - Q1

# Fences placed 1.5 IQRs outside the quartiles.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the rows whose Price falls strictly outside the fences.
outliers = df2[df2['Price'].lt(lower_bound) | df2['Price'].gt(upper_bound)]

print(f"Outliers:\n{outliers}")


Outliers:
     Unnamed: 0  Price   Age     KM FuelType   HP  MetColor  Automatic    CC  \
7             7  18600  30.0  75889      NaN   90       1.0          0  2000   
8             8  21500  27.0  19700   Petrol  192       0.0          0  1800   
10           10  20950  25.0  31461   Petrol  192       0.0          0  1800   
11           11  19950  22.0  43610   Petrol  192       0.0          0  1800   
12           12  19600  25.0  32189   Petrol  192       0.0          0  1800   
..          ...    ...   ...    ...      ...  ...       ...        ...   ...   
182         182  21125   2.0    225   Petrol   97       1.0          0  1400   
183         183  21500   NaN     15   Petrol  110       1.0          0  1600   
184         184  17795   1.0      1   Petrol   98       1.0          0  1400   
185         185  18245   1.0      1   Petrol  110       1.0          0  1600   
523         523  18950  49.0  49568   Petrol  110       1.0          0  1600   

    Doors  Weight  
7       3

**2. Z-Score**

In [25]:
from scipy import stats
import numpy as np

# Calculate Z-scores
# Calculate absolute Z-scores of the Toyota prices.
# The scores must come from df2: `data` is the sample dict defined at the top
# of the notebook and has no 'Price' key, so data['Price'] raises KeyError on
# a fresh run. Passing a plain float array keeps the result a NumPy array, so
# the boolean mask below lines up with df2's rows by position.
# nan_policy='omit' stops a missing price from turning every score into NaN.
z_scores = np.abs(
    stats.zscore(df2['Price'].to_numpy(dtype=float), nan_policy='omit')
)

# Define threshold for identifying outliers
threshold = 3

# Identify outliers: rows more than `threshold` standard deviations from the mean price
outliers = df2[z_scores > threshold]

print(f"Outliers:\n{outliers}")


Outliers:
     Unnamed: 0  Price   Age     KM FuelType    HP  MetColor  Automatic    CC  \
14           14  22500  32.0  34131   Petrol   192       1.0          0  1800   
15           15  22000  28.0  18739   Petrol  ????       0.0          0  1800   
16           16  22750  30.0  34000   Petrol   192       1.0          0  1800   
49           49  21950  31.0  64982   Petrol   192       NaN          0  1800   
53           53  21950  27.0  49866   Petrol   192       1.0          0  1800   
68           68  22250  22.0  30000   Diesel   110       1.0          0  2000   
89           89  21950  19.0  50005   Diesel   110       1.0          0  2000   
91           91  22250  20.0     ??   Diesel    90       1.0          0  2000   
109         109  32500   NaN      1      NaN   116       0.0          0  2000   
110         110  31000   4.0   4000   Diesel   116       1.0          0  2000   
111         111  31275   4.0   1500   Diesel   116       1.0          0  2000   
112         112  2