In [1]:
# Question: Advanced Data Profiling and Outlier Detection
# Description: Perform detailed data profiling including outlier detection for numeric columns.




In [2]:
import pandas as pd
import numpy as np
from scipy import stats

# 1. Manually create a CSV file
# Small synthetic employee table. Row 5 (Age 120, Salary 1000000,
# Years_at_Company 20) is deliberately extreme so the outlier checks
# further down have something to find.
data = {
    'Age': [25, 30, 22, 35, 28, 120, 27, 29, 31, 26],
    'Salary': [50000, 60000, 52000, 80000, 58000, 1000000, 55000, 62000, 61000, 59000],
    'Years_at_Company': [1, 3, 2, 5, 4, 20, 2, 3, 3, 1]
}

df = pd.DataFrame(data)
csv_filename = 'employees.csv'
# index=False keeps the row index out of the file so the round trip
# below yields exactly the three data columns.
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created.")

# 2. Read the CSV file
df = pd.read_csv(csv_filename)

# 3. Basic data profiling
print("\n=== DataFrame Head ===")
print(df.head())

print("\n=== DataFrame Info ===")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()

print("\n=== Descriptive Statistics ===")
print(df.describe())

# 4. Detailed numeric profiling
# Only columns with a numeric dtype are profiled; `numeric_cols` is reused
# by the outlier sections below.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    col_data = df[col]
    print(f"\n--- Profiling '{col}' ---")
    # (label, bound method, format spec). An empty spec means "print the
    # value as-is", which is how Min and Max are reported.
    summary_spec = (
        ("Mean", col_data.mean, ".2f"),
        ("Median", col_data.median, ".2f"),
        ("Std Dev", col_data.std, ".2f"),
        ("Min", col_data.min, ""),
        ("Max", col_data.max, ""),
        ("Skewness", col_data.skew, ".2f"),
        ("Kurtosis", col_data.kurtosis, ".2f"),
    )
    # Each statistic is computed right before it is printed, one line each.
    for label, compute, spec in summary_spec:
        print(f"{label}: {format(compute(), spec)}")

# 5. Outlier Detection using Z-score
# A cell is flagged when its absolute z-score strictly exceeds this value.
Z_THRESHOLD = 3
print(f"\n=== Outliers based on Z-score (|z| > {Z_THRESHOLD}) ===")

# Hand zscore a plain 2-D float array. Depending on the SciPy version,
# passing a DataFrame may return a DataFrame, and iterating a DataFrame
# yields column labels rather than rows, which would break the row loop below.
z_scores = np.abs(stats.zscore(df[numeric_cols].to_numpy(dtype=float)))
outliers_z = z_scores > Z_THRESHOLD

# Positions of the rows that have at least one flagged column.
flagged_rows = np.flatnonzero(outliers_z.any(axis=1))
for i in flagged_rows:
    outlier_cols = numeric_cols[outliers_z[i]]
    print(f"Row {i} outlier in: {list(outlier_cols)}")

if flagged_rows.size == 0:
    # NOTE: with the population standard deviation (ddof=0, the zscore
    # default) |z| can never exceed sqrt(n - 1) for n rows (Samuelson's
    # inequality). For a 10-row frame that bound is exactly 3, so a strict
    # "> 3" test cannot flag anything; say so instead of printing nothing.
    print(f"No rows exceed |z| > {Z_THRESHOLD} "
          f"(max attainable |z| for {len(df)} rows is {np.sqrt(max(len(df) - 1, 0)):.2f}).")

# 6. Outlier Detection using IQR method
def iqr_outlier_mask(values, k=1.5):
    """Return a boolean Series marking entries outside the IQR fences.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to inspect.
    k : float, default 1.5
        Fence multiplier; the fences are Q1 - k*IQR and Q3 + k*IQR.

    Returns
    -------
    pandas.Series
        True where the value is strictly below the lower fence or strictly
        above the upper fence, aligned to the index of `values`.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return (values < lower_bound) | (values > upper_bound)


print("\n=== Outliers based on IQR ===")

# 7. Tag outliers in the DataFrame
# The fences are computed once per column; the same vectorised mask feeds
# both the printed index list and the `<col>_outlier` flag column, so the
# report and the tags cannot drift apart.
df_out = df.copy()
for col in numeric_cols:
    outlier_mask = iqr_outlier_mask(df[col])
    outlier_indices = df.index[outlier_mask].tolist()
    print(f"Column '{col}' outlier indices: {outlier_indices}")
    df_out[f'{col}_outlier'] = outlier_mask

print("\n=== DataFrame with Outlier Flags ===")
print(df_out)


CSV file 'employees.csv' created.

=== DataFrame Head ===
   Age  Salary  Years_at_Company
0   25   50000                 1
1   30   60000                 3
2   22   52000                 2
3   35   80000                 5
4   28   58000                 4

=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Age               10 non-null     int64
 1   Salary            10 non-null     int64
 2   Years_at_Company  10 non-null     int64
dtypes: int64(3)
memory usage: 368.0 bytes
None

=== Descriptive Statistics ===
              Age          Salary  Years_at_Company
count   10.000000       10.000000         10.000000
mean    37.300000   153700.000000          4.400000
std     29.272854   297471.212426          5.621388
min     22.000000    50000.000000          1.000000
25%     26.250000    55750.000000          2.000000
50